In [1]:
import pandas as pd

# Load CSV
file_path = "/mnt/nasdrive/shared/raw_data/ImageTrend Emails/__2025_06_25_115054.csv"
df = pd.read_csv(file_path, dtype=str, engine='python', on_bad_lines='skip')  # Load all as string initially for safety

# Normalize column names: strip, lowercase, replace spaces and dots
df.columns = df.columns.str.strip().str.lower().str.replace('.', '_').str.replace(' ', '_')

# Trim whitespace from string columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

# Preview loaded data
print("Columns loaded:", df.columns.tolist())
print("Data sample:")
print(df.head())

Columns loaded: ['message_sent_date_time', 'response_ems_response_number_(eresponse_04)', 'message_recipient_list', 'message_sender_full_name', 'message_in_reply_to_subject', 'message_subject', 'message_text', 'message_type_description']
Data sample:
  message_sent_date_time response_ems_response_number_(eresponse_04)  \
0    1/8/2025 1:57:16 AM                                      25-944   
1    1/8/2025 1:57:41 AM                                      25-944   
2    1/8/2025 3:01:01 AM                                      25-973   
3   1/15/2025 7:18:53 PM                                     25-1067   
4   1/16/2025 8:16:24 PM                                     25-2151   

                           message_recipient_list message_sender_full_name  \
0  Matthew Hennig, Jonathan Taft, Garth McChesney            Jonathan Taft   
1                   Jonathan Taft, Matthew Hennig            Jonathan Taft   
2         Tallon Joyce, Brian Rafe, Jonathan Taft            Jonathan Taft   
3   

In [2]:
import pandas as pd

file_path = "/mnt/nasdrive/shared/raw_data/ImageTrend Emails/__2025_06_25_115054.csv"
df = pd.read_csv(file_path, dtype=str, engine='python', on_bad_lines='skip')  # Load all as string initially for safety

# Normalize columns
df.columns = df.columns.str.strip().str.lower().str.replace('.', '_').str.replace(' ', '_')

# Convert message sent date/time to datetime
df['message_sent_date_time'] = pd.to_datetime(df['message_sent_date_time'], errors='coerce')

# Extract year-month period for grouping
df['month'] = df['message_sent_date_time'].dt.to_period('M')

In [3]:
# Define keywords for categories - customize this list based on your typical emails
issue_categories = {
    'qa_review': ['qa review'],
    'cqi_alert': ['cqi alert'],
    'documentation': ['documentation', 'doc error', 'missing'],
    'refusal': ['refusal'],
    'times': ['time', 'times', 'delay'],
    'procedures': ['procedure', 'airway', 'medication', 'meds'],
    'narrative': ['narrative', 'report', 'notes'],
    'controlled_substances': ['controlled substance', 'drug', 'narcotic']
}

# Initialize columns for categories
for category in issue_categories:
    df[category] = 0

# Lowercase subject and message text for searching
df['message_subject_lower'] = df['message_subject'].fillna('').str.lower()
df['message_text_lower'] = df['message_text'].fillna('').str.lower()

# Flag messages containing any keyword from each category in either subject or text
for category, keywords in issue_categories.items():
    df[category] = df.apply(
        lambda row: any(k in row['message_subject_lower'] or k in row['message_text_lower'] for k in keywords),
        axis=1
    )

# Aggregate counts by month and issue category
summary = df.groupby('month')[list(issue_categories.keys())].sum().reset_index()

print(summary)

     month  qa_review  cqi_alert  documentation  refusal  times  procedures  \
0  2025-01          1         23             13        0      7           3   
1  2025-02          0         26            155        7     18          23   
2  2025-03          0         22            187       11     29          28   
3  2025-04          0          3             80        3      5          13   
4  2025-05          0         51             66       15     11          22   
5  2025-06          0         22             10        3      9           5   

   narrative  controlled_substances  
0         11                      0  
1         81                      0  
2        129                      1  
3        105                      0  
4        137                      0  
5         56                      0  


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Assuming df is your original DataFrame loaded and cleaned

# Normalize subjects for threading (if not done)
df['subject_normalized'] = df['message_subject'].fillna('').str.strip().str.lower()
df['in_reply_to_normalized'] = df['message_in_reply_to_subject'].fillna('').str.strip().str.lower()
df['thread_subject'] = df.apply(
    lambda row: row['in_reply_to_normalized'] if row['in_reply_to_normalized'] not in ['', '(blank)'] else row['subject_normalized'],
    axis=1
)

# Combine all messages text per thread + EMS response number
df_threaded = df.groupby(['thread_subject', 'response_ems_response_number_(eresponse_04)']).agg({
    'message_text': lambda texts: ' '.join(texts.fillna('')),
    'documentation': 'max',
    'narrative': 'max'
}).reset_index()

# Filter for threads flagged as documentation or narrative issues
flagged_threads = df_threaded[(df_threaded['documentation'] == 1) | (df_threaded['narrative'] == 1)]

print(f"Number of flagged threads: {len(flagged_threads)}")

Number of flagged threads: 597


In [5]:
# Vectorize combined message text
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = vectorizer.fit_transform(flagged_threads['message_text'])

# Apply NMF for topic modeling
num_topics = 5  # You can adjust this
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

# Function to display top words per topic
def display_topics(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_words)
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics

topics = display_topics(nmf_model, vectorizer.get_feature_names_out(), 10)

Topic 1: br, jonathan, sent, narrative, 2025, subject, hr, taft, foster, diane
Topic 2: br, edits, complete, lock, status, making, change, good, report, lindsey
Topic 3: missing, crew, signatures, signature, member, 2nd, personnel, unit, update, paramedic
Topic 4: mileage, incomplete, update, mile, lee, wendy, missing, march, 13, updated
Topic 5: certs, flight, missing, handle, aaron, mathrole, emails, 17, signature, issues
