In [2]:
import os
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [3]:
"""
    Working code to read in every .csv line by line
    This inclues ONLY E-DAIC_Transcripts NOT TranscriptsOld, just remove root == "data/TranscriptsOld" if we want to change that
"""

all_csvs = []
for root, dirs, files in os.walk("data"):
    for file in files:
        if not file.endswith(".csv") or file == "test.csv" or root == "data/TranscriptsOld":
            continue  # ignore .xlsx and test.csv and old transcripts
        full_path = os.path.join(root, file)
        df = pd.read_csv(full_path, engine='python', on_bad_lines='skip')
        df['source_file'] = file # Which conversation (ie) xxx_Transcript.csv
        all_csvs.append(df)

combined_df = pd.concat(all_csvs, ignore_index=True)

print("Combined shape:", combined_df.shape)
combined_df.head(10)

Combined shape: (16681, 2)


Unnamed: 0,Text,source_file
0,okay,423_Transcript.csv
1,and please,423_Transcript.csv
2,yes,423_Transcript.csv
3,feeling well,423_Transcript.csv
4,where are you from originally Los Angeles the...,423_Transcript.csv
5,people,423_Transcript.csv
6,diversity and,423_Transcript.csv
7,various entertainment and activities,423_Transcript.csv
8,fickle weather,423_Transcript.csv
9,traffic and litter,423_Transcript.csv


In [4]:
"""
    I think we only want this file The other one has things that we dont need for our work. 
"""
df2 = pd.read_excel("data/DAIC demographic data.xlsx", sheet_name="Interview_Data")
df2.head()

Unnamed: 0,Partic#,Condition,gender,race
0,Participant Number,Condition,What is your gender?,What is your race?
1,302,WoZ,1,1
2,303,WoZ,2,1
3,304,WoZ,2,1
4,305,WoZ,1,4


In [5]:
"""
    Here we merge the text (from the csv) with the patient information from the xlsx
    As we see from the assert this does not add or drop any rows but we do prune the cols to get nice df to work with from here
"""
combined_df['participant_id'] = combined_df['source_file'].str.extract(r"(\d+)_Transcript\.csv")
combined_df.head()
df2['participant_id'] = df2['Partic#'].astype(str).str.strip()
merged_df = pd.merge(combined_df, df2, on='participant_id', how='left')
merged_df = merged_df.drop(columns=['Partic#', 'source_file'])
assert(combined_df.shape[0] == merged_df.shape[0])
merged_df.head(10)



Unnamed: 0,Text,participant_id,Condition,gender,race
0,okay,423,WoZ,2,7
1,and please,423,WoZ,2,7
2,yes,423,WoZ,2,7
3,feeling well,423,WoZ,2,7
4,where are you from originally Los Angeles the...,423,WoZ,2,7
5,people,423,WoZ,2,7
6,diversity and,423,WoZ,2,7
7,various entertainment and activities,423,WoZ,2,7
8,fickle weather,423,WoZ,2,7
9,traffic and litter,423,WoZ,2,7


In [None]:
"""
    We amalgamate all the text for each 190 participants 
    "df" is now the working data frame 
"""
df = merged_df.groupby('participant_id').agg({
    'Text': ' '.join,        
    'race': 'first',
    'gender': 'first',
    'Condition': 'first'
}).reset_index()

print(df.shape)
df.head()

(190, 5)


Unnamed: 0,participant_id,Text,race,gender,Condition
0,386,might have pulled something that I'm going to...,3,2,WoZ
1,387,when she's done she'll let you know alrighty ...,1,1,WoZ
2,388,are you okay with yes doing all right from ...,4,1,WoZ
3,389,and please are you okay sure I'm okay smal...,1,1,WoZ
4,390,and now she's going to chat with you for a bit...,3,1,WoZ


In [None]:
# Just a quick glance at the data
all_text = ' '.join(df['Text'].tolist())
words = all_text.split()
unique_words = set(words)
word_count = Counter(words)
print("Total words:", len(words))
print("Unique Words:", len(unique_words))
print("Top 10 most common words:", word_count.most_common(10))

Total words: 210959
Unique Words: 9348
Top 10 most common words: [('I', 12774), ('and', 7208), ('to', 6299), ('the', 5428), ('a', 5166), ('that', 3927), ('you', 3782), ('of', 3525), ('my', 3474), ('it', 2779)]


In [None]:
"""
    This code trains a Random Forest Classifier to predict the race of a participant based on the words they used.
    We use the chi^2 metric to select the m most relevant words for our prediction:  selector = SelectKBest(score_func=chi2, k=m)
    We slit the data into 5 partitions: five_fold_spit = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    Then we use a random forest classifier with 100 trees to predict the race of the participant based on their diction
    The best m is m = 1000 with accuracy of 0.764039, if we presume random guessing between 3 classes would achive 33% accuracy, this is meaningfully better

    * Note * The actual number of unique words according the classification library is 7083, this is because of case sensitivity and other ways the library drops words
"""
# Only look at the races we are considering --> 1: White, 2: Black, 3: Hispanic
df_clean = df[df['race'].isin([1, 2, 3])]  
df_clean = df_clean.reset_index(drop=True)

X_text = df_clean['Text']
y = df_clean['race'].astype(int)  

# Vectorize the text
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_vect = vectorizer.fit_transform(X_text)

results = []

m_values = [50, 100, 300, 500, 1000, 7083]
for m in m_values:
    print(f"Testing m = {m} features")
    
    # Feature selection using chi2
    selector = SelectKBest(score_func=chi2, k=m)
    X_selected = selector.fit_transform(X_vect, y)
    
    # Cross-validation setup
    five_fold_spit = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accs = []
    bal_accs = []
    
    for train_idx, test_idx in five_fold_spit.split(X_selected, y):
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train Random Forest
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Store metrics
        accs.append(accuracy_score(y_test, y_pred))
        bal_accs.append(balanced_accuracy_score(y_test, y_pred))

    results.append({
        'm': m,
        'accuracy': np.mean(accs),
        'balanced_accuracy': np.mean(bal_accs)
    })

results_df = pd.DataFrame(results)
print("\nTree Results:")
print(results_df)


Testing m = 50 features
Testing m = 100 features
Testing m = 300 features
Testing m = 500 features
Testing m = 1000 features
Testing m = 7083 features

Tree Results:
      m  accuracy  balanced_accuracy
0    50  0.527833           0.333283
1   100  0.548768           0.352348
2   300  0.626355           0.435303
3   500  0.750246           0.515101
4  1000  0.764039           0.517929
5  7083  0.604680           0.391742


In [15]:
"""
    Pretty much the same here, only we use MLPClassifier this time, just a standard mpl network
    We get a really good accuracy at m = 1000 of 0.930788
"""

df_clean = df[df['race'].isin([1, 2, 3])].reset_index(drop=True)
X_text = df_clean['Text']
y = df_clean['race'].astype(int)

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_vect = vectorizer.fit_transform(X_text)

m_values = [50, 100, 300, 500, 1000, 7083]
results = []

for m in m_values:
    print(f"Testing m = {m} features")

    selector = SelectKBest(score_func=chi2, k=m)
    X_selected = selector.fit_transform(X_vect, y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accs = []
    bal_accs = []

    for train_idx, test_idx in skf.split(X_selected, y):
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = MLPClassifier(hidden_layer_sizes=(100), max_iter=500, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Store metrics
        accs.append(accuracy_score(y_test, y_pred))
        bal_accs.append(balanced_accuracy_score(y_test, y_pred))

    results.append({
        'm': m,
        'accuracy': np.mean(accs),
        'balanced_accuracy': np.mean(bal_accs)
    })

results_df = pd.DataFrame(results)
print("\nDeep Learning Results:")
print(results_df)


Testing m = 50 features




Testing m = 100 features




Testing m = 300 features




Testing m = 500 features




Testing m = 1000 features
Testing m = 7083 features

Deep Learning Results:
      m  accuracy  balanced_accuracy
0    50  0.555665           0.350227
1   100  0.548768           0.346061
2   300  0.633005           0.435202
3   500  0.854187           0.599217
4  1000  0.930788           0.650000
5  7083  0.652956           0.438030
