# Machine learning: project
# Predicting depression from social media posts
## Liisa Jullinen

This is the work that our intermediate presentation was based on. The data comes from the Reddit Mental Health dataset (https://zenodo.org/records/3941387). We use r/depression and r/fitness (instead of the original choice, r/personalfinance, which was not as neutral).

Other changes concern the selected features. Firstly, we omit tf-idf features, because these predict the subforum's subject matter more than the general way people use language. Secondly, we add some new features that concentrate on depression-specific language use.

Training: two models are trained and evaluated, logistic regression and a linear SVM.

### Preprocessing and setup

In [52]:
#!conda install -c conda-forge transformers datasets pytorch scikit-learn -y

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Download the r/fitness and r/depression feature CSVs from the Zenodo record
!wget -O fitness_pre_features_tfidf_256.csv "https://zenodo.org/records/3941387/files/fitness_pre_features_tfidf_256.csv?download=1"
!wget -O depression_pre_features_tfidf_256.csv "https://zenodo.org/records/3941387/files/depression_pre_features_tfidf_256.csv?download=1"



--2025-12-10 11:26:14--  https://zenodo.org/records/3941387/files/fitness_pre_features_tfidf_256.csv?download=1
Resolving zenodo.org (zenodo.org)... 137.138.52.235, 188.185.48.75, 188.185.43.153, ...
Connecting to zenodo.org (zenodo.org)|137.138.52.235|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54592214 (52M) [text/plain]
Saving to: ‘fitness_pre_features_tfidf_256.csv’


In [None]:
# Load the two per-subreddit feature tables into data frames.
depression_csv = "depression_pre_features_tfidf_256.csv"
fitness_csv = "fitness_pre_features_tfidf_256.csv"
df1 = pd.read_csv(depression_csv)  # comma is read_csv's default separator
df2 = pd.read_csv(fitness_csv)

In [None]:
# Quick schema check: show every column name of the depression table.
#df1.info()
#df1.head(3)
df1.columns.tolist()

In [None]:
 'liwc_achievement', 'liwc_biological',
 'liwc_body', 'liwc_death','liwc_family',
'liwc_friends', 'liwc_health',  'liwc_home',
 'liwc_humans', 'liwc_ingestion',  'liwc_leisure',
 'liwc_money', 'liwc_motion', 'liwc_religion',
 'liwc_sexual', 'liwc_work',

In [None]:
# Stack both subreddit tables into a single frame with a fresh index.
df_ = pd.concat([df1, df2], ignore_index=True)

# Binary label column 'depressed': 1 for 'depression' rows, 0 for 'fitness' rows.
label_by_subreddit = {'depression': 1, 'fitness': 0}
df_['depressed'] = df_['subreddit'].map(label_by_subreddit)
df_ = df_.drop(columns=["subreddit"])

# The tf-idf columns are not of interest here: drop every column whose name
# starts with "tfidf".
tfidf_cols = df_.filter(regex=r"^tfidf").columns
df_ = df_.drop(columns=tfidf_cols)

In [None]:
# Feature: number of absolutist words in a post.
# `words_custom` is the word list used for that count; the percentage feature
# defined further down reads the same list.
words_custom = ["always", "never", "entire", "totally"]

def count_absolutist_words(text, words_custom):
    """Count how many word tokens of ``text`` occur in ``words_custom``.

    The text is lower-cased and split into ``\\w+`` tokens, so punctuation
    attached to a word does not prevent a match. Every occurrence counts.

    Args:
        text: The post text. Anything that is not a string (for example the
            NaN pandas produces for an empty CSV cell, or None) counts as
            containing no words.
        words_custom: Iterable of lower-case words to look for.

    Returns:
        The number of matching tokens as an int.
    """
    # Missing posts are not strings; calling .lower() on them would raise.
    if not isinstance(text, str):
        return 0
    # A set makes each membership test O(1) regardless of the list length.
    vocabulary = frozenset(words_custom)
    tokens = re.findall(r'\b\w+\b', text.lower())
    return sum(1 for token in tokens if token in vocabulary)

# Raw count of absolutist words for every post.
df_["absolutist"] = df_["post"].apply(count_absolutist_words, args=(words_custom,))

In [None]:
# Check whether depressed and non-depressed posts differ significantly in
# their absolutist word counts.

# Mean count per class. Printed explicitly: a bare expression in the middle
# of a cell is evaluated but its value is never shown.
print(df_.groupby("depressed")["absolutist"].mean())

group1 = df_.loc[df_["depressed"] == 0, "absolutist"]
group2 = df_.loc[df_["depressed"] == 1, "absolutist"]

# Two-sample t-test between the classes; NaN values are left out.
t_stat, p_val = ttest_ind(group1, group2, nan_policy="omit")

t_stat, p_val

In [None]:
# some posts are really long, and thus have very many of these words
#df_["absolutist"].unique().max()
#df_.loc[df_["absolutist"].idxmax(), "post"]
#list(df_)

In [None]:
# Thus I'm calculating a feature showing the % of these words out of all words in a post
def absolutist_percentage(text, n_words, words=None):
    """Return the share of absolutist words in ``text`` as a percentage of ``n_words``.

    Tokens are extracted the same way as in ``count_absolutist_words``
    (lower-cased ``\\w+`` matches), so a word followed by punctuation such as
    "always," is still counted.

    Args:
        text: The post text. A non-string value (NaN, None) yields 0.
        n_words: Total number of words in the post, used as the denominator.
        words: Optional iterable of lower-case words to look for. When omitted,
            the module-level ``words_custom`` list is used.

    Returns:
        0 when ``n_words`` is 0 or ``text`` is not a string, otherwise
        ``matches / n_words * 100``.
    """
    if n_words == 0 or not isinstance(text, str):
        return 0
    vocabulary = frozenset(words_custom if words is None else words)
    tokens = re.findall(r'\b\w+\b', text.lower())
    count = sum(1 for token in tokens if token in vocabulary)
    return (count / n_words) * 100

# Percentage feature, computed row by row from the post text and its word count.
def _absolutist_pct_for_row(row):
    return absolutist_percentage(row["post"], row["n_words"])


df_["absolutist_pct"] = df_.apply(_absolutist_pct_for_row, axis=1)

In [None]:
# Check again whether the two classes differ, now on the percentage feature.

# Mean percentage per class. Printed explicitly: a bare expression in the
# middle of a cell is evaluated but its value is never shown.
print(df_.groupby("depressed")["absolutist_pct"].mean())

group1 = df_.loc[df_["depressed"] == 0, "absolutist_pct"]
group2 = df_.loc[df_["depressed"] == 1, "absolutist_pct"]

# Two-sample t-test between the classes; NaN values are left out.
t_stat, p_val = ttest_ind(group1, group2, nan_policy="omit")

print(t_stat)
print("{:.12f}".format(p_val))

In [None]:
# Shuffle the rows (fixed seed, so the order is reproducible) and renumber the
# index. The "post" column and other columns irrelevant for modelling are
# removed in the next cell, not here.
df_ = df_.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Columns that are removed before modelling, assembled from three groups.
_total_cols = [
    f"{prefix}_total"
    for prefix in (
        "economic_stress",
        "isolation",
        "substance_use",
        "guns",
        "domestic_stress",
        "suicidality",
    )
]
_meta_cols = ["author", "date", "post"]
_liwc_cols = [
    f"liwc_{suffix}"
    for suffix in (
        "achievement", "biological", "body", "death",
        "family", "friends", "health", "home",
        "humans", "ingestion", "leisure", "money",
        "motion", "religion", "sexual", "work",
    )
]
cols_drop = _total_cols + _meta_cols + _liwc_cols

# Modelling frame: everything in df_ except the columns listed in cols_drop.
df = df_.drop(cols_drop, axis=1)


## Modelling

In [None]:
# Stratified train / validation / test split with a fixed seed.
# Step 1: hold out 20% of all rows as the test set.
train_val, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['depressed'],
    random_state=42,
)
# Step 2: take 25% of the remaining rows (20% of the total) for validation.
train, val = train_test_split(
    train_val,
    test_size=0.25,
    stratify=train_val['depressed'],
    random_state=42,
)

In [None]:
# Report the size of each split and its class balance (train, validation, test).
n_train, n_val, n_test = len(train), len(val), len(test)
print("Train:", n_train, " Validation:", n_val, " Test:", n_test)
print("Label distribution:")
for part in (train, val, test):
    print(part['depressed'].value_counts(normalize=True))

In [None]:
target = 'depressed'


def _features_and_label(frame):
    """Return (feature columns, target column) of one split."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _features_and_label(train)
X_val, y_val = _features_and_label(val)
X_test, y_test = _features_and_label(test)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [None]:
# Logistic regression fitted on the standardised training features.
lr_settings = {"max_iter": 1000, "random_state": 42}
clf = LogisticRegression(**lr_settings)
clf.fit(X_train_scaled, y_train)

In [None]:

# Logistic regression on the validation split.
y_val_pred_lr = clf.predict(X_val_scaled)

val_accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
val_report_lr = classification_report(y_val, y_val_pred_lr)
print("Validation Accuracy:", val_accuracy_lr)
print(val_report_lr)


In [None]:
# Logistic regression on the held-out test split.
# The predictions get a model-specific name, matching `y_val_pred_lr`, because
# the SVM test cell further down reassigns the generic `y_test_pred`.
y_test_pred_lr = clf.predict(X_test_scaled)
y_test_pred = y_test_pred_lr  # generic name kept for cells that still read it

print("Test Accuracy:", accuracy_score(y_test, y_test_pred_lr))
print(classification_report(y_test, y_test_pred_lr))

In [None]:


# Table of logistic-regression coefficients, ordered by absolute size.
feature_names = X_train.columns
coefs = clf.coef_[0]

coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefs})
coef_df['abs_coefficient'] = coef_df['coefficient'].abs()

coef_df_sorted = coef_df.sort_values('abs_coefficient', ascending=False)

print(coef_df_sorted.head(60))


In [None]:


# Horizontal bar chart of the top_n largest-magnitude coefficients.
top_n = 20
top_features = coef_df_sorted.head(top_n).copy()

# The coefficient sign drives the bar colour.
is_positive = top_features['coefficient'] > 0
top_features['sign'] = is_positive.map({True: 'positive', False: 'negative'})

sign_palette = {'positive':'red', 'negative':'blue'}

plt.figure(figsize=(10, 8))
sns.barplot(
    data=top_features,
    x='coefficient',
    y='feature',
    hue='sign',
    palette=sign_palette,
    dodge=False,
)
plt.legend([], [], frameon=False)  # empty legend: the hue key is not shown
plt.title(f'Top {top_n} Influential Features (Logistic Regression)')
plt.xlabel('Coefficient (positive → class 1, negative → class 0)')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Linear SVM. The pipeline standardises the features itself, so it is fitted on
# the unscaled X_train.
svm_steps = (
    StandardScaler(),
    SVC(kernel='linear', C=1, random_state=42),
)
svm_clf = make_pipeline(*svm_steps)

svm_clf.fit(X_train, y_train)

In [None]:
# SVM on the validation split (the pipeline scales X_val internally).
y_val_pred = svm_clf.predict(X_val)

val_accuracy_svm = accuracy_score(y_val, y_val_pred)
val_report_svm = classification_report(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy_svm)
print(val_report_svm)


In [None]:
# SVM on the held-out test split (the pipeline scales X_test internally).
y_test_pred = svm_clf.predict(X_test)

test_accuracy_svm = accuracy_score(y_test, y_test_pred)
test_report_svm = classification_report(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy_svm)
print(test_report_svm)

In [None]:

def get_metrics(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall" and "F1-score",
        in that order, each holding the corresponding score.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


# Collect validation and test metrics of both models in one comparison table.

# Logistic regression. The test predictions are recomputed from `clf` here:
# by this point `y_test_pred` has been reassigned by the SVM test cell, so
# using it would report SVM scores under the "Test (LR)" heading.
metrics_val_lr = get_metrics(y_val, y_val_pred_lr)
metrics_test_lr = get_metrics(y_test, clf.predict(X_test_scaled))

# SVM: `y_val_pred` and `y_test_pred` hold the SVM predictions.
metrics_val_svm = get_metrics(y_val, y_val_pred)
metrics_test_svm = get_metrics(y_test, y_test_pred)


# One column per model/split; the metric names become the row index.
results_df = pd.DataFrame({
    "Validation (LR)": metrics_val_lr,
    "Test (LR)": metrics_test_lr,
    "Validation (SVM)": metrics_val_svm,
    "Test (SVM)": metrics_test_svm
})


results_df = results_df.round(3)

print(results_df)

In [None]:

def get_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 of ``y_pred`` against ``y_true``.

    Redefines the helper of the same name from the previous cell with the
    same behaviour. The result is a dict keyed "Accuracy", "Precision",
    "Recall" and "F1-score", inserted in that order.
    """
    metrics = {}
    metrics["Accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Precision"] = precision_score(y_true, y_pred)
    metrics["Recall"] = recall_score(y_true, y_pred)
    metrics["F1-score"] = f1_score(y_true, y_pred)
    return metrics


# Build the model-comparison table in long form (one row per metric) and
# export it.
metric_names = ["Accuracy", "Precision", "Recall", "F1-score"]

# Logistic-regression test predictions are recomputed from `clf`: by this point
# `y_test_pred` holds the SVM predictions, so reusing it would duplicate the
# SVM scores in the logistic-regression column.
scores_val_lr = get_metrics(y_val, y_val_pred_lr)
scores_test_lr = get_metrics(y_test, clf.predict(X_test_scaled))
scores_val_svm = get_metrics(y_val, y_val_pred)
scores_test_svm = get_metrics(y_test, y_test_pred)

# Look every score up by metric name so each row is guaranteed to match its
# "Metric" label, independent of the dict's insertion order.
results = {
    "Metric": metric_names,
    "Validation (LogisticRegression)": [scores_val_lr[m] for m in metric_names],
    "Test (LogisticRegression)": [scores_test_lr[m] for m in metric_names],
    "Validation (SVM)": [scores_val_svm[m] for m in metric_names],
    "Test (SVM)": [scores_test_svm[m] for m in metric_names],
}

results_df = pd.DataFrame(results)
results_df.iloc[:, 1:] = results_df.iloc[:, 1:].round(3)
print(results_df)

# Export to CSV
results_df.to_csv("model_comparison_results.csv", index=False)

# Export to Excel (pandas needs an Excel writer engine such as openpyxl for this)
results_df.to_excel("model_comparison_results.xlsx", index=False)