In [None]:
import pickle
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The three splits are read in the order train, test, val.
# NOTE: every path sits under /content/drive, which the notebook mounts in the
# following cell; on a fresh runtime that mount has to exist before this runs.
raw_train_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/train.pkl')
raw_test_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/test.pkl')
raw_val_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/val.pkl')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The pickle-loading cell earlier in the
# notebook opens files below this mount point, so on a fresh runtime this cell
# must be executed before that one.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Wrap each raw split in its own DataFrame. The position inside the tuple
# decides which raw object feeds which frame: train, test, val.
train_data, test_data, val_data = (
    pd.DataFrame(raw_split)
    for raw_split in (raw_train_data, raw_test_data, raw_val_data)
)

In [None]:
# Stack the three splits row-wise in train, val, test order.
# Each split keeps its own index labels, so labels may repeat inside df_all
# (TODO confirm whether anything downstream looks rows up by label).
df_all = pd.concat([train_data, val_data, test_data], axis=0)

In [None]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from scipy.sparse import hstack

# Keyword table used by score_post: each key is an item label (25 in total) and
# each value is a list of lowercase phrases. score_post marks an item True when
# ANY of its phrases occurs as a plain substring of the (lowercased) post text,
# so there is no word-boundary check.
#
# Consequences of substring matching worth knowing when editing this table:
#   * "guilt" already matches every text that contains "guilty", so "guilty"
#     never changes the outcome for "Guilt or shame".
#   * "hopeless" ("Feeling discouraged") fires whenever "feeling hopeless"
#     ("Feeling hopeless") does, and "loss of interest" fires whenever
#     "loss of interest in work" does.
#   * "apathetic" is listed under two different items.
# NOTE(review): the item labels look like questionnaire items; the source of
# the wording is not visible in this notebook -- confirm before citing it.
keyword_mapping = {
    "Feeling sad or down in the dumps": ["feeling sad", "down in the dumps", "depressed", "unhappy"],
    "Feeling unhappy or blue": ["feeling unhappy", "feeling blue", "downhearted", "gloomy"],
    "Crying spells or tearfulness": ["crying spells", "tearful", "tears", "weepy"],
    "Feeling discouraged": ["feeling discouraged", "disheartened", "hopeless", "pessimistic"],
    "Feeling hopeless": ["feeling hopeless", "despair", "no hope", "futile"],
    "Low self-esteem": ["low self-esteem", "poor self-image", "self-loathing", "insecure"],
    "Feeling worthless or inadequate": ["feeling worthless", "inadequate", "insignificant", "valueless"],
    "Guilt or shame": ["guilt", "shame", "remorseful", "guilty"],
    "Criticizing yourself or blaming others": ["self-criticism", "self-blame", "blaming others", "fault-finding"],
    "Difficulty making decisions": ["indecisive", "difficulty deciding", "hesitant", "uncertain"],
    "Loss of interest in family, friends or colleagues": ["loss of interest", "disinterested", "apathetic", "detached"],
    "Loneliness": ["loneliness", "isolated", "alone", "solitary"],
    "Spending less time with family or friends": ["less social", "avoiding people", "withdrawn", "reclusive"],
    "Loss of motivation": ["loss of motivation", "unmotivated", "lack of drive", "apathetic"],
    "Loss of interest in work or other activities": ["loss of interest in work", "disinterest in activities", "bored", "unengaged"],
    "Avoiding work or other activities": ["avoiding work", "shirking responsibilities", "neglecting tasks", "procrastinating"],
    "Loss of pleasure or satisfaction in life": ["anhedonia", "loss of pleasure", "joyless", "dissatisfaction"],
    "Feeling tired": ["feeling tired", "fatigued", "exhausted", "worn out"],
    "Difficulty sleeping or sleeping too much": ["insomnia", "oversleeping", "sleep disturbances", "restless sleep"],
    "Decreased or increased appetite": ["poor appetite", "overeating", "loss of appetite", "binge eating"],
    "Loss of interest in sex": ["loss of libido", "disinterest in sex", "sexual apathy", "low sex drive"],
    "Worrying about your health": ["health anxiety", "hypochondria", "preoccupied with health", "health worries"],
    "Do you have any suicidal thoughts?": ["suicidal thoughts", "thinking about suicide", "suicidal ideation", "self-harm thoughts"],
    "Would you like to end your life?": ["wanting to end life", "wishing for death", "suicidal desire", "thoughts of dying"],
    "Do you have a plan for harming yourself?": ["suicide plan", "self-harm plan", "planning suicide", "intent to self-harm"]
}

def score_post(post, keyword_mapping):
    """Flag which items of ``keyword_mapping`` are mentioned in ``post``.

    Args:
        post: Text to search. Matching is a plain, case-sensitive substring
            test, so callers lowercase the text beforehand if needed.
        keyword_mapping: Mapping of item label -> iterable of phrases.

    Returns:
        A dict with the same keys (in the same order) as ``keyword_mapping``;
        each value is True when at least one phrase of that item occurs in
        ``post`` and False otherwise (including when the phrase list is empty).
    """
    return {
        item: any(phrase in post for phrase in phrases)
        for item, phrases in keyword_mapping.items()
    }

def _add_derived_columns(split_df):
    """Add ``joined_posts``, ``scores`` and ``depression_label`` to ``split_df`` in place.

    * ``joined_posts``: the entries of ``selected_posts`` joined with single
      spaces and lowercased.
    * ``scores``: the dict returned by ``score_post`` for the joined text.
    * ``depression_label``: 1 when 'depression' is found in ``diseases``, else 0.
    """
    split_df['joined_posts'] = split_df['selected_posts'].apply(
        lambda posts: ' '.join(posts).lower()
    )
    split_df['scores'] = split_df['joined_posts'].apply(
        lambda post: score_post(post, keyword_mapping)
    )
    split_df['depression_label'] = split_df['diseases'].apply(
        lambda diseases: int('depression' in diseases)
    )


# These are aliases, not copies: the derived columns are also added to
# train_data / test_data themselves.
train_df = train_data
test_df = test_data

_add_derived_columns(train_df)
_add_derived_columns(test_df)

# One column per keyword item, one row per user/post bundle.
train_scores_df = pd.DataFrame(list(train_df['scores']))
test_scores_df = pd.DataFrame(list(test_df['scores']))

# Show how many rows carry each label in both splits.
print(train_df['depression_label'].value_counts())
print(test_df['depression_label'].value_counts())

# Bag-of-words counts: the vocabulary is learned on the training text only and
# then re-used unchanged for the test text.
vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(train_df['joined_posts'])
X_test_text = vectorizer.transform(test_df['joined_posts'])

# Final design matrices: word counts followed by the keyword flags.
X_train = hstack([X_train_text, train_scores_df])
X_test = hstack([X_test_text, test_scores_df])
y_train, y_test = train_df['depression_label'], test_df['depression_label']

# Fit one liblinear logistic regression on the complete training matrix.
# fit() returns the estimator itself, so the fitted model is bound directly.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Bootstrap configuration: number of replicates, and the size of each
# replicate (as many rows as the test set has).
n_iterations = 100
n_size = X_test.shape[0]

# Per-replicate metric accumulators.
f1_scores, sensitivities, specificities = [], [], []

def sensitivity_specificity(y_true, y_pred):
    """Return ``(sensitivity, specificity)`` for binary labels coded 0 / 1.

    Sensitivity is TP / (TP + FN) and specificity is TN / (TN + FP), with 1 as
    the positive class.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix whose shape is inferred from the data: when a sample
    contains a single class, an inferred matrix is 1x1 and ``cm[1, 1]`` raises
    IndexError. This can happen on an unlucky bootstrap draw.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Tuple ``(sensitivity, specificity)`` of floats. A metric whose
        denominator is zero (no positives / no negatives in ``y_true``) is
        returned as NaN instead of raising or emitting a 0/0 warning.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)
    if truth.shape != guess.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {truth.shape} and {guess.shape}"
        )

    is_pos = truth == 1
    is_neg = truth == 0
    tp = int(np.count_nonzero(is_pos & (guess == 1)))
    fn = int(np.count_nonzero(is_pos & (guess == 0)))
    tn = int(np.count_nonzero(is_neg & (guess == 0)))
    fp = int(np.count_nonzero(is_neg & (guess == 1)))

    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    return sensitivity, specificity

# Bootstrap sampling on the fixed test set.
# A single seeded generator drives every draw: without it each run of this cell
# resamples differently and the reported mean±std cannot be reproduced. Passing
# the same RandomState instance (rather than a constant integer) keeps the
# replicates different from one another while making the whole sequence
# repeatable.
bootstrap_rng = np.random.RandomState(42)

for i in range(n_iterations):
    # Draw n_size rows (with replacement) from the test features and labels,
    # keeping rows and labels aligned.
    X_sample, y_sample = resample(
        X_test, y_test, n_samples=n_size, replace=True, random_state=bootstrap_rng
    )

    # Predict on the bootstrap sample with the already-fitted model
    y_pred = model.predict(X_sample)

    # Compute metrics
    f1 = f1_score(y_sample, y_pred)
    sensitivity, specificity = sensitivity_specificity(y_sample, y_pred)

    # Append to results
    f1_scores.append(f1)
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Mean and (population, ddof=0) standard deviation of each metric over the
# bootstrap replicates.
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_sensitivity = np.mean(sensitivities)
std_sensitivity = np.std(sensitivities)
mean_specificity = np.mean(specificities)
std_specificity = np.std(specificities)

# Print results in the specified format
print(f"F1 Score: {mean_f1:.3f}±{std_f1:.3f}")
print(f"Sensitivity: {mean_sensitivity:.3f}±{std_sensitivity:.3f}")
print(f"Specificity: {mean_specificity:.3f}±{std_specificity:.3f}")


F1 Score: 0.493±0.024
Sensitivity: 0.434±0.026
Specificity: 0.954±0.004


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack, csr_matrix

# Keyword table used by score_post: item label -> list of lowercase phrases.
# An item is flagged when any of its phrases occurs as a plain substring of the
# lowercased post text (no word-boundary check), so e.g. "guilt" also covers
# "guilty" and "loss of interest" also covers "loss of interest in work".
# NOTE(review): this appears to be a verbatim repeat of the table defined in the
# earlier bootstrap cell; keep the two in sync (or define it once) when editing.
keyword_mapping = {
    "Feeling sad or down in the dumps": ["feeling sad", "down in the dumps", "depressed", "unhappy"],
    "Feeling unhappy or blue": ["feeling unhappy", "feeling blue", "downhearted", "gloomy"],
    "Crying spells or tearfulness": ["crying spells", "tearful", "tears", "weepy"],
    "Feeling discouraged": ["feeling discouraged", "disheartened", "hopeless", "pessimistic"],
    "Feeling hopeless": ["feeling hopeless", "despair", "no hope", "futile"],
    "Low self-esteem": ["low self-esteem", "poor self-image", "self-loathing", "insecure"],
    "Feeling worthless or inadequate": ["feeling worthless", "inadequate", "insignificant", "valueless"],
    "Guilt or shame": ["guilt", "shame", "remorseful", "guilty"],
    "Criticizing yourself or blaming others": ["self-criticism", "self-blame", "blaming others", "fault-finding"],
    "Difficulty making decisions": ["indecisive", "difficulty deciding", "hesitant", "uncertain"],
    "Loss of interest in family, friends or colleagues": ["loss of interest", "disinterested", "apathetic", "detached"],
    "Loneliness": ["loneliness", "isolated", "alone", "solitary"],
    "Spending less time with family or friends": ["less social", "avoiding people", "withdrawn", "reclusive"],
    "Loss of motivation": ["loss of motivation", "unmotivated", "lack of drive", "apathetic"],
    "Loss of interest in work or other activities": ["loss of interest in work", "disinterest in activities", "bored", "unengaged"],
    "Avoiding work or other activities": ["avoiding work", "shirking responsibilities", "neglecting tasks", "procrastinating"],
    "Loss of pleasure or satisfaction in life": ["anhedonia", "loss of pleasure", "joyless", "dissatisfaction"],
    "Feeling tired": ["feeling tired", "fatigued", "exhausted", "worn out"],
    "Difficulty sleeping or sleeping too much": ["insomnia", "oversleeping", "sleep disturbances", "restless sleep"],
    "Decreased or increased appetite": ["poor appetite", "overeating", "loss of appetite", "binge eating"],
    "Loss of interest in sex": ["loss of libido", "disinterest in sex", "sexual apathy", "low sex drive"],
    "Worrying about your health": ["health anxiety", "hypochondria", "preoccupied with health", "health worries"],
    "Do you have any suicidal thoughts?": ["suicidal thoughts", "thinking about suicide", "suicidal ideation", "self-harm thoughts"],
    "Would you like to end your life?": ["wanting to end life", "wishing for death", "suicidal desire", "thoughts of dying"],
    "Do you have a plan for harming yourself?": ["suicide plan", "self-harm plan", "planning suicide", "intent to self-harm"]
}

def score_post(post, keyword_mapping):
    """Report, per item of ``keyword_mapping``, whether ``post`` contains one of its phrases.

    Args:
        post: Text to search; phrases are matched as case-sensitive substrings.
        keyword_mapping: Mapping of item label -> iterable of phrases.

    Returns:
        Dict keyed like ``keyword_mapping`` (same order) whose values are True
        if any phrase of the item is a substring of ``post``, otherwise False.
    """
    def mentions(phrases):
        # True as soon as one phrase is found; False for an empty phrase list.
        return any(phrase in post for phrase in phrases)

    return {item: mentions(phrases) for item, phrases in keyword_mapping.items()}

# Despite its name, train_df here is the combined train + val + test frame.
# It is an alias of df_all, so the derived columns below are added to df_all.
train_df = df_all

# Text per row: all selected posts joined with single spaces, lowercased.
train_df['joined_posts'] = train_df['selected_posts'].apply(
    lambda posts: ' '.join(posts).lower()
)

# Keyword flags per row (one dict per row), then expanded to one column per item.
train_df['scores'] = train_df['joined_posts'].apply(
    lambda post: score_post(post, keyword_mapping)
)
train_scores_df = pd.DataFrame(list(train_df['scores']))

# Binary target: 1 when 'depression' is found in the row's diseases, else 0.
train_df['depression_label'] = train_df['diseases'].apply(
    lambda diseases: int('depression' in diseases)
)

# Show the class balance.
print(train_df['depression_label'].value_counts())

# Bag-of-words counts fitted on the whole combined text column.
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(train_df['joined_posts'])

# Word counts followed by keyword flags, requested directly in CSR format so
# the matrix can be row-indexed by the fold indices later.
X = hstack([X_text, csr_matrix(train_scores_df)], format='csr')
y = train_df['depression_label']

# Stratified 5-fold splitter with a fixed shuffle seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric accumulators.
f1_scores, sensitivities, specificities = [], [], []

def sensitivity_specificity(y_true, y_pred):
    """Return ``(sensitivity, specificity)`` for binary labels coded 0 / 1.

    Sensitivity is TP / (TP + FN) and specificity is TN / (TN + FP), with 1 as
    the positive class.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix whose shape is inferred from the data: when the inputs
    contain a single class, an inferred matrix is 1x1 and ``cm[1, 1]`` raises
    IndexError.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Tuple ``(sensitivity, specificity)`` of floats. A metric whose
        denominator is zero (no positives / no negatives in ``y_true``) is
        returned as NaN instead of raising or emitting a 0/0 warning.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)
    if truth.shape != guess.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {truth.shape} and {guess.shape}"
        )

    is_pos = truth == 1
    is_neg = truth == 0
    tp = int(np.count_nonzero(is_pos & (guess == 1)))
    fn = int(np.count_nonzero(is_pos & (guess == 0)))
    tn = int(np.count_nonzero(is_neg & (guess == 0)))
    fp = int(np.count_nonzero(is_neg & (guess == 1)))

    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    return sensitivity, specificity

# Cross-validation: for every split produced by kf, fit on the training rows
# and score the held-out rows.
for train_index, test_index in kf.split(X, y):
    X_train, y_train = X[train_index], y.iloc[train_index]
    X_test, y_test = X[test_index], y.iloc[test_index]

    # A new model is created for each fold; fit() returns the estimator itself.
    model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

    # Held-out predictions for this fold
    y_pred = model.predict(X_test)

    # Fold metrics
    f1 = f1_score(y_test, y_pred)
    sensitivity, specificity = sensitivity_specificity(y_test, y_pred)

    # Record them
    f1_scores.append(f1)
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Mean and (population, ddof=0) standard deviation of each metric over the folds.
mean_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
mean_sensitivity, std_sensitivity = np.mean(sensitivities), np.std(sensitivities)
mean_specificity, std_specificity = np.mean(specificities), np.std(specificities)

# Report as mean±std with three decimals
print(f"F1 Score: {mean_f1:.3f}±{std_f1:.3f}")
print(f"Sensitivity: {mean_sensitivity:.3f}±{std_sensitivity:.3f}")
print(f"Specificity: {mean_specificity:.3f}±{std_specificity:.3f}")




depression_label
0    23500
1     3105
Name: count, dtype: int64




F1 Score: 0.499±0.021
Sensitivity: 0.420±0.024
Specificity: 0.965±0.006


