In [None]:
import pickle
# Directory that holds the pickled train / test / val splits.
# NOTE(review): the path looks like a Colab Google Drive mount; adjust it when
# running anywhere else.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/MDDdataset/symptom_sum_top16'


def load_split(split_name):
    """Unpickle and return the object stored in ``<DATA_DIR>/<split_name>.pkl``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file,
    so only point this at files you created yourself or otherwise trust.
    """
    with open(f'{DATA_DIR}/{split_name}.pkl', 'rb') as f:
        return pickle.load(f)


raw_train_data = load_split('train')
raw_test_data = load_split('test')
raw_val_data = load_split('val')

In [None]:
import pandas as pd
# Wrap each raw split in a DataFrame. The names on the left line up
# positionally with the raw objects listed on the right.
train_data, test_data, val_data = (
    pd.DataFrame(raw_split)
    for raw_split in (raw_train_data, raw_test_data, raw_val_data)
)

In [None]:
# Stack the three splits row-wise (train, then val, then test) in one call.
# Row labels are kept as-is, so index values may repeat across splits.
# NOTE(review): a later cell rebinds `df_all` to the training split, so this
# combined frame is discarded when the notebook is run top to bottom.
df_all = pd.concat([train_data, val_data, test_data], axis=0)

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
from imblearn.pipeline import Pipeline as ImbPipeline

# Train on the training split and evaluate on the test split.
# Copies are taken so that the label / feature columns added to `df_all` and
# `df_test` (here and in later cells) are not written into `train_data` /
# `test_data` through a shared reference.
# NOTE: this rebinds `df_all` to the training split only.
df_all = train_data.copy()
df_test = test_data.copy()

# Binary target: 1 when 'bipolar' is found in the row's `diseases` entry, else 0.
# NOTE(review): `in` is a substring test if `diseases` holds a str and an
# element test if it holds a list — confirm which one the data uses.
df_all['target'] = df_all['diseases'].apply(lambda x: 1 if 'bipolar' in x else 0)
df_test['target'] = df_test['diseases'].apply(lambda x: 1 if 'bipolar' in x else 0)

# Keywords whose presence in a user's posts is turned into a hand-crafted
# feature for the bipolar target.
bipolar_keywords = [
    'hyper', 'irritable', 'self-confident', 'talkative', 'sleep', 'thoughts',
    'distracted', 'energy', 'active', 'social', 'sex', 'excessive', 'foolish', 'risky', 'spending'
]


def count_bipolar_keywords(posts):
    """Return how many distinct entries of ``bipolar_keywords`` appear in ``posts``.

    Args:
        posts: A single string, or a list of strings which is joined with
            single spaces before matching.

    Returns:
        An int between 0 and ``len(bipolar_keywords)``. Each keyword adds at
        most 1 no matter how often it occurs. Matching is a substring test on
        the lower-cased text, so 'active' also matches inside 'inactive'.
    """
    text = ' '.join(posts) if isinstance(posts, list) else posts
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    return sum(keyword in lowered for keyword in bipolar_keywords)

def _posts_to_text(posts):
    """Join a list of posts into one space-separated string; return other values unchanged."""
    return ' '.join(posts) if isinstance(posts, list) else posts


# Model inputs: one text document per row plus the binary label, built once per split.
X_train = df_all['selected_posts'].apply(_posts_to_text)
y_train = df_all['target']
X_test = df_test['selected_posts'].apply(_posts_to_text)
y_test = df_test['target']

# Keyword-count feature, derived from the already-joined text so the posts are
# not re-joined for every use. It is stored on the frames but is not an input
# to the text-only pipeline fitted in this cell.
df_all['bipolar_keyword_count'] = X_train.apply(count_bipolar_keywords)
df_test['bipolar_keyword_count'] = X_test.apply(count_bipolar_keywords)

# Text-only model, assembled step by step:
#   TF-IDF features -> SMOTE oversampling -> class-weighted logistic regression.
text_vectorizer = TfidfVectorizer(stop_words='english')
minority_oversampler = SMOTE(random_state=42)
balanced_logreg = LogisticRegression(solver='liblinear', class_weight='balanced')

pipeline = ImbPipeline(steps=[
    ('tfidf', text_vectorizer),
    ('smote', minority_oversampler),
    ('classifier', balanced_logreg),
])

# Fit every step on the full training split.
pipeline.fit(X_train, y_train)

# Bootstrap evaluation: score the fitted pipeline on 100 resamples of the test set.
n_bootstraps = 100
f1_scores = []
sensitivity_scores = []
specificity_scores = []

# The fitted pipeline does not change between draws, so a test row gets the
# same prediction whichever bootstrap sample it lands in. Predict once and
# resample the (label, prediction) pairs instead of re-vectorising the text
# on every iteration.
y_test_pred = pipeline.predict(X_test)

for seed in range(n_bootstraps):
    # The iteration number doubles as the seed, so every draw is reproducible.
    # resample() applies one row selection to both arrays, keeping them aligned.
    y_boot, y_pred = resample(y_test, y_test_pred, random_state=seed)

    # Compute metrics for this bootstrap sample
    f1_scores.append(f1_score(y_boot, y_pred))
    sensitivity_scores.append(recall_score(y_boot, y_pred))               # recall of class 1
    specificity_scores.append(recall_score(y_boot, y_pred, pos_label=0))  # recall of class 0

# Mean and standard deviation of each metric across the bootstrap draws
average_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
average_sensitivity = np.mean(sensitivity_scores)
std_sensitivity = np.std(sensitivity_scores)
average_specificity = np.mean(specificity_scores)
std_specificity = np.std(specificity_scores)

# Report as "mean ± std"
print(f"F1 Score: {average_f1:.3f} ± {std_f1:.3f}")
print(f"Sensitivity: {average_sensitivity:.3f} ± {std_sensitivity:.3f}")
print(f"Specificity: {average_specificity:.3f} ± {std_specificity:.3f}")


F1 Score: 0.544 ± 0.033
Sensitivity: 0.594 ± 0.040
Specificity: 0.964 ± 0.004


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Bipolar keywords related to the assessment scale; their presence in a user's
# posts is turned into a hand-crafted feature.
# NOTE: this list and the function below repeat the definitions from an earlier cell.
bipolar_keywords = [
    'hyper', 'irritable', 'self-confident', 'talkative', 'sleep', 'thoughts',
    'distracted', 'energy', 'active', 'social', 'sex', 'excessive', 'foolish', 'risky', 'spending'
]


def count_bipolar_keywords(posts):
    """Return how many distinct entries of ``bipolar_keywords`` appear in ``posts``.

    Args:
        posts: A single string, or a list of strings which is joined with
            single spaces before matching.

    Returns:
        An int between 0 and ``len(bipolar_keywords)``. Each keyword adds at
        most 1 no matter how often it occurs. Matching is a substring test on
        the lower-cased text, so 'active' also matches inside 'inactive'.
    """
    text = ' '.join(posts) if isinstance(posts, list) else posts
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    return sum(keyword in lowered for keyword in bipolar_keywords)

def _join_posts(entry):
    """Collapse a list of posts into one space-separated string; leave other values as-is."""
    if isinstance(entry, list):
        return ' '.join(entry)
    return entry


def _bipolar_label(diseases):
    """Return 1 if 'bipolar' is found in ``diseases``, otherwise 0."""
    return 1 if 'bipolar' in diseases else 0


# Normalise the text column in place, then derive the keyword-count feature
# and the binary target on the same frame.
df_all['selected_posts'] = df_all['selected_posts'].apply(_join_posts)
df_all['bipolar_keyword_count'] = df_all['selected_posts'].apply(count_bipolar_keywords)
df_all['target'] = df_all['diseases'].apply(_bipolar_label)

# Feature frame (text + keyword count) and label vector for cross-validation.
X = df_all[['selected_posts', 'bipolar_keyword_count']]
y = df_all['target']

# Feature block: TF-IDF over the text column, with the keyword count passed
# through untouched; any other column is dropped.
tfidf_plus_keywords = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(stop_words='english'), 'selected_posts'),
        ('keywords', 'passthrough', ['bipolar_keyword_count']),
    ],
    remainder='drop',
)
balanced_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Features -> SMOTE oversampling -> random forest.
pipeline = ImbPipeline(steps=[
    ('preprocessing', tfidf_plus_keywords),
    ('smote', SMOTE(random_state=42)),
    ('classifier', balanced_forest),
])

# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _fit_and_score(fit_rows, held_out_rows):
    """Refit ``pipeline`` on ``fit_rows`` and score it on ``held_out_rows``.

    Both arguments are positional row indices as produced by ``skf.split``.
    Returns a ``(f1, sensitivity, specificity)`` tuple.
    """
    pipeline.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    truth = y.iloc[held_out_rows]
    predicted = pipeline.predict(X.iloc[held_out_rows])
    return (
        f1_score(truth, predicted),
        recall_score(truth, predicted),               # sensitivity: recall of class 1
        recall_score(truth, predicted, pos_label=0),  # specificity: recall of class 0
    )


# NOTE(review): an earlier cell rebinds `df_all` to the training split, so when
# the notebook is run top to bottom these folds come from the training data
# only, not from train+val+test — confirm that is intended.
fold_results = [
    _fit_and_score(fit_rows, held_out_rows)
    for fit_rows, held_out_rows in skf.split(X, y)
]
f1_scores, sensitivity_scores, specificity_scores = (
    list(column) for column in zip(*fold_results)
)

# Mean and standard deviation of each metric across the folds
average_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
average_sensitivity, std_sensitivity = np.mean(sensitivity_scores), np.std(sensitivity_scores)
average_specificity, std_specificity = np.mean(specificity_scores), np.std(specificity_scores)

# Report as "mean ± std"
print(f"F1 Score: {average_f1:.3f} ± {std_f1:.3f}")
print(f"Sensitivity: {average_sensitivity:.3f} ± {std_sensitivity:.3f}")
print(f"Specificity: {average_specificity:.3f} ± {std_specificity:.3f}")



F1 Score: 0.308 ± 0.025
Sensitivity: 0.249 ± 0.025
Specificity: 0.980 ± 0.002


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Load your dataset (Assuming df_all is already loaded and processed as described before)
# df_all = pd.read_csv('path_to_your_train_data.csv')  # Uncomment and update with your dataset path if needed

# Bipolar keywords related to the assessment scale; their presence in a user's
# posts is turned into a hand-crafted feature.
# NOTE: this list and the function below repeat the definitions from earlier cells.
bipolar_keywords = [
    'hyper', 'irritable', 'self-confident', 'talkative', 'sleep', 'thoughts',
    'distracted', 'energy', 'active', 'social', 'sex', 'excessive', 'foolish', 'risky', 'spending'
]


def count_bipolar_keywords(posts):
    """Return how many distinct entries of ``bipolar_keywords`` appear in ``posts``.

    Args:
        posts: A single string, or a list of strings which is joined with
            single spaces before matching.

    Returns:
        An int between 0 and ``len(bipolar_keywords)``. Each keyword adds at
        most 1 no matter how often it occurs. Matching is a substring test on
        the lower-cased text, so 'active' also matches inside 'inactive'.
    """
    text = ' '.join(posts) if isinstance(posts, list) else posts
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    return sum(keyword in lowered for keyword in bipolar_keywords)

# One text document per row: list entries are joined with spaces, anything
# else is kept as it is. The result is written back onto the frame.
posts_as_text = df_all['selected_posts'].apply(
    lambda posts: ' '.join(posts) if isinstance(posts, list) else posts
)
df_all['selected_posts'] = posts_as_text

# Hand-crafted feature: keyword count computed from the joined text.
df_all['bipolar_keyword_count'] = posts_as_text.apply(count_bipolar_keywords)

# Binary label: 1 when 'bipolar' is found in the `diseases` entry, else 0.
df_all['target'] = df_all['diseases'].apply(lambda diseases: int('bipolar' in diseases))

# Feature frame and label vector used by the cross-validation below.
feature_columns = ['selected_posts', 'bipolar_keyword_count']
X = df_all[feature_columns]
y = df_all['target']

# Feature block: raw token counts over the text column, with the keyword count
# passed through untouched; any other column is dropped.
counts_plus_keywords = ColumnTransformer(
    transformers=[
        ('count_vectorizer', CountVectorizer(stop_words='english'), 'selected_posts'),
        ('keywords', 'passthrough', ['bipolar_keyword_count']),
    ],
    remainder='drop',
)
balanced_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Features -> SMOTE oversampling -> random forest.
pipeline = ImbPipeline(steps=[
    ('preprocessing', counts_plus_keywords),
    ('smote', SMOTE(random_state=42)),
    ('classifier', balanced_forest),
])

# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, sensitivity_scores, specificity_scores = [], [], []

# NOTE(review): an earlier cell rebinds `df_all` to the training split, so when
# the notebook is run top to bottom these folds come from the training data
# only, not from train+val+test — confirm that is intended.
for fit_rows, held_out_rows in skf.split(X, y):
    # Refit the whole pipeline on this fold's training rows.
    pipeline.fit(X.iloc[fit_rows], y.iloc[fit_rows])

    # Score it on the rows held out for this fold.
    y_true = y.iloc[held_out_rows]
    y_pred = pipeline.predict(X.iloc[held_out_rows])

    f1_scores += [f1_score(y_true, y_pred)]
    sensitivity_scores += [recall_score(y_true, y_pred)]               # recall of class 1
    specificity_scores += [recall_score(y_true, y_pred, pos_label=0)]  # recall of class 0

# Mean and standard deviation of each metric across the folds
average_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
average_sensitivity, std_sensitivity = np.mean(sensitivity_scores), np.std(sensitivity_scores)
average_specificity, std_specificity = np.mean(specificity_scores), np.std(specificity_scores)

# Report as "mean ± std"
print(f"F1 Score: {average_f1:.3f} ± {std_f1:.3f}")
print(f"Sensitivity: {average_sensitivity:.3f} ± {std_sensitivity:.3f}")
print(f"Specificity: {average_specificity:.3f} ± {std_specificity:.3f}")


F1 Score: 0.304 ± 0.020
Sensitivity: 0.219 ± 0.020
Specificity: 0.988 ± 0.003


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Load your dataset (Assuming df_all is already loaded and processed as described before)
# df_all = pd.read_csv('path_to_your_train_data.csv')  # Uncomment and update with your dataset path if needed

# Bipolar keywords related to the assessment scale; their presence in a user's
# posts is turned into a hand-crafted feature.
# NOTE: this list and the function below repeat the definitions from earlier cells.
bipolar_keywords = [
    'hyper', 'irritable', 'self-confident', 'talkative', 'sleep', 'thoughts',
    'distracted', 'energy', 'active', 'social', 'sex', 'excessive', 'foolish', 'risky', 'spending'
]


def count_bipolar_keywords(posts):
    """Return how many distinct entries of ``bipolar_keywords`` appear in ``posts``.

    Args:
        posts: A single string, or a list of strings which is joined with
            single spaces before matching.

    Returns:
        An int between 0 and ``len(bipolar_keywords)``. Each keyword adds at
        most 1 no matter how often it occurs. Matching is a substring test on
        the lower-cased text, so 'active' also matches inside 'inactive'.
    """
    text = ' '.join(posts) if isinstance(posts, list) else posts
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    return sum(keyword in lowered for keyword in bipolar_keywords)

def _as_document(posts):
    """Return ``posts`` as one string: lists are space-joined, other values pass through."""
    return ' '.join(posts) if isinstance(posts, list) else posts


# Rewrite the text column in place so every row holds a single string.
text_column = df_all['selected_posts'].apply(_as_document)
df_all['selected_posts'] = text_column

# Keyword-count feature derived from that text.
keyword_counts = text_column.apply(count_bipolar_keywords)
df_all['bipolar_keyword_count'] = keyword_counts

# Binary label: 1 when 'bipolar' is found in the `diseases` entry, else 0.
df_all['target'] = df_all['diseases'].apply(lambda entry: 1 if 'bipolar' in entry else 0)

# Inputs for cross-validation: two feature columns and the label vector.
X = df_all[['selected_posts', 'bipolar_keyword_count']]
y = df_all['target']

# Feature block: raw token counts over the text column, with the keyword count
# passed through untouched; any other column is dropped.
counts_plus_keywords = ColumnTransformer(
    transformers=[
        ('count_vectorizer', CountVectorizer(stop_words='english'), 'selected_posts'),
        ('keywords', 'passthrough', ['bipolar_keyword_count']),
    ],
    remainder='drop',
)
balanced_logreg = LogisticRegression(solver='liblinear', class_weight='balanced')

# Features -> SMOTE oversampling -> class-weighted logistic regression.
pipeline = ImbPipeline(steps=[
    ('preprocessing', counts_plus_keywords),
    ('smote', SMOTE(random_state=42)),
    ('classifier', balanced_logreg),
])

# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
sensitivity_scores = []
specificity_scores = []

# Each entry pairs a result list with the scorer (and extra keyword arguments)
# that fills it.
metric_specs = (
    (f1_scores, f1_score, {}),
    (sensitivity_scores, recall_score, {}),                # recall of class 1
    (specificity_scores, recall_score, {'pos_label': 0}),  # recall of class 0
)

# NOTE(review): an earlier cell rebinds `df_all` to the training split, so when
# the notebook is run top to bottom these folds come from the training data
# only, not from train+val+test — confirm that is intended.
for fit_rows, held_out_rows in skf.split(X, y):
    # Refit the whole pipeline on this fold's training rows.
    pipeline.fit(X.iloc[fit_rows], y.iloc[fit_rows])

    # Predict the held-out rows and record every metric for the fold.
    y_true = y.iloc[held_out_rows]
    y_pred = pipeline.predict(X.iloc[held_out_rows])
    for bucket, scorer, extra_kwargs in metric_specs:
        bucket.append(scorer(y_true, y_pred, **extra_kwargs))

# Mean and standard deviation of each metric across the folds
average_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
average_sensitivity, std_sensitivity = np.mean(sensitivity_scores), np.std(sensitivity_scores)
average_specificity, std_specificity = np.mean(specificity_scores), np.std(specificity_scores)

# Report as "mean ± std"
print(f"F1 Score: {average_f1:.3f} ± {std_f1:.3f}")
print(f"Sensitivity: {average_sensitivity:.3f} ± {std_sensitivity:.3f}")
print(f"Specificity: {average_specificity:.3f} ± {std_specificity:.3f}")




F1 Score: 0.480 ± 0.031
Sensitivity: 0.435 ± 0.033
Specificity: 0.980 ± 0.001
