In [None]:
import pickle
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE(review): pickle.load can execute arbitrary code while deserialising;
    # only point this at files from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Raw train / test / validation splits, read from the pickles under the
# symptom_sum_top16 dataset folder.
raw_train_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/train.pkl')
raw_test_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/test.pkl')
raw_val_data = _read_pickle('/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16/val.pkl')

In [None]:
import pandas as pd
# Wrap each raw split in a DataFrame (train, test, validation - in that order).
train_data, test_data, val_data = (
    pd.DataFrame(raw_split)
    for raw_split in (raw_train_data, raw_test_data, raw_val_data)
)

In [None]:
# Stack the three splits row-wise into one frame: train first, then validation,
# then test. Row labels are carried over from the individual frames, so index
# values may repeat across splits.
df_all = pd.concat([train_data, val_data, test_data], axis=0)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below for tokenisation and stop-word filtering
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Evaluate on the combined train + validation + test frame
data = df_all

# Schizophrenia-related terms used by the keyword-matching rule
schizophrenia_keywords = [
    'hallucinations',
    'delusions',
    'disorganized speech',
    'negative symptoms',
    'catatonia',
    'thought disorder',
    'paranoia',
    'auditory hallucinations',
    'visual hallucinations',
    'olfactory hallucinations',
    'tactile hallucinations',
    'delusion of grandeur',
    'delusion of persecution',
    'thought insertion',
    'thought broadcasting',
    'withdrawal',
    'anhedonia',
    'alogia',
    'avolition',
]

# Function to process text and match keywords
def analyze_posts_for_schizophrenia(posts, keywords):
    """Score each user's posts by counting keyword / key-phrase occurrences.

    Every entry of ``posts`` is lower-cased, tokenised with ``word_tokenize``
    and stripped of English stop words. Each keyword is pre-processed the same
    way (lower-cased, split on whitespace, stop words removed) and then matched
    as a contiguous run of tokens, so multi-word keywords such as
    ``'disorganized speech'`` can match. A single-token comparison could never
    match them.

    Parameters
    ----------
    posts : iterable
        One entry per user. An entry is either a list of post strings (joined
        with spaces) or a single string (used as is).
    keywords : iterable of str
        Keywords or key phrases to look for.

    Returns
    -------
    (list of int, list of int)
        ``total_scores`` - the match count per entry, capped at 21 - and
        ``distress_scores`` - three times the match count, capped at 105.
        Both lists have one element per entry of ``posts``, in order.
    """
    # Read the stop-word list once; a set makes each membership test O(1).
    stop_words = set(stopwords.words('english'))

    # Represent every keyword as a tuple of tokens, filtered like the post
    # tokens so that e.g. 'delusion of grandeur' lines up with a post whose
    # 'of' has been removed. Keywords that reduce to nothing are dropped.
    keyword_phrases = set()
    for keyword in keywords:
        phrase = tuple(tok for tok in keyword.lower().split() if tok not in stop_words)
        if phrase:
            keyword_phrases.add(phrase)
    phrase_lengths = sorted({len(phrase) for phrase in keyword_phrases})

    total_scores = []
    distress_scores = []
    for post_list in posts:
        # A bare string must not go through ' '.join, which would space out
        # its characters.
        text = post_list if isinstance(post_list, str) else ' '.join(post_list)
        tokens = [word for word in word_tokenize(text.lower()) if word not in stop_words]

        # Slide a window of every relevant length over the token stream.
        keyword_count = 0
        for size in phrase_lengths:
            for start in range(len(tokens) - size + 1):
                if tuple(tokens[start:start + size]) in keyword_phrases:
                    keyword_count += 1

        total_scores.append(min(keyword_count, 21))  # Cap the total score
        distress_scores.append(min(keyword_count * 3, 105))  # Cap the distress score
    return total_scores, distress_scores

# Diagnosis criteria adjustment for improved sensitivity and specificity
def diagnose_schizophrenia(total_score, distress_score):
    """Map a (total, distress) score pair to a diagnosis label.

    The total score is compared against 5 and the distress score against 15:
    both met -> "Balanced", only the total -> "Maximizing Sensitivity",
    only the distress -> "Maximizing Specificity", neither -> no diagnosis.
    """
    total_met = total_score >= 5
    distress_met = distress_score >= 15

    if total_met:
        if distress_met:
            return 'Ultra High Risk/Psychotic Syndrome (Balanced)'
        return 'Ultra High Risk/Psychotic Syndrome (Maximizing Sensitivity)'
    if distress_met:
        return 'Ultra High Risk/Psychotic Syndrome (Maximizing Specificity)'
    return 'No Psychotic Spectrum Diagnosis'

# Initialize KFold (shuffled with a fixed seed so the fold assignment is reproducible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store the per-fold metrics
f1_scores = []
sensitivities = []
specificities = []

# Perform 5-fold cross-validation.
# The keyword rule has nothing to fit, so only the held-out fold is scored and
# the training indices are ignored. Fold-local names are used so that the
# notebook-level train_data / test_data frames are not overwritten by the loop.
for _, test_index in kf.split(data):
    fold_test = data.iloc[test_index].copy()

    # Analyze the posts for schizophrenia symptoms
    fold_scores, fold_distress = analyze_posts_for_schizophrenia(fold_test['selected_posts'], schizophrenia_keywords)
    fold_test['schizophrenia_total_score'] = fold_scores
    fold_test['schizophrenia_distress_score'] = fold_distress

    # Diagnose based on the criteria
    fold_test['diagnosis'] = [diagnose_schizophrenia(score, distress) for score, distress in zip(fold_scores, fold_distress)]

    # Convert 'diseases' column into a binary indicator for schizophrenia presence
    true_labels = fold_test['diseases'].apply(lambda x: 'schizophrenia' in x).astype(int)
    pred_labels = fold_test['diagnosis'].apply(lambda x: 'Ultra High Risk/Psychotic Syndrome' in x).astype(int)

    # Calculate F1 score (zero_division=0 keeps the 0.0 result but silences the
    # warning when a fold has no positive predictions or labels)
    f1 = f1_score(true_labels, pred_labels, zero_division=0)
    f1_scores.append(f1)

    # Calculate confusion matrix components; labels=[0, 1] forces a 2x2 matrix
    # so the four-way unpack also works when a fold contains a single class.
    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=[0, 1]).ravel()

    # Calculate sensitivity (recall) and specificity
    sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Append metrics
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Calculate the average and standard deviation of performance metrics
average_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

average_sensitivity = np.mean(sensitivities)
std_sensitivity = np.std(sensitivities)

average_specificity = np.mean(specificities)
std_specificity = np.std(specificities)

# Print results with the specified format
print(f'Average F1 Score: {average_f1:.2f}±{std_f1:.3f}')
print(f'Average Sensitivity: {average_sensitivity:.2f}±{std_sensitivity:.3f}')
print(f'Average Specificity: {average_specificity:.2f}±{std_specificity:.3f}')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Average F1 Score: 0.22±0.019
Average Sensitivity: 0.16±0.027
Average Specificity: 1.00±0.001


In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, f1_score


# Define the function to analyze posts and score them
def analyze_posts_for_schizophrenia(posts, keywords):
    """Score each entry of ``posts`` by how many distinct keywords it contains.

    Matching is a case-insensitive substring test: a keyword adds one point if
    it occurs anywhere in the entry's text, however many times.

    Parameters
    ----------
    posts : iterable
        One entry per user. A list/tuple of strings is joined with spaces; a
        plain string is used as is. Any other entry (e.g. ``None``) cannot be
        scored and receives a score of 0.
    keywords : iterable of str
        Keywords or key phrases to search for.

    Returns
    -------
    (list of int, list of int)
        ``total_scores`` (number of keywords present, capped at 21) and
        ``distress_scores`` (three times that number, capped at 105). Both
        lists always have exactly one element per entry of ``posts`` so they
        can be assigned straight back as DataFrame columns.
    """
    # Lower-case the keywords once instead of once per post.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    total_scores = []
    distress_scores = []
    for post in posts:
        if isinstance(post, (list, tuple)):
            text = ' '.join(post).lower()
        elif isinstance(post, str):
            text = post.lower()
        else:
            text = None

        if text is None:
            # Unscorable entry: record zeros rather than skipping, so the
            # output stays aligned row-for-row with the input.
            keyword_count = 0
        else:
            keyword_count = sum(1 for keyword in lowered_keywords if keyword in text)

        total_scores.append(min(keyword_count, 21))  # Total score is capped at 21
        distress_scores.append(min(keyword_count * 3, 105))  # Distress score is capped at 105 (21 items * 5 points)
    return total_scores, distress_scores

# Define the diagnosis criteria
def diagnose_schizophrenia(total_score, distress_score):
    """Map a (total, distress) score pair to a diagnosis label.

    Thresholds used here: total score >= 1 and distress score >= 6.

    * both met          -> "... (Balanced)"
    * only total met    -> "... (Maximizing Sensitivity)"
    * only distress met -> "... (Maximizing Specificity)"
    * neither           -> "No Psychotic Spectrum Diagnosis"
    """
    if total_score >= 1 and distress_score >= 6:
        return 'Ultra High Risk/Psychotic Syndrome (Balanced)'
    elif total_score >= 1:
        return 'Ultra High Risk/Psychotic Syndrome (Maximizing Sensitivity)'
    # This branch used to test total_score >= 6, which can never be true once
    # the total_score >= 1 branch above has been passed; the distress score is
    # the quantity the "specificity" criterion is meant to check.
    elif distress_score >= 6:
        return 'Ultra High Risk/Psychotic Syndrome (Maximizing Specificity)'
    else:
        return 'No Psychotic Spectrum Diagnosis'

# Number of bootstrap samples
n_bootstraps = 100

# One seeded generator shared by all draws: every bootstrap sample differs,
# but re-running the cell reproduces the same numbers.
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

# Scoring and diagnosis are deterministic per row, so they are computed once
# on test_data and only the resulting label vectors are resampled below.
test_total_scores, test_distress_scores = analyze_posts_for_schizophrenia(test_data['selected_posts'], schizophrenia_keywords)
test_diagnoses = [
    diagnose_schizophrenia(total, distress)
    for total, distress in zip(test_total_scores, test_distress_scores)
]

# Binary indicators: schizophrenia present in 'diseases' / flagged by the rule
all_true_labels = test_data['diseases'].apply(lambda x: 'schizophrenia' in x).astype(int).to_numpy()
all_pred_labels = np.array(
    ['Ultra High Risk/Psychotic Syndrome' in diagnosis for diagnosis in test_diagnoses],
    dtype=int,
)

# Initialize lists to store metrics for each bootstrap sample
bootstrap_f1_scores = []
bootstrap_sensitivities = []
bootstrap_specificities = []

# Perform bootstrapped sampling
for _ in range(n_bootstraps):
    # Resample rows with replacement; passing both arrays to one call keeps
    # each true label paired with its prediction.
    true_labels, pred_labels = resample(
        all_true_labels,
        all_pred_labels,
        replace=True,
        n_samples=len(all_true_labels),
        random_state=bootstrap_rng,
    )

    # Calculate F1 score (zero_division=0 keeps the 0.0 result without a warning)
    f1 = f1_score(true_labels, pred_labels, zero_division=0)
    bootstrap_f1_scores.append(f1)

    # Calculate confusion matrix components; labels=[0, 1] guarantees a 2x2
    # matrix even if a sample happens to contain a single class.
    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=[0, 1]).ravel()

    # Calculate sensitivity (recall) and specificity
    sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Append metrics
    bootstrap_sensitivities.append(sensitivity)
    bootstrap_specificities.append(specificity)

# Calculate the average and standard deviation of performance metrics
average_f1 = np.mean(bootstrap_f1_scores)
std_f1 = np.std(bootstrap_f1_scores)

average_sensitivity = np.mean(bootstrap_sensitivities)
std_sensitivity = np.std(bootstrap_sensitivities)

average_specificity = np.mean(bootstrap_specificities)
std_specificity = np.std(bootstrap_specificities)

# Print results with the specified format
print(f'Average F1 Score: {average_f1:.2f}±{std_f1:.3f}')
print(f'Average Sensitivity: {average_sensitivity:.2f}±{std_sensitivity:.3f}')
print(f'Average Specificity: {average_specificity:.2f}±{std_specificity:.3f}')


Average F1 Score: 0.02±0.003
Average Sensitivity: 0.99±0.015
Average Specificity: 0.09±0.004


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, recall_score, accuracy_score

# `data` is another name for df_all (no copy is made), so the column added
# below is also added to df_all.
data = df_all

# One lower-cased string per row: all of that row's selected posts joined by spaces
data['combined_posts'] = data['selected_posts'].map(lambda user_posts: ' '.join(user_posts).lower())

# Schizophrenia keywords from the checklist
schizophrenia_keywords = [
    'hallucinations',
    'delusions',
    'disorganized speech',
    'negative symptoms',
    'catatonia',
    'thought disorder',
    'paranoia',
    'auditory hallucinations',
    'visual hallucinations',
    'olfactory hallucinations',
    'tactile hallucinations',
    'delusion of grandeur',
    'delusion of persecution',
    'thought insertion',
    'thought broadcasting',
    'withdrawal',
    'anhedonia',
    'alogia',
    'avolition',
]

# Function to count the occurrences of schizophrenia-related keywords in posts
def keyword_count(text, keywords):
    """Return the summed ``str.count`` of every keyword in ``text``.

    Each keyword contributes its number of non-overlapping occurrences, so a
    phrase that contains another keyword is counted once for each of them.
    """
    occurrences = 0
    for term in keywords:
        occurrences += text.count(term)
    return occurrences

# Numeric feature: total keyword occurrences in each row's combined text
data['keyword_score'] = data['combined_posts'].apply(keyword_count, keywords=schizophrenia_keywords)

# Model inputs (raw text + keyword score) and the binary target:
# 1 when 'schizophrenia' appears in the row's 'diseases' entry, otherwise 0.
feature_columns = ['combined_posts', 'keyword_score']
X = data[feature_columns]
y = data['diseases'].apply(lambda labels: int('schizophrenia' in labels))

# Bag-of-words counts for the text column; the keyword score is passed through unchanged
text_step = ('txt', CountVectorizer(stop_words='english'), 'combined_posts')
score_step = ('num', 'passthrough', ['keyword_score'])
preprocessor = ColumnTransformer(transformers=[text_step, score_step])

# Full pipeline: feature extraction followed by a liblinear logistic regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear')),
])

# 5-Fold Cross-Validation Setup (shuffled, fixed seed for reproducible folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
sensitivities = []
specificities = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Pipeline.fit refits every step, so nothing carries over between folds.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 keeps the 0.0 score but avoids the warning when a fold
    # has no positive labels or predictions.
    f1 = f1_score(y_test, y_pred, zero_division=0)
    f1_scores.append(f1)

    # labels=[0, 1] forces a 2x2 matrix so the four-way unpack cannot fail
    # when a fold's labels and predictions contain only one class.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
    sensitivities.append(sensitivity)

    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    specificities.append(specificity)

# Output the average and standard deviation of scores
print(f'Average F1 Score: {np.mean(f1_scores):.2f}±{np.std(f1_scores):.3f}')
print(f'Average Sensitivity: {np.mean(sensitivities):.2f}±{np.std(sensitivities):.3f}')
print(f'Average Specificity: {np.mean(specificities):.2f}±{np.std(specificities):.3f}')


Average F1 Score: 0.22±0.064
Average Sensitivity: 0.16±0.045
Average Specificity: 1.00±0.001
