In [None]:
import pickle
# All three splits live in the same Drive folder; keep the location in one
# place so it only has to be edited once when the data moves.
DATA_DIR = '/content/drive/MyDrive/Colab_Notebooks/MDDdataset/symptom_sum_top16'


def _load_split(split_name):
    """Unpickle ``<DATA_DIR>/<split_name>.pkl`` and return the stored object.

    Args:
        split_name: File stem of the split, e.g. ``'train'``.

    Returns:
        Whatever object was pickled into the file.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at files you created or otherwise trust.
    with open(f'{DATA_DIR}/{split_name}.pkl', 'rb') as f:
        return pickle.load(f)


raw_train_data = _load_split('train')
raw_test_data = _load_split('test')
raw_val_data = _load_split('val')

In [None]:
import pandas as pd
# Wrap each raw split in a DataFrame (same order as the targets on the left).
train_data, test_data, val_data = (
    pd.DataFrame(raw_split)
    for raw_split in (raw_train_data, raw_test_data, raw_val_data)
)

In [None]:
# Stack the splits row-wise in train -> val -> test order; the original row
# indices of each split are kept as-is.
df_all = pd.concat([train_data, val_data, test_data], axis=0)

In [None]:
# Keyword lists keyed by item number 1-20. The scoring functions in the cells
# below sum the occurrence count of every keyword in every list.
# NOTE(review): the 20 items look like they follow the PCL-5 checklist
# questions -- confirm the intended source.
#
# Consequences of how the lists are consumed (see preprocess_and_score_posts):
# - A keyword that appears under several items (e.g. 'stressful') adds its
#   count once per list it appears in.
# - Matching is done on single tokens, so a keyword containing a space can
#   never match.
ptsd_keywords = {
    1: ['memories', 'flashback', 'trauma', 'stressful', 'experience'],
    2: ['dreams', 'nightmare', 'stressful'],
    3: ['reliving', 'reexperiencing', 'flashback', 'stressful'],
    4: ['upset', 'reminder', 'stressful', 'experience'],
    5: ['physical', 'reaction', 'reminder', 'stressful', 'heart', 'breathing', 'sweating'],
    6: ['avoiding', 'memories', 'thoughts', 'feelings', 'stressful'],
    7: ['avoiding', 'reminders', 'stressful', 'people', 'places', 'conversations', 'activities'],
    8: ['trouble', 'remembering', 'stressful', 'experience'],
    9: ['negative', 'beliefs', 'thoughts', 'bad', 'wrong', 'trust', 'dangerous'],
    10: ['blaming', 'self', 'others', 'stressful', 'experience'],
    11: ['negative', 'feelings', 'fear', 'horror', 'anger', 'guilt', 'shame'],
    12: ['loss', 'interest', 'activities', 'enjoy'],
    # NOTE(review): 'cut off' is two words and therefore never matches a token.
    13: ['feeling', 'distant', 'cut off', 'people'],
    14: ['trouble', 'positive', 'feelings', 'happiness', 'love'],
    15: ['irritable', 'anger', 'outbursts', 'aggressive'],
    16: ['risks', 'harm'],
    17: ['superalert', 'watchful', 'guard'],
    18: ['jumpy', 'startled'],
    19: ['difficulty', 'concentrating'],
    # NOTE(review): 'asleep' is listed twice, so each occurrence counts double.
    20: ['trouble', 'sleeping', 'falling', 'asleep', 'staying', 'asleep']
}


In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, confusion_matrix


# Preprocess text and calculate PTSD score
def preprocess_and_score_posts(posts, keywords):
    """Count how often the given keywords occur in a user's posts.

    Args:
        posts: A single string, or a list/tuple of posts (joined with
            spaces). Any other value is converted with ``str()``.
        keywords: Mapping of item id -> list of lowercase keyword strings.

    Returns:
        int: Sum, over every keyword in every list, of the number of
        whitespace-delimited tokens equal to that keyword. A keyword that
        appears in several lists is counted once per list.
    """
    # Join multi-post input instead of calling str() on the container: str()
    # of a list is its repr, where newlines/tabs become the two characters
    # "\n"/"\t". The regex below would strip the backslash and glue the
    # leftover letter onto the neighbouring words ("dreams\nnight" ->
    # "dreamsnnight"), so those words could never match a keyword.
    if isinstance(posts, (list, tuple)):
        text = ' '.join(str(post) for post in posts)
    else:
        text = str(posts)

    # Keep letters and whitespace only, lowercase, then split into tokens.
    tokens = re.sub(r'[^a-zA-Z\s]', '', text).lower().split()
    word_counts = Counter(tokens)

    # Counter returns 0 for tokens that never occurred.
    return sum(
        word_counts[keyword]
        for keyword_list in keywords.values()
        for keyword in keyword_list
    )

# Score every row's posts with the keyword counter defined above.
df_all['ptsd_score'] = df_all['selected_posts'].apply(lambda posts: preprocess_and_score_posts(posts, ptsd_keywords))

# Ground truth: 1 when 'ptsd' is found in the row's `diseases` value, else 0.
# NOTE(review): `in` is a substring test for a str and a membership test for
# a list -- confirm which type the `diseases` column holds.
df_all['ground_truth'] = df_all['diseases'].apply(lambda x: 1 if 'ptsd' in x else 0)

# Decision threshold on the keyword score. It is constant, so it is set once
# rather than on every fold.
threshold = 31

# Nothing is fitted here; the shuffled folds only provide a spread (std) for
# the metrics of the fixed-threshold rule.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, sensitivities, specificities = [], [], []

# The fold gets its own name: rebinding `train_data`/`test_data` here would
# silently replace the original split DataFrames with slices of `df_all`.
for _, test_index in kf.split(df_all):
    fold_data = df_all.iloc[test_index]

    # Predict positive when the score exceeds the threshold
    predictions = (fold_data['ptsd_score'] > threshold).astype(int)

    f1 = f1_score(fold_data['ground_truth'], predictions)
    f1_scores.append(f1)

    # labels=[0, 1] pins the matrix to 2x2, so the 4-way unpacking cannot
    # fail when a fold contains (or predicts) only one class.
    tn, fp, fn, tp = confusion_matrix(fold_data['ground_truth'], predictions, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if tp + fn else 0
    specificity = tn / (tn + fp) if tn + fp else 0
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Calculate mean and standard deviation for each metric
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_sensitivity = np.mean(sensitivities)
std_sensitivity = np.std(sensitivities)
mean_specificity = np.mean(specificities)
std_specificity = np.std(specificities)

# Print the results in the requested format
print(f"F1 Score: {mean_f1:.2f}±{std_f1:.3f}")
print(f"Sensitivity: {mean_sensitivity:.2f}±{std_sensitivity:.3f}")
print(f"Specificity: {mean_specificity:.2f}±{std_specificity:.3f}")



F1 Score: 0.06±0.005
Sensitivity: 0.82±0.032
Specificity: 0.63±0.005


In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, confusion_matrix
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
# Download the NLTK resources requested by this cell.
# NOTE(review): the visible cells only use word_tokenize and PorterStemmer;
# 'wordnet' and 'stopwords' look unused -- confirm before dropping them.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Porter stemmer instance shared by the scoring function below
stemmer = PorterStemmer()

# Preprocess text and calculate PTSD score with improved keyword matching and stemming
def preprocess_and_score_posts(posts, keywords):
    """Count keyword occurrences in a user's posts, comparing Porter stems.

    Both the post tokens and the keywords are stemmed with the module-level
    ``stemmer`` before they are compared, so inflected forms match
    ("memory" / "memories", "avoid" / "avoiding", ...).

    Args:
        posts: A single string, or a list/tuple of posts (joined with
            spaces). Any other value is converted with ``str()``.
        keywords: Mapping of item id -> list of lowercase keyword strings.

    Returns:
        int: Sum, over every keyword in every list, of the number of stemmed
        tokens equal to that keyword's stem. A keyword that appears in
        several lists is counted once per list.
    """
    # Join multi-post input instead of calling str() on the container: str()
    # of a list is its repr, where newlines/tabs become the two characters
    # "\n"/"\t" and end up fused into the surrounding tokens.
    if isinstance(posts, (list, tuple)):
        text = ' '.join(str(post) for post in posts)
    else:
        text = str(posts)

    # Tokenize, stem, then drop non-letters and tokens that become empty.
    tokens = word_tokenize(text.lower())
    stemmed_words = (stemmer.stem(word) for word in tokens)
    cleaned_words = (re.sub(r'[^a-zA-Z]', '', word) for word in stemmed_words)
    word_counts = Counter(word for word in cleaned_words if word)

    # The keywords must go through the same stemmer as the tokens: looking up
    # the raw keyword (e.g. 'memories') can never hit a stemmed token
    # ('memori'), which silently zeroed out most of the keyword list.
    return sum(
        word_counts[stemmer.stem(keyword)]
        for keyword_list in keywords.values()
        for keyword in keyword_list
    )


# Re-score every row with the stemming-based scorer defined in this cell.
df_all['ptsd_score'] = df_all['selected_posts'].apply(lambda posts: preprocess_and_score_posts(posts, ptsd_keywords))

# Ground truth: 1 when 'ptsd' is found in the row's `diseases` value, else 0.
# NOTE(review): `in` is a substring test for a str and a membership test for
# a list -- confirm which type the `diseases` column holds.
df_all['ground_truth'] = df_all['diseases'].apply(lambda x: 1 if 'ptsd' in x else 0)

# Decision threshold on the keyword score (adjust based on model tuning).
# It is constant, so it is set once rather than on every fold.
threshold = 31

# Nothing is fitted here; the shuffled folds only provide a spread (std) for
# the metrics of the fixed-threshold rule.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, sensitivities, specificities = [], [], []

# The fold gets its own name: rebinding `train_data`/`test_data` here would
# silently replace the original split DataFrames with slices of `df_all`.
for _, test_index in kf.split(df_all):
    fold_data = df_all.iloc[test_index]

    # Predict positive when the score exceeds the threshold
    predictions = (fold_data['ptsd_score'] > threshold).astype(int)

    f1 = f1_score(fold_data['ground_truth'], predictions)
    f1_scores.append(f1)

    # labels=[0, 1] pins the matrix to 2x2, so the 4-way unpacking cannot
    # fail when a fold contains (or predicts) only one class.
    tn, fp, fn, tp = confusion_matrix(fold_data['ground_truth'], predictions, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if tp + fn else 0
    specificity = tn / (tn + fp) if tn + fp else 0
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Calculate mean and standard deviation for each metric
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_sensitivity = np.mean(sensitivities)
std_sensitivity = np.std(sensitivities)
mean_specificity = np.mean(specificities)
std_specificity = np.std(specificities)

# Print the results in the requested format
print(f"F1 Score: {mean_f1:.2f}±{std_f1:.3f}")
print(f"Sensitivity: {mean_sensitivity:.2f}±{std_sensitivity:.3f}")
print(f"Specificity: {mean_specificity:.2f}±{std_specificity:.3f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


F1 Score: 0.13±0.017
Sensitivity: 0.36±0.039
Specificity: 0.94±0.003


In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.utils import resample
from sklearn.metrics import f1_score, confusion_matrix

# Function to preprocess and score posts
def preprocess_and_score_posts(posts, keywords):
    """Count how often the given keywords occur in a user's posts.

    Args:
        posts: A single string, or a list/tuple of posts (joined with
            spaces). Any other value is converted with ``str()``.
        keywords: Mapping of item id -> list of lowercase keyword strings.

    Returns:
        int: Sum, over every keyword in every list, of the number of
        whitespace-delimited tokens equal to that keyword. A keyword that
        appears in several lists is counted once per list.
    """
    # Join multi-post input instead of calling str() on the container: str()
    # of a list is its repr, where newlines/tabs become the two characters
    # "\n"/"\t". The regex below would strip the backslash and glue the
    # leftover letter onto the neighbouring words ("dreams\nnight" ->
    # "dreamsnnight"), so those words could never match a keyword.
    if isinstance(posts, (list, tuple)):
        text = ' '.join(str(post) for post in posts)
    else:
        text = str(posts)

    # Keep letters and whitespace only, lowercase, then split into tokens.
    tokens = re.sub(r'[^a-zA-Z\s]', '', text).lower().split()
    word_counts = Counter(tokens)

    # Counter returns 0 for tokens that never occurred.
    return sum(
        word_counts[keyword]
        for keyword_list in keywords.values()
        for keyword in keyword_list
    )

# Work on an explicit copy: `test_data` may be a slice of another frame at
# this point, and assigning a column to a slice triggers pandas'
# SettingWithCopyWarning.
test_data = test_data.copy()

# Apply the scoring function to test data
test_data['ptsd_score'] = test_data['selected_posts'].apply(lambda posts: preprocess_and_score_posts(posts, ptsd_keywords))

# Real labels, derived from `diseases` the same way as in the
# cross-validation cells. Scoring predictions against randomly generated
# labels (as this cell did before) produces numbers that say nothing about
# the classifier.
test_data['ground_truth'] = test_data['diseases'].apply(lambda x: 1 if 'ptsd' in x else 0)

# Initialize lists to hold the metrics from each bootstrap sample
f1_scores, sensitivities, specificities = [], [], []

# Number of bootstrap samples
n_iterations = 100

# One seeded generator shared by all iterations: every iteration draws a
# different sample, yet the whole run is reproducible.
bootstrap_rng = np.random.RandomState(42)

# Bootstrapping
for _ in range(n_iterations):
    # Resample the test data with replacement (rows keep their own labels)
    bootstrapped_sample = resample(test_data, replace=True, n_samples=len(test_data), random_state=bootstrap_rng)

    # Predict based on the PTSD score with threshold of 31
    predictions = (bootstrapped_sample['ptsd_score'] > 31).astype(int)
    ground_truth = bootstrapped_sample['ground_truth']

    # Calculate F1 score, sensitivity, and specificity.
    # labels=[0, 1] pins the matrix to 2x2, so the 4-way unpacking cannot
    # fail when a sample contains (or predicts) only one class.
    f1 = f1_score(ground_truth, predictions)
    tn, fp, fn, tp = confusion_matrix(ground_truth, predictions, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if tp + fn else 0
    specificity = tn / (tn + fp) if tn + fp else 0

    # Append the results to the lists
    f1_scores.append(f1)
    sensitivities.append(sensitivity)
    specificities.append(specificity)

# Calculate the mean and standard deviation for each metric
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_sensitivity = np.mean(sensitivities)
std_sensitivity = np.std(sensitivities)
mean_specificity = np.mean(specificities)
std_specificity = np.std(specificities)

# Print the results in the requested format
print(f"F1 Score: {mean_f1:.2f}±{std_f1:.3f}")
print(f"Sensitivity: {mean_sensitivity:.2f}±{std_sensitivity:.3f}")
print(f"Specificity: {mean_specificity:.2f}±{std_specificity:.3f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['ptsd_score'] = test_data['selected_posts'].apply(lambda posts: preprocess_and_score_posts(posts, ptsd_keywords))


F1 Score: 0.43±0.009
Sensitivity: 0.38±0.010
Specificity: 0.62±0.010


In [None]:
# Number of entries in `predictions` (left over from the last cell that set
# it) that are equal to 1.
count = sum(1 for value in predictions if value == 1)
count

8