In [4]:
import pickle
with open('/content/drive/MyDrive/Colab Notebooks/MDDdataset/symptom_sum_top16/train.pkl', 'rb') as f:
      raw_train_data = pickle.load(f)
with open('/content/drive/MyDrive/Colab Notebooks/MDDdataset/symptom_sum_top16/test.pkl', 'rb') as f:
      raw_test_data = pickle.load(f)
with open('/content/drive/MyDrive/Colab Notebooks/MDDdataset/symptom_sum_top16/val.pkl', 'rb') as f:
      raw_val_data = pickle.load(f)

In [14]:
import pandas as pd
train_data=pd.DataFrame(raw_train_data)
test_data=pd.DataFrame(raw_test_data)
val_data=pd.DataFrame(raw_val_data)

In [17]:
df_all = pd.concat([train_data, val_data], axis=0)
df_all = pd.concat([df_all, test_data], axis=0)

In [7]:
|
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.utils import resample

# Load your datasets
train = train_data
test = test_data


# Function to convert a string representation of a list into a single string of posts
def convert_posts(posts):
    if isinstance(posts, str):
        try:
            # Attempt to evaluate the string as a list
            posts_list = eval(posts)
            if isinstance(posts_list, list):
                return ' '.join(posts_list)
        except:
            return posts
    return ' '.join(posts)

# Convert the list of posts into a single string for each entry
train['selected_posts'] = train['selected_posts'].apply(convert_posts)
test['selected_posts'] = test['selected_posts'].apply(convert_posts)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train['selected_posts'])
X_test = vectorizer.transform(test['selected_posts'])

# Function to check if 'adhd' is in the diseases list
def check_adhd(diseases):
    # Check if 'adhd' is in the list of diseases
    return 'adhd' in [disease.lower() for disease in diseases]

# Label the training and test data for ADHD
train['has_adhd'] = train['diseases'].apply(check_adhd)
test['has_adhd'] = test['diseases'].apply(check_adhd)

# Train a classifier
classifier = LogisticRegression()
classifier.fit(X_train, train['has_adhd'])

# Predict probabilities on the test data
test['adhd_probability'] = classifier.predict_proba(X_test)[:, 1]

# Convert probabilities to ADHD ratings based on thresholds
def probability_to_rating(prob):
    if prob >= 0.75:
        return 'Very much'
    elif prob >= 0.5:
        return 'Pretty much'
    elif prob >= 0.25:
        return 'Somewhat'
    else:
        return 'Not at all'

test['adhd_rating'] = test['adhd_probability'].apply(probability_to_rating)

# Function to convert ratings to binary predictions for metrics calculation
def rating_to_binary(rating):
    return rating in ['Very much', 'Pretty much']

# Convert ratings to binary predictions
test['predicted_adhd'] = test['adhd_rating'].apply(rating_to_binary)

# Bootstrap sampling to estimate metrics and their standard deviations
def bootstrap_metrics(data, n_iterations=100):
    f1_scores, sensitivities, specificities = [], [], []
    for _ in range(n_iterations):
        sample = resample(data)
        f1, sensitivity, specificity = compute_metrics(sample['has_adhd'], sample['predicted_adhd'])
        f1_scores.append(f1)
        sensitivities.append(sensitivity)
        specificities.append(specificity)

    return {
        "f1_mean": np.mean(f1_scores),
        "f1_std": np.std(f1_scores),
        "sensitivity_mean": np.mean(sensitivities),
        "sensitivity_std": np.std(sensitivities),
        "specificity_mean": np.mean(specificities),
        "specificity_std": np.std(specificities)
    }

# Function to compute metrics
def compute_metrics(actual, predicted):
    f1 = f1_score(actual, predicted)
    sensitivity = recall_score(actual, predicted)
    specificity = recall_score(actual, predicted, pos_label=False)
    return f1, sensitivity, specificity

# Compute and print the metrics using bootstrapped sampling
metrics = bootstrap_metrics(test)
print(metrics)

{'f1_mean': 0.3933247814520313, 'f1_std': 0.03201271326899809, 'sensitivity_mean': 0.260814402261038, 'sensitivity_std': 0.026278254107860666, 'specificity_mean': 0.9936039792254308, 'specificity_std': 0.001607758171108381}


In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, recall_score, precision_score
from scipy.sparse import hstack, csr_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load dataset
data = df_all

# Preprocess posts
def preprocess_posts(posts):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(posts.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]
    return ' '.join(filtered_tokens)

# Function to calculate severity scores based on ADHD symptoms
def calculate_severity_scores(text):
    severity_mapping = {
        "not at all": 0,
        "somewhat": 1,
        "pretty much": 2,
        "very much": 3
    }
    symptoms = {
        "attention": ["careless", "inattentive", "distracted"],
        "hyperactivity": ["restless", "fidgety"],
        "impulsivity": ["interrupts", "impatient"]
    }
    score = 0
    words = text.split()
    for word in words:
        for group, keywords in symptoms.items():
            if any(keyword in word for keyword in keywords):
                score += severity_mapping.get(word, 0)
    return score

# Process text data and compute scores
data['processed_posts'] = data['selected_posts'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
data['processed_posts'] = data['processed_posts'].apply(preprocess_posts)
data['severity_scores'] = data['processed_posts'].apply(calculate_severity_scores)

# Extract features using CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_text_features = vectorizer.fit_transform(data['processed_posts'])
X_severity_scores = np.array(data['severity_scores']).reshape(-1, 1)

# Combine features into one matrix and convert to CSR format
X = hstack([X_text_features, X_severity_scores]).tocsr()

# Define labels
y = data['diseases'].apply(lambda diseases: 'adhd' in [d.lower() for d in diseases]).values

# Initialize Logistic Regression
classifier = LogisticRegression()

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, sensitivities, specificities = [], [], []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict outcomes
    y_pred = classifier.predict(X_test)

    # Compute metrics
    f1_scores.append(f1_score(y_test, y_pred))
    sensitivities.append(recall_score(y_test, y_pred))
    specificities.append(recall_score(y_test, y_pred, pos_label=False))

# Compute and display metrics
metrics = {
    "f1": f"{np.mean(f1_scores):.4f}±{np.std(f1_scores):.4f}",
    "sensitivity": f"{np.mean(sensitivities):.4f}±{np.std(sensitivities):.4f}",
    "specificity": f"{np.mean(specificities):.4f}±{np.std(specificities):.4f}"
}
print("Metrics:", metrics)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn

Metrics: {'f1': '0.4144±0.0336', 'sensitivity': '0.3968±0.0303', 'specificity': '0.9492±0.0045'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
