In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import numpy as np

# 1. Read the labeled examples from QTL_text.json
with open('QTL_text.json', 'r', encoding='utf-8') as f:
    labeled_data = json.load(f)

# Build the frame in one chain:
#   - 'text' is the lowercased Title and Abstract joined by a single space
#     (missing values are treated as empty strings);
#   - 'Category' is stored as an integer label.
df_labeled = pd.DataFrame(labeled_data).assign(
    text=lambda frame: (
        frame['Title'].fillna('').str.lower()
        + ' '
        + frame['Abstract'].fillna('').str.lower()
    ),
    Category=lambda frame: frame['Category'].astype(int),
)

# 2. Hold out 20% of the labeled rows as a Dev set.
# The split is stratified on Category and seeded so it is repeatable.
category_labels = df_labeled['Category']
X_train, X_dev, y_train, y_dev = train_test_split(
    df_labeled['text'],
    category_labels,
    stratify=category_labels,
    random_state=42,
    test_size=0.2,
)

# 3. Assemble the text classifier: TF-IDF features feeding a logistic regression
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # unigrams + bigrams
    stop_words='english',    # use the built-in English stop-word list
    max_features=10000,      # cap on vocabulary size
)
logreg_classifier = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
text_clf = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('clf', logreg_classifier),
])

# Fit both pipeline steps on the training split only
text_clf.fit(X_train, y_train)

# 4. Evaluate on Dev with the pipeline's own predict() (Standard threshold = 0.5)
y_pred = text_clf.predict(X_dev)
print("=== Standard Threshold (0.5) Results ===")
print(classification_report(y_dev, y_pred))

# 5. Evaluate on Dev again with a custom cut-off on the class-1 probability.
# NOTE(review): 0.17 is a hand-picked value and is judged here on the same Dev
# split, so these numbers may be optimistic — confirm how it was chosen.
threshold = 0.17
dev_proba = text_clf.predict_proba(X_dev)
proba_class1_dev = dev_proba[:, 1]  # second probability column, taken as class 1
y_pred_custom = (proba_class1_dev >= threshold).astype(int)

print(f"=== Custom Threshold ({threshold}) Results ===")
print(classification_report(y_dev, y_pred_custom))

# 6. Load the unlabeled test set (tab-separated); PMID is read as a string
df_test = pd.read_csv('test_unlabeled.tsv', sep='\t', dtype={'PMID': str})

# Same text preparation as for the labeled data: lowercase Title and Abstract,
# treat missing values as empty strings, and join with a single space.
test_titles = df_test['Title'].fillna('').str.lower()
test_abstracts = df_test['Abstract'].fillna('').str.lower()
df_test['text'] = test_titles + ' ' + test_abstracts

# 7. Score the test texts and label a row 1 when its class-1 probability
# reaches the chosen threshold, otherwise 0
test_proba = text_clf.predict_proba(df_test['text'])
proba_class1_test = test_proba[:, 1]
df_test['Label'] = (proba_class1_test >= threshold).astype(int)

# 8. Write only the PMID and Label columns as a comma-separated file with a
# header row and no index column
df_test.to_csv(
    'M1.4_test_labeled.csv',
    columns=['PMID', 'Label'],
    sep=',',
    index=False,
    encoding="utf-8",
)

print(f"\nSaved predictions to 'M1.4_test_labeled.csv' with threshold={threshold}!")


=== Standard Threshold (0.5) Results ===
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2055
           1       0.65      0.92      0.76       201

    accuracy                           0.95      2256
   macro avg       0.82      0.93      0.87      2256
weighted avg       0.96      0.95      0.95      2256

=== Custom Threshold (0.17) Results ===
              precision    recall  f1-score   support

           0       1.00      0.87      0.93      2055
           1       0.44      1.00      0.61       201

    accuracy                           0.88      2256
   macro avg       0.72      0.94      0.77      2256
weighted avg       0.95      0.88      0.90      2256


Saved predictions to 'M1.4_test_labeled.csv' with threshold=0.17!
