# Fake or Real: Baseline Model

This notebook trains a logistic regression model on character n-gram TF-IDF features to tell real texts from fake ones. It reports 5-fold cross-validated pairwise accuracy (how often the real text of each pair is picked) and then generates predictions for the competition test set.

In [None]:
import os, glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Load training data
# Each row of train.csv yields two samples: file_1.txt and file_2.txt from the
# article's folder. The file whose number equals `real_text_id` gets label 1,
# the other gets label 0.
train_csv = pd.read_csv("data/train.csv")
base_dir = 'data/train'
texts, labels, article_ids = [], [], []
for idx, real_id in train_csv['real_text_id'].items():
    # NOTE(review): the folder name is derived from the DataFrame index, not
    # from a column of the CSV -- confirm that rows are in folder order.
    article_dir = os.path.join(base_dir, f"article_{idx:04d}")
    for file_no in (1, 2):
        # NOTE(review): no encoding is passed, so the platform default is
        # used -- confirm the text files decode correctly under it.
        with open(os.path.join(article_dir, f"file_{file_no}.txt")) as fh:
            texts.append(fh.read())
        labels.append(int(real_id == file_no))
        article_ids.append(idx)

# Character-level TF-IDF over 3- to 5-character n-grams, with the vocabulary
# capped at 8000 features.
tfidf_options = dict(analyzer='char', ngram_range=(3, 5), max_features=8000)
vectorizer = TfidfVectorizer(**tfidf_options)
# The vectorizer is fitted on every training text at once.
X = vectorizer.fit_transform(texts)
# `y` is the same list object as `labels`; entry k is the label of row k of X.
y = labels

# Pairwise 5-fold cross-validation.
#
# Folds are drawn over articles, not over individual rows. Splitting rows
# directly can send the two texts of one article to different folds: the
# training side then sees the sibling of a validation text (leakage), and the
# validation side is left with half-pairs that cannot be scored. Grouping by
# article keeps every pair intact, so each validation article is scored.
rows_by_article = {}
for row_idx, art in enumerate(article_ids):
    rows_by_article.setdefault(art, []).append(row_idx)
articles = list(rows_by_article)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
acc = []
for tr_art, va_art in kf.split(articles):
    # Expand the article-level split back to row indices of X / y.
    tr = [r for a in tr_art for r in rows_by_article[articles[a]]]
    va = [r for a in va_art for r in rows_by_article[articles[a]]]

    clf = LogisticRegression(max_iter=500)
    clf.fit(X[tr], [y[i] for i in tr])
    probs = clf.predict_proba(X[va])[:, 1]

    # Collect (probability of "real", row index) for each validation article.
    pair = {}
    for idx_, p in zip(va, probs):
        pair.setdefault(article_ids[idx_], []).append((p, idx_))

    corr = tot = 0
    for vals in pair.values():
        if len(vals) != 2:
            # Only articles with exactly two texts form a scorable pair.
            continue
        (p1, i1), (p2, i2) = vals
        # Predict the text with the higher "real" probability; ties go to the
        # first text of the pair.
        pred = i1 if p1 >= p2 else i2
        if y[pred] == 1:
            corr += 1
        tot += 1
    acc.append(corr / tot)
print('Cross-validation pairwise accuracy:', sum(acc)/len(acc))

# Train on full data
# Final model: same settings as the cross-validation models, fitted on every
# training row. `fit` returns the estimator, so the call can be chained.
clf = LogisticRegression(max_iter=500).fit(X, y)

# Generate predictions for test set
# Texts are stored in the order file_1, file_2 for each folder, so rows 2k and
# 2k+1 of X_test belong to the k-th entry of `ids`.
texts_test = []
ids = []
for path in sorted(glob.glob('data/test/*')):
    folder = os.path.basename(path)
    # The numeric id is the second '_'-separated piece of the folder name.
    ids.append(int(folder.split('_')[1]))
    for name in ('file_1.txt', 'file_2.txt'):
        # NOTE(review): no encoding is passed, so the platform default is
        # used -- confirm the text files decode correctly under it.
        with open(os.path.join(path, name)) as fh:
            texts_test.append(fh.read())

X_test = vectorizer.transform(texts_test)
probs = clf.predict_proba(X_test)[:, 1]

# Even positions hold file_1 probabilities, odd positions hold file_2. The
# text with the higher "real" probability wins; ties go to file_1.
res = [
    {'id': aid, 'real_text_id': 1 if first >= second else 2}
    for aid, first, second in zip(ids, probs[0::2], probs[1::2])
]
sub = pd.DataFrame(res).sort_values('id')
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv')