In [11]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [12]:
# Dataset root, given relative to the notebook's working directory.
DATA_DIR = Path("op_spam_v1.4")
# This section reads reviews from the negative-polarity sub-directory.
neg_dir = DATA_DIR.joinpath("negative_polarity")

# Source sub-directory -> class label (1 = deceptive, 0 = truthful).
sources = dict(deceptive_from_MTurk=1, truthful_from_Web=0)

In [13]:
# Build one record per review file: raw text, class label, fold number
# (parsed from the "foldN" directory name) and the file path.
rows = []
for src, y in sources.items():
    # Only directories can be folds; sorting fixes the traversal order.
    fold_dirs = sorted(p for p in (neg_dir / src).rglob("fold*") if p.is_dir())
    for fold_dir in fold_dirs:
        # Use every trailing digit so that "fold10" maps to 10, not 0.
        name_prefix = fold_dir.name.rstrip("0123456789")
        fold_digits = fold_dir.name[len(name_prefix):]
        if not fold_digits:
            raise ValueError(f"Cannot parse a fold number from directory name: {fold_dir}")
        fold_id = int(fold_digits)
        # Directory listings come back in filesystem-dependent order. Sorting
        # the files makes the seeded shuffle below reproducible across machines.
        for fp in sorted(fold_dir.rglob("*.txt")):
            # Undecodable bytes are dropped instead of aborting the load.
            txt = fp.read_text(encoding="utf-8", errors="ignore")
            rows.append({"text": txt, "label": y, "fold": fold_id, "path": str(fp)})

# Fail early with a clear message instead of a KeyError on df["fold"] below.
if not rows:
    raise FileNotFoundError(f"No review files (*.txt) found under {neg_dir.resolve()}")

# Seeded shuffle of all rows, followed by a fresh 0..n-1 index.
df = pd.DataFrame(rows).sample(frac=1.0, random_state=42).reset_index(drop=True)

# Folds 1-4 form the training set (and the CV groups); fold 5 is held out for testing.
train_mask = df["fold"].isin([1,2,3,4])
test_mask  = df["fold"] == 5
X_train, y_train, g_train = df.loc[train_mask, "text"], df.loc[train_mask, "label"], df.loc[train_mask, "fold"]
X_test,  y_test           = df.loc[test_mask,  "text"], df.loc[test_mask,  "label"]


### Random Forest

In [None]:
# Random-forest pipeline: TF-IDF uni/bi-grams -> 300-component truncated SVD -> forest.
rf_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9)),
    ("svd", TruncatedSVD(n_components=300, random_state=42)),
    ("clf",  RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Hyper-parameters searched for the forest step (2 * 3 * 2 * 2 = 24 candidates).
rf_grid = {
    "clf__n_estimators": [200, 500],
    "clf__max_depth": [None, 20, 40],
    "clf__max_features": ["sqrt", 0.3],
    "clf__min_samples_leaf": [1, 2]
}

# Grouped CV keeps every original fold together; g_train is the fold column
# restricted to training folds 1-4, hence four splits.
cv = GroupKFold(n_splits=4)

# Materialise the splits as a list. `cv.split(...)` returns a one-shot
# generator: passing it straight to GridSearchCV means the first `fit`
# exhausts it, and re-running the fit cell then fails on an empty CV iterator.
rf_cv_splits = list(cv.split(X_train, y_train, groups=g_train))

rf_search = GridSearchCV(
    rf_pipe,
    rf_grid,
    scoring="accuracy",
    cv=rf_cv_splits,
    n_jobs=-1,
    refit=True
)


In [15]:
# Run the grid search on the training folds.
rf_search.fit(X_train, y_train)

# Cross-validation summary for the selected configuration.
print(
    "\n=== RANDOM FOREST ===",
    f"Najlepsze parametry: {rf_search.best_params_}",
    f"CV accuracy: {rf_search.best_score_:.4f}",
    sep="\n",
)

# Score the best estimator found by the search on the held-out test fold.
rf_best = rf_search.best_estimator_
rf_pred = rf_best.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, rf_pred)
rf_confusion = confusion_matrix(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred, target_names=["truthful(0)","deceptive(1)"])

print("\n[TEST] Accuracy:", rf_test_accuracy)
print("Confusion matrix:\n", rf_confusion)
print(rf_report)


=== RANDOM FOREST ===
Najlepsze parametry: {'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 1, 'clf__n_estimators': 200}
CV accuracy: 0.7391

[TEST] Accuracy: 0.78125
Confusion matrix:
 [[67 13]
 [22 58]]
              precision    recall  f1-score   support

 truthful(0)       0.75      0.84      0.79        80
deceptive(1)       0.82      0.72      0.77        80

    accuracy                           0.78       160
   macro avg       0.78      0.78      0.78       160
weighted avg       0.78      0.78      0.78       160



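### Histogram-based Gradient Boosting (scikit-learn `HistGradientBoostingClassifier`, used here in place of XGBoost)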

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Gradient-boosting pipeline: TF-IDF uni/bi-grams -> 300-component truncated SVD -> HGB.
hgb_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9)),
    ("svd", TruncatedSVD(n_components=300, random_state=42)),
    ("clf",  HistGradientBoostingClassifier(random_state=42))
])

# Searched hyper-parameters. The single-valued entries are fixed settings
# that are kept in the grid so they show up in `best_params_`.
hgb_grid = {
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [None, 6, 10],
    "clf__max_leaf_nodes": [31, 63],
    "clf__l2_regularization": [0.0, 1.0],
    "clf__max_bins": [255],
    "clf__early_stopping": [True],
    "clf__validation_fraction": [0.1]
}

# Materialise the grouped splits as a list. `cv.split(...)` returns a one-shot
# generator: passing it straight to GridSearchCV means the first `fit`
# exhausts it, and re-running the fit cell then fails on an empty CV iterator.
hgb_cv_splits = list(cv.split(X_train, y_train, groups=g_train))

hgb_search = GridSearchCV(
    hgb_pipe,
    hgb_grid,
    scoring="accuracy",
    cv=hgb_cv_splits,
    n_jobs=-1,
    refit=True
)

In [17]:
# Run the grid search on the training folds.
hgb_search.fit(X_train, y_train)

# Cross-validation summary for the selected configuration.
print(
    "\n=== HIST GRADIENT BOOSTING ===",
    f"Najlepsze parametry: {hgb_search.best_params_}",
    f"CV accuracy: {hgb_search.best_score_:.4f}",
    sep="\n",
)

# Score the best estimator found by the search on the held-out test fold.
hgb_best = hgb_search.best_estimator_
hgb_pred = hgb_best.predict(X_test)

hgb_test_accuracy = accuracy_score(y_test, hgb_pred)
hgb_confusion = confusion_matrix(y_test, hgb_pred)
hgb_report = classification_report(y_test, hgb_pred, target_names=["truthful(0)","deceptive(1)"])

print("\n[TEST] Accuracy:", hgb_test_accuracy)
print("Confusion matrix:\n", hgb_confusion)
print(hgb_report)


=== HIST GRADIENT BOOSTING ===
Najlepsze parametry: {'clf__early_stopping': True, 'clf__l2_regularization': 0.0, 'clf__learning_rate': 0.1, 'clf__max_bins': 255, 'clf__max_depth': 6, 'clf__max_leaf_nodes': 31, 'clf__validation_fraction': 0.1}
CV accuracy: 0.7422

[TEST] Accuracy: 0.775
Confusion matrix:
 [[71  9]
 [27 53]]
              precision    recall  f1-score   support

 truthful(0)       0.72      0.89      0.80        80
deceptive(1)       0.85      0.66      0.75        80

    accuracy                           0.78       160
   macro avg       0.79      0.77      0.77       160
weighted avg       0.79      0.78      0.77       160

