In [80]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from statsmodels.stats.contingency_tables import mcnemar

# Shared accumulator: the feature-importance cells below append one dict per
# top-ranked feature to this list, and the final cell turns it into a summary
# table. Re-running this cell empties the list.
top_feature_rows = []


In [2]:
# Root folder of the corpus; the loading cell only reads the
# negative-polarity sub-folder.
DATA_DIR = Path("op_spam_v1.4")
neg_dir = DATA_DIR.joinpath("negative_polarity")

# Source sub-folder name -> class label (1 = deceptive, 0 = truthful).
sources = dict(deceptive_from_MTurk=1, truthful_from_Web=0)

In [3]:
# Walk <neg_dir>/<source>/.../fold*/ and collect one record per review file.
rows = []
for src, y in sources.items():
    # Only directories can be folds; a stray file whose name starts with
    # "fold" is skipped instead of being treated as one.
    fold_dirs = sorted(p for p in (neg_dir / src).rglob("fold*") if p.is_dir())
    for fold_dir in fold_dirs:
        # The last character of the directory name is taken as the fold number.
        fold_id = int(fold_dir.name[-1])
        # Sort the files as well: glob order depends on the filesystem, and the
        # seeded shuffle below is only reproducible when the pre-shuffle row
        # order is the same on every machine.
        for fp in sorted(fold_dir.rglob("*.txt")):
            txt = fp.read_text(encoding="utf-8", errors="ignore")
            rows.append({"text": txt, "label": y, "fold": fold_id, "path": str(fp)})

if not rows:
    # Fail here with a clear message rather than with a KeyError on df["fold"].
    raise FileNotFoundError(f"No review files (*.txt) found under {neg_dir}")

# Seeded shuffle of all reviews, then a fresh 0..n-1 index.
df = pd.DataFrame(rows).sample(frac=1.0, random_state=42).reset_index(drop=True)

# Folds 1-4 form the training set; fold 5 is held out as the test set.
train_mask = df["fold"].isin([1,2,3,4])
test_mask  = df["fold"] == 5
# g_train carries the fold id of every training row (used as CV groups).
X_train, y_train, g_train = df.loc[train_mask, "text"], df.loc[train_mask, "label"], df.loc[train_mask, "fold"]
X_test,  y_test           = df.loc[test_mask,  "text"], df.loc[test_mask,  "label"]


### Multinomial Naive Bayes

In [79]:
from sklearn.naive_bayes import MultinomialNB

# Held-out predictions of each NB variant, in the order the variants are run;
# the McNemar cell reads them back by position.
nb_test_preds = []

nb_settings = [((1,1), "UNIGRAMS"), ((1,2), "UNIGRAMS + BIGRAMS")]
for ngram, gramlabel in nb_settings:
    # token counts -> chi-squared feature selection -> multinomial NB
    nb_pipe = Pipeline(steps=[
        ("vect", CountVectorizer(ngram_range=ngram, min_df=2, max_df=0.9, stop_words="english")),
        ("select", SelectKBest(chi2)),
        ("clf", MultinomialNB()),
    ])

    nb_grid = dict(
        clf__alpha=[0.1, 0.5, 1.0, 2.0],
        select__k=[500, 1000, 2000, "all"],
    )

    # Group the CV splits by the original fold id of each training review.
    cv = GroupKFold(n_splits=4)
    fold_splits = list(cv.split(X_train, y_train, groups=g_train))
    nb_search = GridSearchCV(
        estimator=nb_pipe,
        param_grid=nb_grid,
        scoring="accuracy",
        cv=fold_splits,
        n_jobs=-1,
        refit=True,
    )
    nb_search.fit(X_train, y_train)

    model_label = f"Multinomial NB ({gramlabel})"
    print(f"\n=== {model_label.upper()} ===")
    print("Best parameters:", nb_search.best_params_)
    print(f"CV accuracy: {nb_search.best_score_:.4f}")

    # Score the refit winner on the held-out fold.
    nb_best = nb_search.best_estimator_
    nb_pred = nb_best.predict(X_test)
    nb_test_preds.append(nb_pred)
    print("\n[TEST] Accuracy:", accuracy_score(y_test, nb_pred))
    print("Confusion matrix\n", confusion_matrix(y_test, nb_pred))
    print(classification_report(y_test, nb_pred, target_names=["truthful(0)","deceptive(1)"]))

    # Names of the features that survived selection, aligned with the
    # classifier's columns.
    feature_names = nb_best.named_steps["vect"].get_feature_names_out()
    selected_features = feature_names[nb_best.named_steps["select"].get_support()]

    # Per-feature difference of class log-probabilities (row 1 minus row 0):
    # large positive values lean towards label 1, large negative towards label 0.
    feature_probs = nb_best.named_steps["clf"].feature_log_prob_
    importance_deltas = feature_probs[1] - feature_probs[0]
    ascending = np.argsort(importance_deltas)
    top_truthful = ascending[:5]
    top_deceptive = ascending[-5:][::-1]

    report_plan = [
        ("\n Top 5 truthful features:", "truthful", top_truthful),
        ("\n Top 5 Deceptive features:", "deceptive", top_deceptive),
    ]
    for heading, class_name, picked in report_plan:
        print(heading)
        for rank, i in enumerate(picked, start=1):
            diff = importance_deltas[i]
            feature = selected_features[i]
            print(f" {feature}: {diff:.4f}")
            top_feature_rows.append({
                "model": model_label,
                "class": class_name,
                "rank": rank,
                "feature": feature,
                "importance": diff
            })




=== MULTINOMIAL NAIVE BAYES (UNIGRAMS) ===
Best parameters: {'clf__alpha': 1.0, 'select__k': 2000}
CV accuracy: 0.8328

[TEST] Accuracy: 0.88125
Confusion matrix:
 [[68 12]
 [ 7 73]]
              precision    recall  f1-score   support

 truthful(0)       0.91      0.85      0.88        80
deceptive(1)       0.86      0.91      0.88        80

    accuracy                           0.88       160
   macro avg       0.88      0.88      0.88       160
weighted avg       0.88      0.88      0.88       160


 Top 5 truthful features:
 priceline: -3.0223)
 sofa: -2.6858)
 25: -2.6858)
 fridge: -2.6168)
 stated: -2.5427)

 Top 5 Deceptive features:
 relax: 2.7303)
 originally: 2.5872)
 luxury: 2.5678)
 settled: 2.4201)
 smell: 2.3831)

=== MULTINOMIAL NAIVE BAYES (UNIGRAMS + BIGRAMS) ===
Best parameters: {'clf__alpha': 2.0, 'select__k': 'all'}
CV accuracy: 0.8422

[TEST] Accuracy: 0.875
Confusion matrix:
 [[70 10]
 [10 70]]
              precision    recall  f1-score   support

 truthful(0

In [106]:
## McNemar test: Naive Bayes, unigrams vs. unigrams + bigrams (held-out fold)
#

# per-review correctness of each model
nb_uni_correct = nb_test_preds[0] == y_test
nb_unibi_correct = nb_test_preds[1] == y_test

# 2x2 contingency table of (unigram correct?) x (uni+bi correct?)
a = (nb_uni_correct & nb_unibi_correct).sum()      # both right
b = (nb_uni_correct & ~nb_unibi_correct).sum()     # only the unigram model right
c = (nb_unibi_correct & ~nb_uni_correct).sum()     # only the uni+bi model right
d = (~nb_uni_correct & ~nb_unibi_correct).sum()    # both wrong
table = [[a, b], [c, d]]

# run the test with statsmodels' default settings
nb_mcnemar_results = mcnemar(table)

print("McNemar Test for Naive Bayes, Unigram vs Unigram+Bigram")
print(nb_mcnemar_results)

McNemar Test for Naive Bayes, Unigram vs Unigram+Bigram
pvalue      1.0
statistic   6.0


### Random Forest

In [86]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

In [87]:
# Shuffled, stratified 5-fold splitter shared by every Random Forest grid search.
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def rf_grid(vectorizer):
    """Return an unfitted grid search over a vectorizer + Random Forest pipeline.

    Parameters
    ----------
    vectorizer
        Text vectorizer instance used as the pipeline's ``"vect"`` step.

    Returns
    -------
    GridSearchCV
        Search over the forest hyper-parameters listed below, scored on F1
        and cross-validated with the module-level ``cv_inner`` splitter.
    """
    forest = RandomForestClassifier(
        n_jobs=-1, random_state=42, oob_score=True  # bootstrap=True by default
    )
    pipe = Pipeline([("vect", vectorizer), ("clf", forest)])

    # Random Forest hyper-parameters to search.
    # (Optional idea, not enabled: also search "vect__binary": [False, True].)
    param_grid = {
        "clf__n_estimators": [300, 600],
        "clf__max_features": ["sqrt", "log2", 0.2, 0.4],
        "clf__max_depth": [None, 20, 40],
        "clf__min_samples_split": [2, 5, 10],
        "clf__min_samples_leaf": [1, 2, 4],
    }
    return GridSearchCV(pipe, param_grid, scoring="f1", cv=cv_inner, n_jobs=-1, verbose=1)


In [88]:
# Tune one forest per n-gram setting: unigrams first, then unigrams + bigrams.
uni_vectorizer = CountVectorizer(ngram_range=(1,1), min_df=2, max_df=0.9, stop_words="english")
gs_uni = rf_grid(uni_vectorizer)
gs_uni.fit(X_train, y_train)

unibi_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9, stop_words="english")
gs_unibi = rf_grid(unibi_vectorizer)
gs_unibi.fit(X_train, y_train)

print("Best (unigrams):", gs_uni.best_params_, "CV F1:", gs_uni.best_score_)
print("Best (uni+bi):", gs_unibi.best_params_, "CV F1:", gs_unibi.best_score_)

# Score each tuned search on the held-out test fold (fold 5).
rf_searches = {"RF Unigrams": gs_uni, "RF Uni+Bi": gs_unibi}
for label, gs in rf_searches.items():
    y_pred = gs.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # precision / recall / F1 for the positive class (label 1)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
    print(f"{label} -> acc={acc:.3f}  prec={p:.3f}  rec={r:.3f}  f1={f1:.3f}")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [89]:
def _rf_final_pipeline(ngram_range, **forest_kwargs):
    """Build an unfitted CountVectorizer + Random Forest pipeline.

    ``ngram_range`` goes to the vectorizer; ``forest_kwargs`` are the
    per-model forest settings. The remaining forest settings are the same
    for both final models and are fixed here.
    """
    return Pipeline([
        ("vect", CountVectorizer(ngram_range=ngram_range, min_df=2, max_df=0.9, stop_words="english")),
        ("clf", RandomForestClassifier(
            bootstrap=True,
            oob_score=True,
            n_jobs=-1,
            random_state=42,
            **forest_kwargs
        )),
    ])


# Hyper-parameters are written out by hand here, not read from gs_uni / gs_unibi.
rf_uni_final = _rf_final_pipeline(
    (1,1),
    n_estimators=600,
    max_depth=20,
    max_features="log2",
    min_samples_split=10,
    min_samples_leaf=1,
)

rf_unibi_final = _rf_final_pipeline(
    (1,2),
    n_estimators=300,
    max_depth=None,
    max_features="log2",
    min_samples_split=2,
    min_samples_leaf=2,
)


In [90]:
# Fit both fixed-parameter forests on the training folds. The second call is
# the cell's last expression, so the notebook displays the fitted uni+bi pipeline.
rf_uni_final.fit(X_train, y_train)
rf_unibi_final.fit(X_train, y_train)

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [95]:
# Score the two fitted forests on the held-out test fold (fold 5).
rf_test_preds = []  # predictions in (unigram, uni+bi) order, reused by the McNemar cell
rf_final_models = (("RF Unigrams", rf_uni_final), ("RF Uni+Bi", rf_unibi_final))
for label, model in rf_final_models:
    y_pred = model.predict(X_test)
    rf_test_preds.append(y_pred)
    acc = accuracy_score(y_test, y_pred)
    # precision / recall / F1 for the positive class (label 1)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
    print(f"{label} -> acc={acc:.3f}  prec={p:.3f}  rec={r:.3f}  f1={f1:.3f}")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

RF Unigrams -> acc=0.850  prec=0.859  rec=0.838  f1=0.848
Confusion matrix:
 [[69 11]
 [13 67]]
RF Uni+Bi -> acc=0.844  prec=0.937  rec=0.738  f1=0.825
Confusion matrix:
 [[76  4]
 [21 59]]


In [92]:
# Five highest feature importances of the unigram forest, best first.
model_label = "Random Forest (Unigrams)"
feature_names = rf_uni_final.named_steps['vect'].get_feature_names_out()
importances = rf_uni_final.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1][:5]

print("Top 5 important features:")
ranked = zip(feature_names[indices], importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 important features:
chicago: 0.0188
location: 0.0085
smell: 0.0074
decided: 0.0065
luxury: 0.0060


In [93]:
# Five highest feature importances of the uni+bi forest (rf_unibi_final), best first.
model_label = "Random Forest (Uni+Bi)"
feature_names = rf_unibi_final.named_steps['vect'].get_feature_names_out()
importances = rf_unibi_final.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1][:5]

print("Top 5 important features (uni+bi):")
ranked = zip(feature_names[indices], importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 important features (uni+bi):
chicago: 0.0099
smell: 0.0089
hotel chicago: 0.0073
luxury: 0.0070
location: 0.0055


In [107]:
## McNemar test: Random Forest, unigrams vs. unigrams + bigrams (held-out fold)
#

# per-review correctness of each model
rf_uni_pred, rf_unibi_pred = rf_test_preds
rf_uni_correct = rf_uni_pred == y_test
rf_unibi_correct = rf_unibi_pred == y_test

# Only the discordant counts are filled in; the concordant cells stay at 0.
b = (rf_uni_correct & ~rf_unibi_correct).sum()   # unigram right, uni+bi wrong
c = (rf_unibi_correct & ~rf_uni_correct).sum()   # uni+bi right, unigram wrong
table = [[0, b], [c, 0]]

# run the test with statsmodels' default settings
rf_mcnemar_results = mcnemar(table)

print("McNemar Test for Random Forest, Unigram vs Unigram+Bigram")
print(rf_mcnemar_results)

McNemar Test for Random Forest, Unigram vs Unigram+Bigram
pvalue      1.0
statistic   8.0


### XGBoost

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def _xgb_search_pipeline(ngram_range):
    """Unfitted TF-IDF + XGBoost pipeline whose booster settings are left to the grid."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=ngram_range, min_df=2, max_df=0.9)),
        ("clf", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=42,
            n_jobs=-1
        )),
    ])


pipe_uni = _xgb_search_pipeline((1,1))
pipe_unibi = _xgb_search_pipeline((1,2))

# Shuffled, stratified 5-fold CV and one shared grid for both searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    "clf__n_estimators": [300, 500, 700],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [3, 4, 5]
}

search_options = dict(scoring="f1", cv=cv, n_jobs=-1)
grid_uni = GridSearchCV(pipe_uni, param_grid, **search_options)
grid_unibi = GridSearchCV(pipe_unibi, param_grid, **search_options)

grid_uni.fit(X_train, y_train)
grid_unibi.fit(X_train, y_train)

print("Best F1 (unigrams):", grid_uni.best_score_)
print("Best F1 (uni+bi):", grid_unibi.best_score_)


Best F1 (unigrams): 0.8077137544817697
Best F1 (uni+bi): 0.822032370499935


In [None]:
# Winning hyper-parameters of each XGBoost search: unigrams, then unigrams + bigrams.
for fitted_search in (grid_uni, grid_unibi):
    print(fitted_search.best_params_)

{'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 700}
{'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 700}


In [101]:
def _xgb_final_pipeline(ngram_range, **booster_kwargs):
    """Unfitted TF-IDF + XGBoost pipeline with hand-set booster hyper-parameters.

    ``booster_kwargs`` carries the per-model values (learning rate, depth,
    number of trees); the other classifier settings are the same for both models.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=ngram_range, min_df=2, max_df=0.9)),
        ("clf", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            **booster_kwargs
        )),
    ])


# Hyper-parameters are written out by hand here, not read from the grid-search objects.
pipe_uni = _xgb_final_pipeline((1,1), learning_rate=0.1, max_depth=5, n_estimators=700)
pipe_unibi = _xgb_final_pipeline((1,2), learning_rate=0.05, max_depth=3, n_estimators=700)

pipe_uni.fit(X_train, y_train)
# last expression of the cell: the notebook displays the fitted uni+bi pipeline
pipe_unibi.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [102]:
# Score the two fitted XGBoost pipelines on the held-out test fold.
xgb_test_preds = []  # predictions in (unigram, uni+bi) order, reused by the McNemar cell
xgb_final_models = (("XGBoost Unigrams", pipe_uni), ("XGBoost Uni+Bi", pipe_unibi))
for label, model in xgb_final_models:
    y_pred = model.predict(X_test)
    xgb_test_preds.append(y_pred)
    acc = accuracy_score(y_test, y_pred)
    # precision / recall / F1 for the positive class (label 1)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
    print(f"{label} -> acc={acc:.3f}  prec={p:.3f}  rec={r:.3f}  f1={f1:.3f}")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

XGBoost Unigrams -> acc=0.738  prec=0.726  rec=0.762  f1=0.744
Confusion matrix:
 [[57 23]
 [19 61]]
XGBoost Uni+Bi -> acc=0.769  prec=0.759  rec=0.787  f1=0.773
Confusion matrix:
 [[60 20]
 [17 63]]


In [103]:
# Five largest feature_importances_ values of the unigram XGBoost model, best first.
model_label = "XGBoost (Unigrams)"
feature_names = pipe_uni.named_steps['tfidf'].get_feature_names_out()
importances = pipe_uni.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1][:5]

print("Top 5 important features:")
ranked = zip(feature_names[indices], importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 important features:
east: 0.0325
looked: 0.0289
sheets: 0.0226
far: 0.0193
area: 0.0187


In [104]:
# Five largest feature_importances_ values of the uni+bi XGBoost model, best first.
model_label = "XGBoost (Uni+Bi)"
feature_names = pipe_unibi.named_steps['tfidf'].get_feature_names_out()
importances = pipe_unibi.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1][:5]

print("Top 5 important features:")
ranked = zip(feature_names[indices], importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 important features:
got to: 0.0283
sheets: 0.0271
food: 0.0187
the chicago: 0.0177
to my: 0.0177


In [110]:
## McNemar test: XGBoost, unigrams vs. unigrams + bigrams (held-out fold)
#

# per-review correctness of each model
xgb_uni_pred, xgb_unibi_pred = xgb_test_preds
xgb_uni_correct = xgb_uni_pred == y_test
xgb_unibi_correct = xgb_unibi_pred == y_test

# Only the discordant counts are filled in; the concordant cells stay at 0.
b = (xgb_uni_correct & ~xgb_unibi_correct).sum()   # unigram right, uni+bi wrong
c = (xgb_unibi_correct & ~xgb_uni_correct).sum()   # uni+bi right, unigram wrong
table = [[0, b], [c, 0]]

# run the test with statsmodels' default settings
xgb_mcnemar_results = mcnemar(table)

print("McNemar Test for XGBoost, Unigram vs Unigram+Bigram")
print(xgb_mcnemar_results)

McNemar Test for XGBoost, Unigram vs Unigram+Bigram
pvalue      0.38331031799316406
statistic   8.0


## Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

## UNIGRAM MODEL
##

# token counts feeding a logistic regression; only C is tuned
lr_pipe = Pipeline(steps=[
    ("tokenizer", CountVectorizer(ngram_range=(1,1), stop_words='english', min_df=5, max_df=0.9)),
    ("clf", LogisticRegression()),
])

# candidate C values: seven log-spaced points from 1e-4 to 1e2
C_list = np.logspace(-4, 2, 7)
lr_param_grid = {"clf__C": C_list}

# 4-fold cross-validated grid search scored on F1
grid = GridSearchCV(lr_pipe, lr_param_grid, scoring="f1", cv=4)
grid.fit(X_train, y_train)

results = grid.cv_results_

# one line per candidate: mean CV F1, then the C value
print("f1 | C\n")
print("Results for unigram models:")
for idx, params in enumerate(results['params']):
    mean_score = results['mean_test_score'][idx]
    print(f"{mean_score:.4f}: {params['clf__C']}")

print(f"Best unigram model params: {grid.best_params_}")

f1 | C

Results for unigram models:
0.6875: 0.0001
0.7195: 0.001
0.8262: 0.01
0.8462: 0.1
0.8455: 1.0
0.8417: 10.0
0.8402: 100.0
Best unigram model params: {'clf__C': np.float64(0.1)}


In [61]:
# Take the refit winner apart so the next cells can reach both steps.
best_pipe = grid.best_estimator_
best_tokenizer, best_clf = (best_pipe.named_steps[step] for step in ('tokenizer', 'clf'))

# Vectorize the held-out reviews, then classify them.
X_test_counts = best_tokenizer.transform(X_test)
lr_uni_y_pred = best_clf.predict(X_test_counts)

# precision / recall / F1 for the positive class (label 1)
p, r, f1, _ = precision_recall_fscore_support(y_test, lr_uni_y_pred, average="binary", pos_label=1)
print("Evaluation on test set for best unigram model:")
metric_lines = (
    ("f1", f1),
    ("accuracy", accuracy_score(y_test, lr_uni_y_pred)),
    ("precision", p),
    ("recall", r),
)
for metric_name, metric_value in metric_lines:
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, lr_uni_y_pred))

Evaluation on test set for best unigram model:
f1: 0.8312
accuracy: 0.8375
precision: 0.8649
recall: 0.8000

Confusion matrix:
 [[70 10]
 [16 64]]


In [62]:
# Ten coefficients with the largest absolute value, strongest first.
# The printed weight keeps its sign.
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.coef_[0]
indices = np.argsort(np.abs(feature_importances))[::-1][:10]

print("Top 10 features unigram model:")
for name, weight in zip(feature_names[indices], feature_importances[indices]):
    print(f"{name}: {weight:.4f}")

Top 10 features unigram model:
chicago: 0.7273
location: -0.4487
star: -0.4486
finally: 0.4292
luxury: 0.4211
recently: 0.4113
floor: -0.3994
great: -0.3969
decided: 0.3823
walk: -0.3460


In [63]:
## UNIGRAM+BIGRAM MODEL
##

# same pipeline as the unigram model, but the vectorizer also emits bigrams
lr_pipe = Pipeline(steps=[
    ("tokenizer", CountVectorizer(ngram_range=(1,2), stop_words='english', min_df=5, max_df=0.9)),
    ("clf", LogisticRegression()),
])

# candidate C values: seven log-spaced points from 1e-4 to 1e2
C_list = np.logspace(-4, 2, 7)
lr_param_grid = {"clf__C": C_list}

# 4-fold cross-validated grid search scored on F1
grid = GridSearchCV(lr_pipe, lr_param_grid, scoring="f1", cv=4)
grid.fit(X_train, y_train)

results = grid.cv_results_

# one line per candidate: mean CV F1, then the C value
print("f1 | C\n")
print("Results for unigram+bigram models:")
for idx, params in enumerate(results['params']):
    mean_score = results['mean_test_score'][idx]
    print(f"{mean_score:.4f}: {params['clf__C']}")

print(f"Best unigram+bigram model params: {grid.best_params_}")


# Per-split test scores of the winning candidate, kept for a later t-test.
best_index = grid.best_index_
n_splits = sum(
    1 for key in grid.cv_results_.keys()
    if key.startswith('split') and key.endswith('_test_score')
)
lr_unibi_scores = [
    grid.cv_results_[f'split{fold}_test_score'][best_index]
    for fold in range(n_splits)
]

f1 | C

Results for unigram+bigram models:
0.6821: 0.0001
0.7278: 0.001
0.8351: 0.01
0.8540: 0.1
0.8517: 1.0
0.8548: 10.0
0.8438: 100.0
Best unigram+bigram model params: {'clf__C': np.float64(10.0)}


In [65]:
# Take the refit winner apart so the next cells can reach both steps.
best_pipe = grid.best_estimator_
best_tokenizer, best_clf = (best_pipe.named_steps[step] for step in ('tokenizer', 'clf'))

# Vectorize the held-out reviews, then classify them.
X_test_counts = best_tokenizer.transform(X_test)
lr_unibi_y_pred = best_clf.predict(X_test_counts)

# precision / recall / F1 for the positive class (label 1)
p, r, f1, _ = precision_recall_fscore_support(y_test, lr_unibi_y_pred, average="binary", pos_label=1)
print("Evaluation on test set for best unigram+bigram model:")
metric_lines = (
    ("f1", f1),
    ("accuracy", accuracy_score(y_test, lr_unibi_y_pred)),
    ("precision", p),
    ("recall", r),
)
for metric_name, metric_value in metric_lines:
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, lr_unibi_y_pred))

Evaluation on test set for best unigram+bigram model:
f1: 0.8182
accuracy: 0.8250
precision: 0.8514
recall: 0.7875

Confusion matrix:
 [[69 11]
 [17 63]]


In [66]:
# Ten coefficients with the largest absolute value, strongest first.
# The printed weight keeps its sign.
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.coef_[0]
indices = np.argsort(np.abs(feature_importances))[::-1][:10]

print("Top 10 features unigram+bigram model:")
for name, weight in zip(feature_names[indices], feature_importances[indices]):
    print(f"{name}: {weight:.4f}")

Top 10 features unigram+bigram model:
star: -1.4659
chicago: 1.4299
recently: 1.2207
concierge: -1.2200
location: -1.2193
luxury: 1.1883
decided: 1.1588
floor: -1.1126
tell: -1.0944
hotel chicago: 1.0869


In [111]:
## McNemar test: logistic regression, unigrams vs. unigrams + bigrams (held-out fold)
#

# per-review correctness of each model
lr_uni_correct = lr_uni_y_pred == y_test
lr_unibi_correct = lr_unibi_y_pred == y_test

# Only the discordant counts are filled in; the concordant cells stay at 0.
b = (lr_uni_correct & ~lr_unibi_correct).sum()   # unigram right, uni+bi wrong
c = (lr_unibi_correct & ~lr_uni_correct).sum()   # uni+bi right, unigram wrong
table = [[0, b], [c, 0]]

# run the test with statsmodels' default settings
lr_mcnemar_results = mcnemar(table)

print("McNemar Test for Logistic Regression, Unigram vs Unigram+Bigram")
print(lr_mcnemar_results)

McNemar Test for Logistic Regression, Unigram vs Unigram+Bigram
pvalue      0.7744140625
statistic   5.0


## Classification Tree

In [68]:
from sklearn.tree import DecisionTreeClassifier

## UNIGRAM CLASSIFICATION TREE MODEL
#

# set up pipeline for CV
ct_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,1))),
    # min_samples_split is pinned at 2 rather than tuned (other values underperformed).
    # random_state is fixed, like every other seeded component in this notebook,
    # so that the grid search gives the same tree on every run.
    ("clf", DecisionTreeClassifier(min_samples_split=2, random_state=42))
])

# Search over tree depth, split criterion and cost-complexity pruning strength.
# Named ct_* so it no longer overwrites the logistic-regression grid.
ct_param_grid = {
    "clf__max_depth": np.arange(1, 15, 1),
    "clf__criterion": ['gini', 'entropy'],
    "clf__ccp_alpha": [0.0, 0.0001, 0.0005, 0.001, 0.005, 0.01],
}

# perform cv (4 folds, scored on F1; a failing fit raises instead of being scored)
grid = GridSearchCV(ct_pipe, ct_param_grid, scoring="f1", cv=4, error_score='raise')
grid.fit(X_train, y_train)

# evaluate cv
results = grid.cv_results_

print("Best params:", grid.best_params_)
print("Best f1:", grid.best_score_)

Best params: {'clf__ccp_alpha': 0.0001, 'clf__criterion': 'gini', 'clf__max_depth': np.int64(5)}
Best f1: 0.7368465094667834


In [69]:
# Take the refit winner apart so the next cells can reach both steps.
best_pipe = grid.best_estimator_
best_tokenizer, best_clf = (best_pipe.named_steps[step] for step in ('tokenizer', 'clf'))

# Vectorize the held-out reviews, then classify them.
X_test_counts = best_tokenizer.transform(X_test)
ct_uni_y_pred = best_clf.predict(X_test_counts)

# precision / recall / F1 for the positive class (label 1)
p, r, f1, _ = precision_recall_fscore_support(y_test, ct_uni_y_pred, average="binary", pos_label=1)
print("Evaluation on test set for best unigram model:")
metric_lines = (
    ("f1", f1),
    ("accuracy", accuracy_score(y_test, ct_uni_y_pred)),
    ("precision", p),
    ("recall", r),
)
for metric_name, metric_value in metric_lines:
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, ct_uni_y_pred))

Evaluation on test set for best unigram model:
f1: 0.6197
accuracy: 0.6625
precision: 0.7097
recall: 0.5500

Confusion matrix:
 [[62 18]
 [36 44]]


In [70]:
# Five largest feature_importances_ values of the unigram tree, best first.
# Tree importances carry no sign, so there is no truthful/deceptive split
# here; every row is recorded under the "overall" class.
model_label = "Classification Tree (Unigrams)"
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.feature_importances_
indices = np.argsort(feature_importances)[::-1][:5]

print("Top 5 features unigram model:")
ranked = zip(feature_names[indices], feature_importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 features unigram model:
chicago: 0.4015
location: 0.0824
cool: 0.0714
sheets: 0.0670
luxury: 0.0614


In [71]:
## UNIGRAM + BIGRAM
#

# set up pipeline for CV
ct_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,2))), # <-- unigram+bigram
    # random_state is fixed, like every other seeded component in this notebook,
    # so that the grid search gives the same tree on every run.
    ("clf", DecisionTreeClassifier(min_samples_split=2, random_state=42))
])

# Search over tree depth, split criterion and cost-complexity pruning strength.
# Named ct_* so it no longer overwrites the logistic-regression grid.
ct_param_grid = {
    "clf__max_depth": np.arange(1, 15, 1),
    "clf__criterion": ['gini', 'entropy'],
    "clf__ccp_alpha": [0.0, 0.0001, 0.0005, 0.001, 0.005, 0.01],
}

# perform cv (4 folds, scored on F1; a failing fit raises instead of being scored)
grid = GridSearchCV(ct_pipe, ct_param_grid, scoring="f1", cv=4, error_score='raise')
grid.fit(X_train, y_train)

# evaluate cv
results = grid.cv_results_

print("Best params:", grid.best_params_)
print("Best f1:", grid.best_score_)

Best params: {'clf__ccp_alpha': 0.0005, 'clf__criterion': 'entropy', 'clf__max_depth': np.int64(9)}
Best f1: 0.7350688078486556


In [72]:
# Take the refit winner apart so the next cells can reach both steps.
best_pipe = grid.best_estimator_
best_tokenizer, best_clf = (best_pipe.named_steps[step] for step in ('tokenizer', 'clf'))

# Vectorize the held-out reviews, then classify them.
X_test_counts = best_tokenizer.transform(X_test)
ct_unibi_y_pred = best_clf.predict(X_test_counts)

# precision / recall / F1 for the positive class (label 1)
p, r, f1, _ = precision_recall_fscore_support(y_test, ct_unibi_y_pred, average="binary", pos_label=1)
print("Evaluation on test set for best unigram+bigram model:")
metric_lines = (
    ("f1", f1),
    ("accuracy", accuracy_score(y_test, ct_unibi_y_pred)),
    ("precision", p),
    ("recall", r),
)
for metric_name, metric_value in metric_lines:
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, ct_unibi_y_pred))

Evaluation on test set for best unigram+bigram model:
f1: 0.6711
accuracy: 0.6875
precision: 0.7083
recall: 0.6375

Confusion matrix:
 [[59 21]
 [29 51]]


In [73]:
# Five largest feature_importances_ values of the uni+bi tree, best first.
model_label = "Classification Tree (Uni+Bi)"
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.feature_importances_
indices = np.argsort(feature_importances)[::-1][:5]

print("Top 5 features unigram+bigram model:")
ranked = zip(feature_names[indices], feature_importances[indices])
for rank, (feature, importance) in enumerate(ranked, start=1):
    print(f"{feature}: {importance:.4f}")
    # record the row for the summary table built in the final cell
    top_feature_rows.append({
        "model": model_label,
        "class": "overall",
        "rank": rank,
        "feature": feature,
        "importance": importance
    })



Top 5 features unigram+bigram model:
chicago: 0.2488
location: 0.0853
luxury: 0.0567
open: 0.0454
sheets: 0.0445


In [112]:
## Statistical comparison of classification tree uni/unibigram models
#

# per-review correctness of each model on the held-out fold
ct_uni_correct = ct_uni_y_pred == y_test
ct_unibi_correct = ct_unibi_y_pred == y_test

# build contingency table for the McNemar test: only the discordant counts
# are filled in; the concordant cells stay at 0
b = np.sum(ct_uni_correct & ~ct_unibi_correct)   # unigram right, uni+bi wrong
c = np.sum(~ct_uni_correct & ct_unibi_correct)   # uni+bi right, unigram wrong
table = [[0,b], [c,0]]

# perform mcnemar test
# Stored under its own ct_* name: this cell used to assign to
# lr_mcnemar_results, overwriting the logistic-regression result.
ct_mcnemar_results = mcnemar(table)

print("McNemar Test for Classification Tree, Unigram vs Unigram+Bigram")
print(ct_mcnemar_results)

McNemar Test for Classification Tree, Unigram vs Unigram+Bigram
pvalue      0.5412561893463135
statistic   10.0


In [None]:
# Build the summary table from the rows accumulated by the feature cells.
# Those cells append to a global list, so re-running one of them leaves
# duplicate (model, class, rank) rows; only the most recent row for each key
# is kept. Naming the columns up front also keeps sort_values from raising a
# KeyError when the list is still empty.
summary_columns = ['model', 'class', 'rank', 'feature', 'importance']
top_features_df = (
    pd.DataFrame(top_feature_rows, columns=summary_columns)
    .drop_duplicates(subset=['model', 'class', 'rank'], keep='last')
    .sort_values(['model', 'class', 'rank'])
    .reset_index(drop=True)
)

print('Top 5 feature summary per model:')
top_features_df
