In [5]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [6]:
DATA_DIR = Path("op_spam_v1.4")
neg_dir = DATA_DIR / "negative_polarity"

sources = {
    "deceptive_from_MTurk": 1, 
    "truthful_from_Web": 0
}

In [7]:
rows = []
for src, y in sources.items():
    for fold_name in sorted((neg_dir / src).rglob("fold*")):
        fold_id = int(fold_name.name[-1])
        for fp in fold_name.rglob("*.txt"):
            txt = Path(fp).read_text(encoding="utf-8", errors="ignore")
            rows.append({"text": txt, "label": y, "fold": fold_id, "path": str(fp)})

df = pd.DataFrame(rows).sample(frac=1.0, random_state=42).reset_index(drop=True)

train_mask = df["fold"].isin([1,2,3,4])
test_mask  = df["fold"] == 5
X_train, y_train, g_train = df.loc[train_mask, "text"], df.loc[train_mask, "label"], df.loc[train_mask, "fold"]
X_test,  y_test           = df.loc[test_mask,  "text"], df.loc[test_mask,  "label"]


In [17]:
# create new unigram
max_count = 0.9
min_count = 0.1
stop_words = 'english'
unigram_tokenizer = CountVectorizer(ngram_range=(1,1), max_df=max_count,  min_df=min_count, stop_words=stop_words)
X_train_uni = unigram_tokenizer.fit_transform(X_train) 

# print some unigram information again
features = unigram_tokenizer.get_feature_names_out()
counts = X_train_uni.sum(axis=0).A1
feature_counts = pd.DataFrame({
    'feature': features,
    'count': counts
}).sort_values(by='count', ascending=False)
print(feature_counts)
print("Number of features (vocabulary size):", X_train_uni.shape[1])

      feature  count
67       room   1490
39      hotel   1378
17    chicago    535
76       stay    531
72    service    387
..        ...    ...
20    decided     73
64  recommend     72
58        pay     72
4        away     70
57    overall     70

[94 rows x 2 columns]
Number of features (vocabulary size): 94


## Unigram model

In [None]:
## UNIGRAM CLASSIFICATION TREE MODEL
#

# set up pipeline for CV
ct_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,1))),
    ("clf", DecisionTreeClassifier())
])

# define hyperparams
lr_param_grid = {
    "clf__max_depth": [3,5,7],
    'clf__min_samples_leaf': [1,2],
    'clf__min_samples_split': [2,5],
    'clf__max_features': ['sqrt', 'log2'],  # given the massive vocabulary, this seems okay
    "clf__criterion": ['gini', 'entropy', 'log_loss'],
    "clf__ccp_alpha": np.arange(0.01, 0.11, 0.01)   
}

# perform cv
grid = GridSearchCV(ct_pipe, lr_param_grid, scoring="f1", cv=4, error_score='raise')
grid.fit(X_train, y_train)

# evaluate cv
results = grid.cv_results_

print("Best params:", grid.best_params_)
print("Best f1:", grid.best_score_)

{'clf__ccp_alpha': np.float64(0.01), 'clf__criterion': 'gini', 'clf__max_depth': 7, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5}
0.7085971528771962


In [31]:
# General performance of best model
best_pipe = grid.best_estimator_
best_tokenizer = best_pipe.named_steps['tokenizer']
best_clf = best_pipe.named_steps['clf']

y_pred = best_clf.predict(best_tokenizer.transform(X_test))
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
print(f"Evaluation on test set for best unigram model:" )
print(f"f1: {f1:.4f}")
print(f"accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"precision: {p:.4f}")
print(f"recall: {r:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Evaluation on test set for best unigram model:
f1: 0.6309
accuracy: 0.6562
precision: 0.6812
recall: 0.5875

Confusion matrix:
 [[58 22]
 [33 47]]


In [32]:
# top 5 features
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.feature_importances_
indices = np.argsort(feature_importances)[-5:][::-1]

print("Top 5 features unigram model:")
for i in indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

Top 5 features unigram model:
chicago: 0.7129
windows: 0.0948
michigan: 0.0818
stated: 0.0669
peeling: 0.0435


Performance is pretty terrible, however the Chicago feature pops up again as an important one. Maybe an indication that the model is learning, but terrible for the task. Pruning the vocabulary more did not result in better accuracy either (in fact the opposite).

## Unigram + bigram model

In [47]:
## UNIGRAM + BIGRAM
#

# set up pipeline for CV
ct_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,2))), # <-- unigram+bigram
    ("clf", DecisionTreeClassifier())
])

# define hyperparams
lr_param_grid = {
    "clf__max_depth": [3,5,7],
    'clf__min_samples_leaf': [1,2],
    'clf__min_samples_split': [2,5],
    'clf__max_features': ['sqrt', 'log2'],  # given the massive vocabulary, this seems okay
    "clf__criterion": ['gini', 'entropy', 'log_loss'],
    "clf__ccp_alpha": np.arange(0.01, 0.11, 0.01)   
}

# perform cv
grid = GridSearchCV(ct_pipe, lr_param_grid, scoring="f1", cv=4, error_score='raise')
grid.fit(X_train, y_train)

# evaluate cv
results = grid.cv_results_

print("Best params:", grid.best_params_)
print("Best f1:", grid.best_score_)

Best params: {'clf__ccp_alpha': np.float64(0.01), 'clf__criterion': 'gini', 'clf__max_depth': 7, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5}
Best f1: 0.6232993617525697


In [48]:
# General performance of best model
best_pipe = grid.best_estimator_
best_tokenizer = best_pipe.named_steps['tokenizer']
best_clf = best_pipe.named_steps['clf']

y_pred = best_clf.predict(best_tokenizer.transform(X_test))
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
print(f"Evaluation on test set for best unigram model:" )
print(f"f1: {f1:.4f}")
print(f"accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"precision: {p:.4f}")
print(f"recall: {r:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Evaluation on test set for best unigram model:
f1: 0.6731
accuracy: 0.5750
precision: 0.5469
recall: 0.8750

Confusion matrix:
 [[22 58]
 [10 70]]


In [49]:
# top 5 features
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.feature_importances_
indices = np.argsort(feature_importances)[-5:][::-1]

print("Top 5 features unigram model:")
for i in indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

Top 5 features unigram model:
location: 0.8265
hilton chicago: 0.1735
young: 0.0000
yes: 0.0000
yelling: 0.0000


This does look wrong.