In [51]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [52]:
DATA_DIR = Path("op_spam_v1.4")
neg_dir = DATA_DIR / "negative_polarity"

sources = {
    "deceptive_from_MTurk": 1, 
    "truthful_from_Web": 0
}

In [53]:
rows = []
for src, y in sources.items():
    for fold_name in sorted((neg_dir / src).rglob("fold*")):
        fold_id = int(fold_name.name[-1])
        for fp in fold_name.rglob("*.txt"):
            txt = Path(fp).read_text(encoding="utf-8", errors="ignore")
            rows.append({"text": txt, "label": y, "fold": fold_id, "path": str(fp)})

df = pd.DataFrame(rows).sample(frac=1.0, random_state=42).reset_index(drop=True)

train_mask = df["fold"].isin([1,2,3,4])
test_mask  = df["fold"] == 5
X_train, y_train, g_train = df.loc[train_mask, "text"], df.loc[train_mask, "label"], df.loc[train_mask, "fold"]
X_test,  y_test           = df.loc[test_mask,  "text"], df.loc[test_mask,  "label"]


In [54]:
# create unigrams
unigram_tokenizer = CountVectorizer(ngram_range=(1,1))
X_train_uni = unigram_tokenizer.fit_transform(X_train) 

# print some unigram information
features = unigram_tokenizer.get_feature_names_out()
counts = X_train_uni.sum(axis=0).A1
feature_counts = pd.DataFrame({
    'feature': features,
    'count': counts
}).sort_values(by='count', ascending=False)
print(feature_counts)
print("Number of features (vocabulary size):", X_train_uni.shape[1])
print("Number of features with 1 occurrence:", np.sum(counts == 1))
print("Number of features with counts <5:", np.sum(counts < 5))

       feature  count
6111       the   7542
6209        to   3525
426        and   3340
6669       was   2812
3173        in   1945
...        ...    ...
2177   enemies      1
2176  enduring      1
6878       yrs      1
6877     youth      1
6875     yours      1

[6885 rows x 2 columns]
Number of features (vocabulary size): 6885
Number of features with 1 occurrence: 3208
Number of features with counts <5: 5037


The most common features appear to be mostly uninformative stop words (the, to, in, etc). Furthermore, since more than 70% of the features have low counts (<5), and almost 50% have a count of exactly 1, we decide to prune the vocabulary using the following methods:
- Removing common stopwords
- Removing all features with frequency >90%
- Removing all features with counts <5

In [55]:
# create new unigram
max_count = 0.9
min_count = 5
stop_words = 'english'
unigram_tokenizer = CountVectorizer(ngram_range=(1,1), max_df=max_count,  min_df=min_count, stop_words=stop_words)
X_train_uni = unigram_tokenizer.fit_transform(X_train) 

# print some unigram information again
features = unigram_tokenizer.get_feature_names_out()
counts = X_train_uni.sum(axis=0).A1
feature_counts = pd.DataFrame({
    'feature': features,
    'count': counts
}).sort_values(by='count', ascending=False)
print(feature_counts)
print("Number of features (vocabulary size):", X_train_uni.shape[1])

       feature  count
1150      room   1490
676      hotel   1378
248    chicago    535
1294      stay    531
1190   service    387
...        ...    ...
894      nasty      5
1477    washed      5
86      appear      5
841   meantime      5
884       mood      5

[1535 rows x 2 columns]
Number of features (vocabulary size): 1535


## Unigram model

In [56]:
## UNIGRAM MODEL
## 

# set up pipeline for CV
lr_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,1))),
    ("clf", LogisticRegression())
])

# define hyperparams
C_list = np.logspace(-4, 2, 7)
lr_param_grid = {
    "clf__C": C_list 
}

# perform cv
grid = GridSearchCV(lr_pipe, lr_param_grid, scoring="f1", cv=4)
grid.fit(X_train, y_train)


# evaluate cv
results = grid.cv_results_

# print unigram results
print("f1 | C\n")
print("Results for unigram models:")
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"{mean_score:.4f}: {params['clf__C']}")

print(f"Best unigram model params: {grid.best_params_}")

f1 | C

Results for unigram models:
0.6875: 0.0001
0.7195: 0.001
0.8262: 0.01
0.8462: 0.1
0.8455: 1.0
0.8417: 10.0
0.8402: 100.0
Best unigram model params: {'clf__C': np.float64(0.1)}


In [57]:
# General performance of best model
best_pipe = grid.best_estimator_
best_tokenizer = best_pipe.named_steps['tokenizer']
best_clf = best_pipe.named_steps['clf']

y_pred = best_clf.predict(best_tokenizer.transform(X_test))
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
print(f"Evaluation on test set for best unigram model:" )
print(f"f1: {f1:.4f}")
print(f"accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"precision: {p:.4f}")
print(f"recall: {r:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Evaluation on test set for best unigram model:
f1: 0.8312
accuracy: 0.8375
precision: 0.8649
recall: 0.8000

Confusion matrix:
 [[70 10]
 [16 64]]


In [58]:
# top 5 features
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.coef_[0]
indices = np.argsort(np.abs(feature_importances))[-5:][::-1] # sort top 5 by absolute value of feature weight

print("Top 5 features unigram model:")
for i in indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

Top 5 features unigram model:
chicago: 0.7273
location: -0.4487
star: -0.4486
finally: 0.4292
luxury: 0.4211


## Unigram + bigram model

In [59]:
## UNIGRAM+BIGRAM MODEL
## 

# set up pipeline for CV
lr_pipe = Pipeline([
    ("tokenizer", CountVectorizer(max_df=0.9,  min_df=5, stop_words='english', ngram_range=(1,2))), # <-- unigram+bigram
    ("clf", LogisticRegression())
])

# define hyperparams
C_list = np.logspace(-4, 2, 7)
lr_param_grid = {
    "clf__C": C_list
}

# perform cv
grid = GridSearchCV(lr_pipe, lr_param_grid, scoring="f1", cv=4)
grid.fit(X_train, y_train)

# evaluate cv
results = grid.cv_results_

# print unigram results
print("f1 | C\n")
print("Results for unigram+bigram models:")
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"{mean_score:.4f}: {params['clf__C']}")

print(f"Best unigram+bigram model params: {grid.best_params_}")

f1 | C

Results for unigram+bigram models:
0.6821: 0.0001
0.7278: 0.001
0.8351: 0.01
0.8540: 0.1
0.8517: 1.0
0.8548: 10.0
0.8438: 100.0
Best unigram+bigram model params: {'clf__C': np.float64(10.0)}


In [60]:
# General performance of best model
best_pipe = grid.best_estimator_
best_tokenizer = best_pipe.named_steps['tokenizer']
best_clf = best_pipe.named_steps['clf']

y_pred = best_clf.predict(best_tokenizer.transform(X_test))
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
print(f"Evaluation on test set for best unigram model:" )
print(f"f1: {f1:.4f}")
print(f"accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"precision: {p:.4f}")
print(f"recall: {r:.4f}")

print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Evaluation on test set for best unigram model:
f1: 0.8182
accuracy: 0.8250
precision: 0.8514
recall: 0.7875

Confusion matrix:
 [[69 11]
 [17 63]]


In [61]:
feature_names = best_tokenizer.get_feature_names_out()
feature_importances = best_clf.coef_[0]
indices = np.argsort(np.abs(feature_importances))[-5:][::-1] # sort top 5 by absolute value of feature weight

print("Top 5 features unigram model:")
for i in indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

Top 5 features unigram model:
star: -1.4659
chicago: 1.4299
recently: 1.2207
concierge: -1.2200
location: -1.2193
