# 02 - ML TFIDF

In this notebook we show the code that extends our baseline models by adding TF-IDF features.
The book explains how TF-IDF (term frequency–inverse document frequency) is a simple vector representation of the content of a text block.
Prior to semantic embeddings from neural networks, this was a reliable and useful way to capture some elements of semantics for our machine learning models to work with.


The corresponding Python script for this notebook is:
* [CaseStudy_4.1_02-03b.py TFIDF](CaseStudy_4.1_02-03b.py)


In [None]:
import pandas as pd
import numpy as np

# BASELINE ML MODELS
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

# SUPPORT MODULES
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the case-study dataset from the local data folder
df = pd.read_csv(filepath_or_buffer="data/complete_with_features.csv")

# The models in this notebook are fed only this column
features = ["text"]

In [None]:
# Split on the RANDOM column: values below 0.8 form the training set,
# values of 0.8 and above form the test set.
in_train = df["RANDOM"] < 0.8
in_test = df["RANDOM"] >= 0.8
train = df[in_train]
test = df[in_test]

# Inputs keep the DataFrame shape (list selector); the target is the "generated" column
X_train, y_train = train[features], train["generated"]
X_test, y_test = test[features], test["generated"]

In [None]:
# The three classifiers that are trained on the same TF-IDF features
nb = ComplementNB()
lr = LogisticRegression(random_state=0)
# Extra-Trees builds randomized trees; without a seed every run produces a
# different model and different metrics. Seed it like the logistic regression
# so the notebook is reproducible on a fresh kernel.
xt = ExtraTreesClassifier(random_state=0)

# Turn text into TF-IDF vectors limited to 200 terms, ignoring English stop words
tfidf = make_pipeline(
   TfidfVectorizer(max_features=200, stop_words='english')
)

# The column is given as a plain string (not a list) so the vectorizer
# receives a 1-D column of documents rather than a 2-D frame.
preprocessor = ColumnTransformer(
    transformers=[
         ("text", tfidf, 'text'),
    ]
)

Transform the text column into TF-IDF features.
The transformer is fitted on the training data only, so the test data does not influence the fitting of the transformation process.

In [None]:
# Fit the preprocessor on X_train and keep the transformed training matrix;
# the same fitted preprocessor is reused later inside the model pipelines.
feats = preprocessor.fit_transform(X_train)

In [None]:
# Train each classifier directly on the TF-IDF matrix computed above,
# so the text only has to be vectorised once.
for estimator in (nb, lr, xt):
    estimator.fit(feats, y_train)

# Bundle the already-fitted preprocessor with each already-fitted classifier
# so the resulting models accept the raw test frame.
nb_model = Pipeline([('tfidf', preprocessor), ('nb', nb)])
lr_model = Pipeline([('tfidf', preprocessor), ('lr', lr)])
xt_model = Pipeline([('tfidf', preprocessor), ('xt', xt)])

In [None]:
## Compile a results dataset

# Start with an empty table, but give the metric columns a float dtype.
# An all-object empty frame makes the first pd.concat emit a FutureWarning
# (pandas >= 2.1) and, once that deprecation is enforced, turns the metric
# columns into object dtype, which DataFrame.round() silently skips.
results = pd.DataFrame(columns=["Model", "AUC", "Precision", "Recall"]).astype(
    {"AUC": "float64", "Precision": "float64", "Recall": "float64"}
)

In [None]:
# Score the Naive Bayes pipeline on the test rows
nb_labels = nb_model.predict(X_test)
nb_scores = nb_model.predict_proba(X_test)[:, 1]  # second predict_proba column

nb_row = pd.DataFrame([{
    "Model": "NaiveBayes (TFIDF)",
    "AUC": roc_auc_score(y_test, nb_scores),
    "Precision": precision_score(y_test, nb_labels),
    "Recall": recall_score(y_test, nb_labels),
}])
results = pd.concat([results, nb_row], ignore_index=True)

In [None]:
# Score the Logistic Regression pipeline on the test rows
lr_labels = lr_model.predict(X_test)
lr_scores = lr_model.predict_proba(X_test)[:, 1]  # second predict_proba column

lr_row = pd.DataFrame([{
    "Model": "Logistic Regression (TFIDF)",
    "AUC": roc_auc_score(y_test, lr_scores),
    "Precision": precision_score(y_test, lr_labels),
    "Recall": recall_score(y_test, lr_labels),
}])
results = pd.concat([results, lr_row], ignore_index=True)

In [None]:
# Score the Extra Trees pipeline on the test rows
xt_labels = xt_model.predict(X_test)
xt_scores = xt_model.predict_proba(X_test)[:, 1]  # second predict_proba column

xt_row = pd.DataFrame([{
    "Model": "Extra Trees (TFIDF)",
    "AUC": roc_auc_score(y_test, xt_scores),
    "Precision": precision_score(y_test, xt_labels),
    "Recall": recall_score(y_test, xt_labels),
}])
results = pd.concat([results, xt_row], ignore_index=True)

In [15]:
# Keep three decimals for the numeric metric columns
results = results.round(decimals=3)

# Render the results table as Markdown text and print it
markdown_table = results.to_markdown(index=False)
print(markdown_table)

| Model                       |   AUC |   Precision |   Recall |
|:----------------------------|------:|------------:|---------:|
| NaiveBayes (TFIDF)          | 0.935 |       0.856 |    0.808 |
| Logistic Regression (TFIDF) | 0.984 |       0.944 |    0.908 |
| Extra Trees (TFIDF)         | 0.996 |       0.982 |    0.95  |
