# Baseline Models

This notebook assumes the data have been cleaned and split by the script `scripts/preprocess_data.py`.

Read the processed data.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the pre-split train and test tables from the processed-data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/processed/train.csv", "../data/processed/test.csv")
)

# Inputs come from the "text" column, targets from the "label" column.
X_train, X_test = train_df["text"], test_df["text"]
y_train, y_test = train_df["label"], test_df["label"]

Vectorize the text with TF-IDF, then train each baseline model and evaluate its predictions on the test set.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the training text only, then
# apply the same fitted transform to the test text.
# min_df=10 ignores terms that occur in fewer than 10 training documents.
vectorizer = TfidfVectorizer(min_df=10)

# Read the raw text from the dataframes rather than from X_train / X_test:
# this cell rebinds those two names to the vectorized matrices, so feeding
# them back in would make a second run of the cell operate on already
# vectorized data instead of text.
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

In [3]:
from sklearn.metrics import precision_score, recall_score

def fit_and_evaluate(model, average="micro"):
    """Fit one model on the training set and score it on the test set.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``X_test``
    and ``y_test``; ``model`` is fitted in place.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        average: Averaging strategy forwarded to ``precision_score`` and
            ``recall_score``. Defaults to ``"micro"``, the value that was
            previously hard-coded. Note that for single-label multiclass
            targets micro-averaged precision and recall are both equal to
            accuracy, so the two reported numbers coincide; pass ``"macro"``
            or ``"weighted"`` to obtain distinct values.

    Returns:
        dict: ``{"Precision": float, "Recall": float}`` computed on the
        test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Precision": precision_score(y_test, y_pred, average=average),
        "Recall": recall_score(y_test, y_pred, average=average),
    }

In [4]:
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Fixed seed for the estimators that use randomness, so that re-running this
# cell on a fresh kernel yields the same scores every time.
RANDOM_STATE = 42

# Baseline estimators keyed by the name used in the printed report.
# NOTE(review): "navie" in the first key is a typo for "naive"; the key is
# left unchanged here so the printed report keeps its existing labels.
model_dict = {
    "multinomial-navie-bayes": MultinomialNB(),
    "logistic-regression": LogisticRegression(),
    "k-nearest-neighbors": OneVsRestClassifier(KNeighborsClassifier()),
    "random-forest": OneVsRestClassifier(
        RandomForestClassifier(random_state=RANDOM_STATE)
    ),
    "gradient-boosting-machine": OneVsRestClassifier(
        GradientBoostingClassifier(random_state=RANDOM_STATE)
    ),
    "support-vector-machine": OneVsRestClassifier(
        LinearSVC(random_state=RANDOM_STATE)
    ),
}

# Fit and score every baseline, collecting the metrics under the model's name.
performance = {}
for model_name, model in model_dict.items():
    performance[model_name] = fit_and_evaluate(model)
print(json.dumps(performance, indent=2))

{
  "multinomial-navie-bayes": {
    "Precision": 0.7768595041322314,
    "Recall": 0.7768595041322314
  },
  "logistic-regression": {
    "Precision": 0.8471074380165289,
    "Recall": 0.8471074380165289
  },
  "k-nearest-neighbors": {
    "Precision": 0.756198347107438,
    "Recall": 0.756198347107438
  },
  "random-forest": {
    "Precision": 0.8099173553719008,
    "Recall": 0.8099173553719008
  },
  "gradient-boosting-machine": {
    "Precision": 0.8057851239669421,
    "Recall": 0.8057851239669421
  },
  "support-vector-machine": {
    "Precision": 0.8181818181818182,
    "Recall": 0.8181818181818182
  }
}
