# Baseline Model

In this notebook, I train a Naive Bayes classifier with TF-IDF features as a baseline model.

Read the raw data.

In [23]:
import pandas as pd

# Load the Reddit posts CSV. The path is relative, so it is resolved against
# the kernel's current working directory.
raw_posts_path = "../data/raw/reddit_posts.csv"
df = pd.read_csv(raw_posts_path)

In [24]:
# Replace every missing value with an empty string so that the title/selftext
# columns can be concatenated and cleaned as plain strings in later cells.
# Reassigning (rather than `inplace=True`) keeps the step explicit and
# chain-friendly; the resulting frame is the same.
df = df.fillna("")

Convert labels to integers.

In [25]:
# Integer class id for each subreddit. `Series.map` turns any subreddit name
# that is missing from this mapping into NaN.
subreddit_to_label = {"datascience": 0, "statistics": 1, "MachineLearning": 2}
df["label"] = df["subreddit_name"].map(subreddit_to_label)

In [26]:
import re
import string


def process_text(text: str) -> str:
    """Normalize raw post text into lowercase, single-space-separated words.

    The steps run in this order:

    1. Remove every ``http`` followed by non-whitespace characters (URLs).
    2. Replace each character of ``string.punctuation`` with a space.
    3. Collapse every run of whitespace (including newlines) to one space.
    4. Strip leading and trailing whitespace.
    5. Lowercase the result.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing but URLs, punctuation
        and whitespace was present.
    """
    without_urls = re.sub(r"http\S+", "", text)
    # Translation table mapping each punctuation code point to a single space.
    punct_to_space = dict.fromkeys(map(ord, string.punctuation), " ")
    spaced = without_urls.translate(punct_to_space)
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip().lower()

Clean the text.

In [27]:
# Remove bracketed single-capital-letter tags (e.g. "[D]", "[R]") from titles.
df["title"] = df["title"].map(lambda title: re.sub(r"\[[A-Z]\]", "", title))
# The model input is the title followed by the post body, run through process_text.
df["text"] = (df["title"] + " " + df["selftext"]).map(process_text)
# Keep only the posts that still contain some text after cleaning.
df = df[df["text"] != ""]

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the posts for testing, stratified on `label` so both splits
# keep the same class proportions. A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
train_df, test_df = train_test_split(
    df, test_size=0.1, stratify=df["label"], random_state=42
)
X_train, y_train = train_df["text"], train_df["label"]
X_test, y_test = test_df["text"], test_df["label"]

Train the model and make predictions on the test set.

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Baseline: TF-IDF features (terms with a document frequency below 5 are
# ignored) fed into a multinomial Naive Bayes classifier.
model = make_pipeline(
    TfidfVectorizer(min_df=5),
    MultinomialNB()
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions. As the cell's last expression, this dict is
# what gets displayed as the cell output.
{
    "Precision": precision_score(y_test, y_pred, average="micro"),
    "Recall": recall_score(y_test, y_pred, average="micro"),
}

{'Precision': 0.8059701492537313, 'Recall': 0.8059701492537313}


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary on the training text only, then reuse it to
# transform the test text. The raw text is read from train_df/test_df rather
# than from X_train/X_test, because those two names are overwritten below:
# feeding X_train back into itself would make a second run of this cell try to
# vectorize an already-vectorized matrix.
# NOTE(review): the baseline pipeline cell uses min_df=5 -- confirm that the
# stricter min_df=10 here is intentional.
vectorizer = TfidfVectorizer(min_df=10)
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

In [31]:
def fit_and_evaluate(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``X_test``
    and ``y_test``, so they must be defined before this function is called.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Returns
    -------
    dict
        ``{"Precision": float, "Recall": float}``, both micro-averaged.
        For single-label multiclass targets, micro-averaged precision and
        recall are both equal to accuracy, so the two values coincide.
    """
    # Imported here so the function works on a fresh kernel; no other cell
    # brings these two names into scope.
    from sklearn.metrics import precision_score, recall_score

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Precision": precision_score(y_test, y_pred, average="micro"),
        "Recall": recall_score(y_test, y_pred, average="micro"),
    }

In [32]:
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support

# Fixed seed for the estimators that use randomness, so that their scores are
# repeatable from one run to the next.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used in the printed report.
# Each value is an estimator *instance*, ready to be fitted.
model_dict = {
    "multinomial-navie-bayes": MultinomialNB(),
    "logistic-regression": OneVsRestClassifier(LogisticRegression()),
    "k-nearest-neighbors": OneVsRestClassifier(KNeighborsClassifier()),
    "random-forest": OneVsRestClassifier(
        RandomForestClassifier(random_state=RANDOM_STATE)
    ),
    "gradient-boosting-machine": OneVsRestClassifier(
        GradientBoostingClassifier(random_state=RANDOM_STATE)
    ),
    "support-vector-machine": OneVsRestClassifier(
        LinearSVC(random_state=RANDOM_STATE)
    ),
}

# Fit and score every candidate on the same train/test split.
performance = {
    model_name: fit_and_evaluate(estimator)
    for model_name, estimator in model_dict.items()
}
print(json.dumps(performance, indent=2))

{
  "multinomial-navie-bayes": {
    "Precision": 0.8134328358208955,
    "Recall": 0.8134328358208955
  },
  "logistic-regression": {
    "Precision": 0.8097014925373134,
    "Recall": 0.8097014925373134
  },
  "k-nearest-neighbors": {
    "Precision": 0.7350746268656716,
    "Recall": 0.7350746268656716
  },
  "random-forest": {
    "Precision": 0.8097014925373134,
    "Recall": 0.8097014925373134
  },
  "gradient-boosting-machine": {
    "Precision": 0.7686567164179104,
    "Recall": 0.7686567164179104
  },
  "support-vector-machine": {
    "Precision": 0.8097014925373134,
    "Recall": 0.8097014925373134
  }
}
