# Training a Multinomial Naive Bayes Classifier. 

In [None]:
# general
import pickle
import os
import warnings
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm

# preprocessing
import nltk
from nltk.corpus import stopwords
import re

# model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# mlflow
import mlflow
import mlflow.sklearn

# evaluation
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [22]:
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the English stop words to drop, loading them from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # 'not' and 'can' are deliberately kept in the text, so they are
        # excluded from the removal set.
        _STOP_WORDS_CACHE = (
            frozenset(stopwords.words('english')) - {'not', 'can'}
        )
    return _STOP_WORDS_CACHE


def text_preprocessing(text):
    """Normalize a raw review string before vectorization.

    Steps, in order:
    - Lowercase the text
    - Replace "'t" with " not" (e.g. "isn't" -> "isn not")
    - Isolate punctuation, then replace every character that is not a word
      character, whitespace or "?" with a space (this strips the "@" of an
      "@name" mention but keeps the name itself)
    - Replace a few remaining special characters with a space
    - Remove English stop words except "not" and "can"
    - Collapse runs of whitespace and strip the ends

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    text = text.lower()
    # Change 't to 'not'
    text = re.sub(r"\'t", " not", text)
    # Isolate and remove punctuations except '?'
    text = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', text)
    text = re.sub(r'[^\w\s\?]', ' ', text)
    # Remove some special characters
    text = re.sub(r'([\;\:\|•«\n])', ' ', text)
    # Remove stopwords except 'not' and 'can'. The stop-word set is built once
    # instead of calling stopwords.words() (a list) for every single token.
    stop_words = _get_stop_words()
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def eval_metrics(actual, pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels.

    Returns:
        A tuple ``(accuracy, macro_f1)``.
    """
    acc = accuracy_score(actual, pred)
    # The result must not be bound to the name `f1_score`: assigning to that
    # name inside this function makes it local, so the call to the sklearn
    # function of the same name would raise UnboundLocalError.
    macro_f1 = f1_score(actual, pred, average='macro')

    return acc, macro_f1

def get_roc_auc(actual, pred):
    """Return the macro-averaged, one-vs-rest ROC AUC.

    Args:
        actual: Ground-truth labels.
        pred: Scores passed straight to ``roc_auc_score``; ``train`` supplies
            the classifier's ``predict_proba`` output here.

    Returns:
        The AUC value computed by ``roc_auc_score``.
    """
    scoring_options = {'average': 'macro', 'multi_class': 'ovr'}
    return roc_auc_score(actual, pred, **scoring_options)

def train(X_train='processed_data/X_train.csv',
          y_train='processed_data/y_train.csv',
          X_val='processed_data/X_val.csv',
          y_val='processed_data/y_val.csv',
          alpha=1):
    """Train a Multinomial Naive Bayes text classifier and log it to MLflow.

    The function reads the train/validation CSV files, cleans the
    ``review_full`` column with ``text_preprocessing``, builds binary TF-IDF
    features over 1- to 3-grams, fits ``MultinomialNB`` and evaluates it on
    the validation split. Inside a single MLflow run it logs the ``alpha``
    parameter, the ``acc``, ``F1`` and ``AUC`` metrics, and the fitted model
    under the artifact path ``"model"``.

    Args:
        X_train: Path to the training features CSV (needs a ``review_full``
            column).
        y_train: Path to the training labels CSV.
        X_val: Path to the validation features CSV (needs a ``review_full``
            column).
        y_val: Path to the validation labels CSV.
        alpha: Smoothing parameter passed to ``MultinomialNB``.

    Returns:
        None. Results are printed and logged to MLflow.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised while loading any of
            the four datasets; it is logged and then re-raised.
    """
    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read in data
    try:
        train_X = pd.read_csv(X_train)
        train_y = pd.read_csv(y_train)
        val_X = pd.read_csv(X_val)
        val_y = pd.read_csv(y_val)
    except Exception as e:
        logger.exception(
            "Unable to load datasets. Error: %s", e)
        # Without re-raising, execution would continue and fail later with a
        # misleading NameError on the unassigned data frames.
        raise

    # scikit-learn expects 1-D label arrays rather than (n, 1) column vectors.
    # NOTE(review): assumes each label CSV holds a single column - confirm.
    train_labels = np.ravel(train_y)
    val_labels = np.ravel(val_y)

    # preprocess data
    tqdm.pandas(desc="Preprocessing.")
    train_X_preprocessed = train_X.review_full.progress_apply(text_preprocessing)
    val_X_preprocessed = val_X.review_full.progress_apply(text_preprocessing)

    # calculate tfidf vectors; the vocabulary is fitted on the training split
    # only and reused for the validation split
    print("Calculating TFIDF vectors.")
    tf_idf = TfidfVectorizer(ngram_range=(1, 3),
                             binary=True,
                             smooth_idf=False)
    train_X_tfidf = tf_idf.fit_transform(train_X_preprocessed)
    val_X_tfidf = tf_idf.transform(val_X_preprocessed)

    # train model and log metrics into MLflow
    with mlflow.start_run():
        clf = MultinomialNB(alpha=alpha)

        print("Fitting model.")
        clf.fit(train_X_tfidf, train_labels)

        # Evaluate Metrics
        predicted_labels = clf.predict(val_X_tfidf)
        predicted_probas = clf.predict_proba(val_X_tfidf)
        acc, f1 = eval_metrics(val_labels, predicted_labels)
        auc = get_roc_auc(val_labels, predicted_probas)

        # Print out metrics
        print("MNB model (alpha=%f):" % (alpha))
        print("  ACC: %s" % acc)
        print("  F1: %s" % f1)
        print("  AUC: %s" % auc)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("F1", f1)
        mlflow.log_metric("AUC", auc)

        mlflow.sklearn.log_model(clf, "model")

In [23]:
# `train` has no `load_pre_process` parameter; passing it raises TypeError.
train(alpha=1)

Loading preprocessed data.
Calculating TFIDF vectors.
Fitting model.
MNB model (alpha=1.000000):
  ACC: 0.6486291801274775
  F1: 0.6507643257219232
  AUC: 0.9145828781938642
