In [None]:
import os
import re
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Split labelled set

In [None]:
# Load the labelled sentences and give the four columns fixed names.
df1 = pd.read_csv('../data/all_labelled_7Oct.csv')
df1.columns = ['index', 'sentence', 'relevance', 'carbon_class']

# Shuffle once with a fixed seed, then cut at 60% and 80% of the rows to get
# a 60/20/20 train/val/test split.  Positional slicing is used instead of
# np.split because np.split on a DataFrame goes through DataFrame.swapaxes,
# which is deprecated as of pandas 2.1.
shuffled = df1.sample(frac=1, random_state=4103)
train_end = int(.6 * len(df1))
val_end = int(.8 * len(df1))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# train + val together, used to refit before scoring on test.
trainval = pd.concat([train, val])

In [None]:
# Relevance targets in a fixed positional order:
# 0 = train, 1 = val, 2 = test, 3 = trainval.
labels = [split.relevance for split in (train, val, test, trainval)]

# Parameters

In [None]:
# Vectorizer settings to sweep; ParameterGrid expands them into one
# keyword-argument dict per combination.
vect_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_df=[0.25, 0.5, 1.0],
    min_df=[1, 10, 20],
)
vect_paramgrid = [*ParameterGrid(vect_params)]

In [None]:
# The two feature representations compared in the search.
vect_methods = dict(type=['bow', 'tfidf'])
vect_methods_paramgrid = [*ParameterGrid(vect_methods)]

In [None]:
# Logistic regression search space.
# NOTE(review): recent scikit-learn releases expect penalty=None rather than
# the string "none" -- confirm against the installed version.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = [*ParameterGrid(logreg_params)]

# Multinomial naive Bayes search space: only the smoothing parameter varies.
nb_params = dict(alpha=[0, 0.001, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = [*ParameterGrid(nb_params)]

# Support vector classifier search space.
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = [*ParameterGrid(svm_params)]

# Random forest search space.
# "auto" has been dropped from max_features: for RandomForestClassifier it
# was an alias of "sqrt" (so every configuration was trained twice), and
# newer scikit-learn releases no longer accept it.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# Baseline: a dummy classifier using the "prior" strategy only.
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = [*ParameterGrid(dummy_params)]

# Grid Search

In [None]:
def oversample(X, y):
    """Randomly oversample ``X``/``y`` with imblearn's RandomOverSampler.

    The sampler is seeded with 4103 so repeated calls on the same input give
    the same result.  Returns the resampled ``(X, y)`` pair.
    """
    sampler = RandomOverSampler(random_state=4103)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def vectorize_dataset(vect_param):
    """Build bag-of-words and TF-IDF features for every data split.

    For each representation, a vectorizer configured with ``vect_param`` is
    fitted on the training sentences only and then applied to the val, test
    and trainval sentences.  The train and trainval matrices are oversampled
    together with their relevance labels.

    Returns a dict keyed by 'bow' and 'tfidf'; each value is the list
    ``[(X_train, y_train), X_val, X_test, (X_trainval, y_trainval)]``.
    """
    vectorizer_classes = (('bow', CountVectorizer), ('tfidf', TfidfVectorizer))
    features = {}
    for key, vectorizer_cls in vectorizer_classes:
        vectorizer = vectorizer_cls(**vect_param)

        # The vectorizer is fitted on train alone; other splits are only transformed.
        train_matrix = vectorizer.fit_transform(train.sentence)
        val_matrix = vectorizer.transform(val.sentence)
        test_matrix = vectorizer.transform(test.sentence)
        trainval_matrix = vectorizer.transform(trainval.sentence)

        # oversample() hands back an (X, y) pair for the two fitting splits.
        features[key] = [
            oversample(train_matrix, labels[0]),
            val_matrix,
            test_matrix,
            oversample(trainval_matrix, labels[3]),
        ]
    return features

In [None]:
def _fit_and_score(model_function, model_param, X_fit, y_fit, X_eval, y_eval, prefix):
    """Fit a fresh estimator and return its metrics on the evaluation split, keyed with ``prefix``."""
    estimator = model_function(**model_param)
    estimator.fit(X_fit, y_fit)
    report = classification_report(y_eval, estimator.predict(X_eval), output_dict=True)
    return {
        prefix + "_f1_weighted": report["weighted avg"]["f1-score"],
        prefix + "_f1_zero": report["0"]["f1-score"],
        prefix + "_f1_one": report["1"]["f1-score"],
        prefix + "_accuracy": report["accuracy"],
    }


def hyperparam_search(model, model_function, model_grid):
    """Grid-search vectorizer settings, feature type and model hyperparameters.

    For every combination, an estimator is fitted on the oversampled train
    split and scored on val; a fresh estimator is then fitted on the
    oversampled trainval split and scored on test.

    Parameters
    ----------
    model : str
        Name of the model family, recorded in the "model" column.
    model_function : callable
        Estimator class or factory, called as ``model_function(**model_param)``.
    model_grid : iterable of dict
        Keyword-argument dicts to try for ``model_function``.

    Returns
    -------
    pandas.DataFrame
        One row per combination holding the model name, the vectorizer
        settings, the feature type, the model hyperparameters and the
        val/test metrics, sorted by ``val_f1_weighted`` then
        ``test_f1_weighted``, both descending.
    """
    gridsearch_results = []
    for vect_param in tqdm(vect_paramgrid):
        dataset = vectorize_dataset(vect_param)
        for vect_choice in vect_methods_paramgrid:
            # Fitting splits arrive as (X, y) pairs because they were oversampled.
            (train_hp, train_label), val_hp, test_hp, (trainval_hp, trainval_label) = \
                dataset[vect_choice['type']]
            val_label = labels[1]
            test_label = labels[2]

            for model_param in model_grid:
                # Record the model *name*; the estimator lives in its own
                # local inside _fit_and_score, so it can no longer overwrite
                # the ``model`` argument.
                results = {"model": model}
                results.update(vect_param)
                results.update(vect_choice)
                results.update(model_param)
                # fit on train, score on val
                results.update(_fit_and_score(model_function, model_param,
                                              train_hp, train_label,
                                              val_hp, val_label, "val"))
                # fit on trainval, score on test
                results.update(_fit_and_score(model_function, model_param,
                                              trainval_hp, trainval_label,
                                              test_hp, test_label, "test"))
                gridsearch_results.append(results)

    final_results = pd.DataFrame.from_records(gridsearch_results)
    # NOTE(review): ties on the validation score are broken by the test score,
    # so the test split influences the ranking.
    return final_results.sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)

# Logistic Regression

In [None]:
# Create the output folder before the long search so the CSV write cannot
# fail on a missing directory afterwards.
os.makedirs("model_results/bowtfidf", exist_ok=True)
final_lr_results = hyperparam_search("log_reg", LogisticRegression, logreg_paramgrid)
final_lr_results.to_csv("model_results/bowtfidf/logreg.csv", index=False)

# Naive Bayes

In [None]:
# Create the output folder before the long search so the CSV write cannot
# fail on a missing directory afterwards.
os.makedirs("model_results/bowtfidf", exist_ok=True)
final_nb_results = hyperparam_search("nb", MultinomialNB, nb_paramgrid)
final_nb_results.to_csv("model_results/bowtfidf/nb.csv", index=False)

# SVM

In [None]:
# Create the output folder before the long search so the CSV write cannot
# fail on a missing directory afterwards.
os.makedirs("model_results/bowtfidf", exist_ok=True)
final_svm_results = hyperparam_search("svm", SVC, svm_paramgrid)
final_svm_results.to_csv("model_results/bowtfidf/svm.csv", index=False)

# Random Forest

In [None]:
# Create the output folder before the long search so the CSV write cannot
# fail on a missing directory afterwards.
os.makedirs("model_results/bowtfidf", exist_ok=True)
final_rf_results = hyperparam_search("rf", RandomForestClassifier, rf_paramgrid)
final_rf_results.to_csv("model_results/bowtfidf/rf.csv", index=False)

# Baseline (Dummy) Classifier

In [None]:
# Create the output folder before the search so the CSV write cannot fail
# on a missing directory afterwards.
os.makedirs("model_results/bowtfidf", exist_ok=True)
final_dummy_results = hyperparam_search("dummy", DummyClassifier, dummy_paramgrid)
final_dummy_results.to_csv("model_results/bowtfidf/dummy.csv", index=False)