In [None]:
import pandas as pd
import numpy as np
from preprocessing import * 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from tqdm import tqdm
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
# NOTE(review): this silences every warning for the whole session, including
# library deprecation and convergence warnings raised during the grid search.
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled sentences; column names are assigned in the preprocessing cell.
df1 = pd.read_csv("../data/all_labelled_17Oct.csv")

# Preprocess

In [None]:
# Name the four loaded columns, add a cleaned copy of every sentence, then keep
# only the rows that carry a carbon_class label and store that label as int.
df1.columns = ['index', 'sentence', 'relevance', 'carbon_class']
df1 = (
    df1.assign(cleaned_sentence=df1['sentence'].apply(clean_sentence))
    .loc[lambda frame: frame['carbon_class'].notnull()]
    .astype({'carbon_class': int})
)

# Split labelled set

In [None]:
# Shuffle once with a fixed seed, then cut the frame 60/20/20 into train/val/test.
# Positional slicing yields the same three partitions as np.split(frame, [a, b])
# without routing a DataFrame through NumPy's array-splitting code.
shuffled = df1.sample(frac=1, random_state=4103)
train_end = int(.6 * len(df1))
val_end = int(.8 * len(df1))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# train + val combined into one frame
trainval = pd.concat([train, val])
# Label series in the order: train, val, test, trainval.
labels = [train.carbon_class, val.carbon_class, test.carbon_class, trainval.carbon_class]

# Parameters

In [None]:
# Vectoriser settings explored by the search; every combination is expanded
# into one entry of vect_paramgrid.
vect_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_df=[0.25, 0.5, 1.0],
    min_df=[1, 10, 20],
)
vect_paramgrid = list(ParameterGrid(vect_params))

In [None]:
# Which vectoriser ('bow' / 'tfidf') and which text column ('clean' / 'raw')
# to evaluate; expanded into every type x processing pair.
vect_methods = {
    'type': ['bow', 'tfidf'],
    'processing': ['clean', 'raw'],
}
vect_methods_paramgrid = list(ParameterGrid(vect_methods))

In [None]:
# --- logistic regression ---
# NOTE(review): the string "none" for `penalty` is only accepted by older
# scikit-learn releases (newer ones expect None) -- confirm against the
# version this notebook is pinned to.
logreg_params = {
    "C": [0.1, 0.5, 1.0, 5],
    "solver": ["lbfgs", "newton-cg"],
    "penalty": ["l2", "none"],
    "class_weight": ["balanced", None],
}
logreg_paramgrid = list(ParameterGrid(logreg_params))

# --- naive bayes ---
# NOTE(review): alpha=0 disables smoothing; how scikit-learn treats it depends
# on the installed version -- verify it behaves as intended.
nb_params = {
    "alpha": [0, 0.001, 0.01, 0.1, 0.25, 0.5, 1],
}
nb_paramgrid = list(ParameterGrid(nb_params))

# --- svm ---
svm_params = {
    "C": [0.1, 0.5, 1, 5],
    "kernel": ["poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
    "class_weight": ["balanced", None],
}
svm_paramgrid = list(ParameterGrid(svm_params))

# rf
# "sqrt" replaces the former "auto" value for max_features: for
# RandomForestClassifier the two select the same number of features, but
# "auto" was deprecated and later removed from scikit-learn, so grids that
# use it fail on current releases.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# --- dummy (baseline) classifier: a single configuration ---
dummy_params = {
    "strategy": ["prior"],
}
dummy_paramgrid = list(ParameterGrid(dummy_params))

In [None]:
# --- CatBoost: iterations x depth x learning_rate ---
cb_params = {
    'iterations': [50, 100, 200],
    'depth': [1, 2, 5],
    'learning_rate': [0.01, 0.1, 0.5, 1],
}
cb_paramgrid = list(ParameterGrid(cb_params))

# Grid Search

In [None]:
def oversample_smote(X, y):
    """Resample ``(X, y)`` with SMOTE using a fixed random seed.

    Returns the resampled features and labels as a ``(X, y)`` pair.
    """
    resampler = SMOTE(random_state=4103)
    X_resampled, y_resampled = resampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def vectorize_helper(vect, sentence_version):
    """Vectorise one text column of every split and oversample the fit sets.

    ``vect`` is fitted on the ``train`` split only and then applied to
    ``val``, ``test`` and ``trainval``. The train and trainval matrices are
    passed through ``oversample_smote`` with their labels; val and test are
    returned untouched.

    Returns a 4-tuple ``((X_train, y_train), X_val, X_test, (X_trainval, y_trainval))``.
    """
    column = sentence_version
    fitted_train = vect.fit_transform(train[column])
    # Same transform order as the splits are listed: val, test, trainval.
    vec_val, vec_test, vec_trainval = (
        vect.transform(split[column]) for split in (val, test, trainval)
    )

    return (
        oversample_smote(fitted_train, labels[0]),
        vec_val,
        vec_test,
        oversample_smote(vec_trainval, labels[3]),
    )

In [None]:
def vectorize_dataset(vect_param):
    """Build every vectorised view of the data for one vectoriser setting.

    A fresh ``CountVectorizer`` ('bow') and ``TfidfVectorizer`` ('tfidf') is
    created from ``vect_param`` for both the raw ``sentence`` column and the
    ``cleaned_sentence`` column, and each is run through ``vectorize_helper``.

    Returns ``{'bow' | 'tfidf': {'clean' | 'raw': [train, val, test, trainval]}}``
    where each list holds the four values returned by ``vectorize_helper``.
    """
    vectorizer_classes = (('bow', CountVectorizer), ('tfidf', TfidfVectorizer))
    # Raw text is processed before cleaned text for each vectoriser type.
    text_columns = (('raw', 'sentence'), ('clean', 'cleaned_sentence'))

    dataset = {}
    for vect_name, vect_class in vectorizer_classes:
        views = {}
        for proc_name, column in text_columns:
            views[proc_name] = list(vectorize_helper(vect_class(**vect_param), column))
        dataset[vect_name] = {'clean': views['clean'], 'raw': views['raw']}
    return dataset

In [None]:
def _score_split(prefix, y_true, y_pred):
    """Return accuracy and F1 figures for one split, keyed as ``<prefix>_...``.

    Reads the weighted-average F1, the per-class F1 for classes "0".."4" and
    the accuracy from ``classification_report``. Raises ``KeyError`` if one of
    those classes is missing from the report.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    class_suffixes = (("0", "zero"), ("1", "one"), ("2", "two"), ("3", "three"), ("4", "four"))

    scores = {f"{prefix}_f1_weighted": report["weighted avg"]["f1-score"]}
    for class_key, suffix in class_suffixes:
        scores[f"{prefix}_f1_{suffix}"] = report[class_key]["f1-score"]
    scores[f"{prefix}_accuracy"] = report["accuracy"]
    return scores


def hyperparam_search(model, model_function, model_grid):
    """Evaluate every vectoriser / text-version / model-parameter combination.

    For each combination the estimator is fitted on the oversampled train set
    and scored on val, then a fresh estimator is fitted on the oversampled
    trainval set and scored on test.

    Parameters
    ----------
    model : label identifying the model family; stored in the "model" column
        of every result row.
    model_function : callable returning an unfitted estimator when called
        with one parameter dict from ``model_grid`` as keyword arguments.
    model_grid : iterable of parameter dicts to try.

    Returns
    -------
    pandas.DataFrame with one row per combination (vectoriser parameters,
    vectoriser choice, model parameters, val and test scores), sorted by
    ``val_f1_weighted`` then ``test_f1_weighted``, both descending.
    """
    gridsearch_results = []
    for vect_param in tqdm(vect_paramgrid):
        dataset = vectorize_dataset(vect_param)
        for vect_choice in vect_methods_paramgrid:
            splits = dataset[vect_choice['type']][vect_choice['processing']]
            # Train and trainval entries are (features, labels) pairs produced
            # by oversampling; val and test are plain feature matrices.
            (train_hp, train_label), val_hp, test_hp, (trainval_hp, trainval_label) = splits
            val_label = labels[1]
            test_label = labels[2]

            for model_param in model_grid:
                # Fit on train, score on val. The estimator gets its own name
                # so the `model` label passed by the caller is not overwritten.
                clf = model_function(**model_param)
                clf.fit(train_hp, train_label)
                val_scores = _score_split("val", val_label, clf.predict(val_hp))

                # Refit a fresh estimator on trainval, score on test.
                clf = model_function(**model_param)
                clf.fit(trainval_hp, trainval_label)
                test_scores = _score_split("test", test_label, clf.predict(test_hp))

                # Record the label rather than the fitted estimator, so the
                # result rows do not keep every trained model alive.
                results = {"model": model}
                results.update(vect_param)
                results.update(vect_choice)
                results.update(model_param)
                results.update(val_scores)
                results.update(test_scores)
                gridsearch_results.append(results)

    final_results = pd.DataFrame.from_records(gridsearch_results)
    final_results = final_results.sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
    return final_results

# Log Reg

In [None]:
# Search the logistic-regression grid and save the ranked results.
final_lr_results = hyperparam_search(
    model="log_reg",
    model_function=LogisticRegression,
    model_grid=logreg_paramgrid,
)
final_lr_results.to_csv("model_results/bowtfidf/logreg_smote.csv", index=False)

# Naive Bayes

In [None]:
# Search the multinomial naive Bayes grid and save the ranked results.
final_nb_results = hyperparam_search(
    model="nb",
    model_function=MultinomialNB,
    model_grid=nb_paramgrid,
)
final_nb_results.to_csv("model_results/bowtfidf/nb_smote.csv", index=False)

# SVM

In [None]:
# Search the SVM grid and save the ranked results.
final_svm_results = hyperparam_search(
    model="svm",
    model_function=SVC,
    model_grid=svm_paramgrid,
)
final_svm_results.to_csv("model_results/bowtfidf/svm_smote.csv", index=False)

# RF

In [None]:
# Search the random-forest grid and save the ranked results.
final_rf_results = hyperparam_search(
    model="rf",
    model_function=RandomForestClassifier,
    model_grid=rf_paramgrid,
)
final_rf_results.to_csv("model_results/bowtfidf/rf_smote.csv", index=False)

# CatBoost

In [None]:
# Search the CatBoost grid and save the ranked results.
final_cb_results = hyperparam_search(
    model="catboost",
    model_function=CatBoostClassifier,
    model_grid=cb_paramgrid,
)
final_cb_results.to_csv("model_results/bowtfidf/cb_smote.csv", index=False)

# Base Classifier

In [None]:
# Run the baseline (dummy) classifier through the same search and save the results.
# NOTE(review): the baseline is also fitted on the SMOTE-oversampled sets, although
# the output file name carries no "_smote" suffix -- confirm this is intended.
final_dummy_results = hyperparam_search(
    model="dummy",
    model_function=DummyClassifier,
    model_grid=dummy_paramgrid,
)
final_dummy_results.to_csv("model_results/bowtfidf/dummy.csv", index=False)