In [None]:
import pandas as pd
import numpy as np
from preprocessing import * 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from tqdm import tqdm
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import ParameterGrid

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from bert_serving.client import BertClient

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled sentences; the path is relative to the notebook's working directory.
df1 = pd.read_csv('../data/labelled_data/all_labelled_17Oct.csv')

In [None]:
def remove_punc(s):
    """Strip ASCII punctuation from ``s`` while keeping '$', '&' and '%'.

    Args:
        s: Text to clean; it is scanned character by character.

    Returns:
        str: ``s`` without any character of ``string.punctuation`` other
        than the three kept symbols.
    """
    import string

    kept_symbols = {'$', '&', '%'}
    # Build the drop-set once instead of once per character.
    to_drop = frozenset(string.punctuation) - kept_symbols
    return ''.join(filter(lambda ch: ch not in to_drop, s))

In [None]:
# Rename the columns positionally; this requires the loaded frame to have exactly four columns.
df1.columns = ['index', 'sentence', 'relevance', 'carbon_class']
# Two derived text variants per row. `clean_sentence` is not defined in this notebook
# (presumably it comes from `from preprocessing import *` — confirm); `remove_punc` is.
df1['cleaned_sentence'] = df1['sentence'].apply(clean_sentence)
df1['sentence_no_punc'] = df1['sentence'].map(remove_punc)

In [None]:
# Split labelled set: shuffle once with a fixed seed, then cut the rows
# 60% / 20% / 20% into train / validation / test.
# Positional slicing replaces `np.split(<DataFrame>, ...)`, which goes through
# the deprecated `DataFrame.swapaxes`; the resulting frames are the same.
df1_shuffled = df1.sample(frac=1, random_state=4103)
train_end, val_end = int(.6 * len(df1)), int(.8 * len(df1))
train = df1_shuffled.iloc[:train_end]
val = df1_shuffled.iloc[train_end:val_end]
test = df1_shuffled.iloc[val_end:]
trainval = pd.concat([train, val])
# Position matters: later cells index this list as
# 0 = train, 1 = val, 2 = test, 3 = train + val.
labels = [train.relevance, val.relevance, test.relevance, trainval.relevance]

# Initialise Best Models & Vectorizers

In [None]:
def _fixed_grid(**settings):
    """Wrap one fixed parameter setting as a one-element ``list(ParameterGrid(...))``."""
    return list(ParameterGrid({name: [value] for name, value in settings.items()}))


# Every model entry below has the same layout:
# [model class, model params, vectorizer class, vectorizer params, text column name]

# --- Logistic regression ---
LR_MODEL = LogisticRegression
BEST_LR = _fixed_grid(C=1, class_weight='balanced', penalty='l2', solver='lbfgs')
LR_VECT = TfidfVectorizer
BEST_LR_VECT = _fixed_grid(
    analyzer="word", lowercase=True, ngram_range=(1, 1), max_df=0.25, min_df=1
)
BEST_LR_SENTENCE = 'cleaned_sentence'
LR = [LR_MODEL, BEST_LR, LR_VECT, BEST_LR_VECT, BEST_LR_SENTENCE]

# --- SVM (hard-voting variant, and soft-voting variant with probability=True) ---
SVM_MODEL = SVC
_svm_settings = dict(C=0.5, kernel="sigmoid", gamma="scale", class_weight='balanced')
BEST_SVM_HARD = _fixed_grid(**_svm_settings)
BEST_SVM_SOFT = _fixed_grid(probability=True, **_svm_settings)
SVM_VECT = TfidfVectorizer
BEST_SVM_VECT = _fixed_grid(
    analyzer="word", lowercase=True, ngram_range=(1, 2), max_df=0.25, min_df=1
)
BEST_SVM_SENTENCE = 'cleaned_sentence'
SVM_HARD = [SVM_MODEL, BEST_SVM_HARD, SVM_VECT, BEST_SVM_VECT, BEST_SVM_SENTENCE]
SVM_SOFT = [SVM_MODEL, BEST_SVM_SOFT, SVM_VECT, BEST_SVM_VECT, BEST_SVM_SENTENCE]

# --- Multinomial naive Bayes (count features on the raw sentence) ---
NB_MODEL = MultinomialNB
BEST_NB = _fixed_grid(alpha=0.5)
NB_VECT = CountVectorizer
BEST_NB_VECT = _fixed_grid(
    analyzer="word", lowercase=True, ngram_range=(1, 1), max_df=0.25, min_df=1
)
BEST_NB_SENTENCE = 'sentence'
NB = [NB_MODEL, BEST_NB, NB_VECT, BEST_NB_VECT, BEST_NB_SENTENCE]

# --- Random forest ---
RF_MODEL = RandomForestClassifier
BEST_RF = _fixed_grid(
    criterion="entropy",
    min_samples_split=5,
    class_weight='balanced',
    max_features="log2",
    min_samples_leaf=2,
)
RF_VECT = TfidfVectorizer
BEST_RF_VECT = _fixed_grid(
    analyzer="word", lowercase=True, ngram_range=(1, 2), max_df=0.25, min_df=10
)
BEST_RF_SENTENCE = 'cleaned_sentence'
RF = [RF_MODEL, BEST_RF, RF_VECT, BEST_RF_VECT, BEST_RF_SENTENCE]

In [None]:
# Member configurations for each voting scheme. The soft-voting ensemble is the
# hard-voting one with the SVM entry swapped for its probability=True counterpart.
ensemble_hard = [LR, SVM_HARD, NB, RF]
ensemble_soft = [SVM_SOFT if member is SVM_HARD else member for member in ensemble_hard]

In [None]:
def oversample_smote(X,y):
    """Oversample ``(X, y)`` with SMOTE using a fixed random seed.

    Args:
        X: Feature matrix passed to ``SMOTE.fit_resample``.
        y: Labels passed alongside ``X``.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    sampler = SMOTE(random_state=4103)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

def vectorize_helper(vect, sentence_version):
    """Vectorize one text column of every split and oversample the training sets.

    Reads the module-level ``train``, ``val``, ``test`` and ``trainval`` frames
    and the ``labels`` list.

    Args:
        vect: Vectorizer instance; it is fitted on ``train`` only and then
            reused to transform the other splits, ``trainval`` included.
        sentence_version: Name of the text column to vectorize.

    Returns:
        tuple: ``(train_os, vec_val, vec_test, trainval_os)`` where the two
        ``*_os`` items are the ``(X, y)`` pairs returned by ``oversample_smote``.
    """
    # Fit first: every later transform uses what the vectorizer learned here.
    vec_train = vect.fit_transform(train[sentence_version])
    vec_val, vec_test, vec_trainval = (
        vect.transform(split[sentence_version]) for split in (val, test, trainval)
    )

    train_os = oversample_smote(vec_train, labels[0])
    trainval_os = oversample_smote(vec_trainval, labels[3])
    return train_os, vec_val, vec_test, trainval_os

In [None]:
# BERT
# Logistic regression on sentence embeddings returned by the BertClient
# (the client needs a reachable bert-serving server — presumably started separately).
LR_BERT = LogisticRegression
BEST_LR_BERT = list(ParameterGrid({
    'C': [0.1],
    'class_weight': ['balanced'],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
}))
LR_BERT_VECT = BertClient(check_length=False)
bert_train = LR_BERT_VECT.encode(train['sentence_no_punc'].tolist())
bert_val = LR_BERT_VECT.encode(val['sentence_no_punc'].tolist())

# Oversample the training embeddings only; validation stays untouched.
bert_train_smote, bert_train_smote_y = oversample_smote(bert_train, train['relevance'])

LR_BERT_MODEL = LR_BERT(**BEST_LR_BERT[0])
LR_BERT_MODEL.fit(bert_train_smote, bert_train_smote_y)
# Hard labels feed the hard-voting ensemble, class probabilities the soft-voting one.
LR_BERT_VAL_PRED = LR_BERT_MODEL.predict(bert_val)
LR_BERT_VAL_PRED_SOFT = LR_BERT_MODEL.predict_proba(bert_val)


# Hard Voting

In [None]:
def generate_ensemble_pred_hard(ensemble):
    """Collect every model's hard (class-label) prediction on the validation split.

    Each member is fitted on the SMOTE-oversampled training vectors returned by
    ``vectorize_helper`` and then predicts the validation vectors. The
    module-level ``LR_BERT_VAL_PRED`` is used as the first column.

    Args:
        ensemble: Iterable of ``[model_cls, model_params, vect_cls,
            vect_params, sentence_column]`` entries. ``model_params`` and
            ``vect_params`` are sequences of keyword dicts; when one holds
            several dicts the last one is used (unchanged behaviour).

    Returns:
        DataFrame: One row per validation sample with columns
        ``['lr_bert', 'lr', 'svm', 'nb', 'rf']``.

    Raises:
        ValueError: If a member has an empty parameter list, or if the
            ensemble does not contain exactly four members (pandas rejects
            the column assignment).
    """
    member_preds = [LR_BERT_VAL_PRED]
    for model_fn, model_param, vect_fn, vect_param, sentence_ver in ensemble:
        vect_settings = list(vect_param)
        model_settings = list(model_param)
        # Previously an empty list left `vect` / `model_tv` unbound, or silently
        # reused the object built for the previous member; fail loudly instead.
        if not vect_settings or not model_settings:
            raise ValueError(
                "each ensemble member needs at least one vectorizer and one model parameter set"
            )

        vect = vect_fn(**vect_settings[-1])
        # Only the oversampled training pair and the validation matrix are needed here.
        # Distinct local names keep the module-level `val` / `test` frames unshadowed.
        (X_train_os, y_train_os), X_val, _, _ = vectorize_helper(vect, sentence_ver)

        model = model_fn(**model_settings[-1])
        model.fit(X_train_os, y_train_os)
        member_preds.append(model.predict(X_val))

    # Each collected array becomes a row; transpose so that models are columns.
    val_pred = pd.DataFrame(member_preds).T
    val_pred.columns = ['lr_bert', 'lr', 'svm', 'nb', 'rf']
    return val_pred

def get_majority_pred_hard(df):
    """Return the majority-vote label of every row of ``df``.

    Args:
        df: DataFrame with one row per sample and one column per model's
            predicted label.

    Returns:
        list: The most frequent value of each row, in row order. On a tie the
        winner is the first tied value met when iterating the ``set`` of the
        row's distinct values.
    """
    def _most_voted(votes):
        return max(set(votes), key=votes.count)

    return [_most_voted(list(row)) for _, row in df.iterrows()]

In [None]:
# Per-model validation predictions (one column per model), then the row-wise majority vote.
hard_df = generate_ensemble_pred_hard(ensemble_hard)
hard_pred = get_majority_pred_hard(hard_df)

In [None]:
# Validation metrics of the hard-voting ensemble (labels[1] is val.relevance).
classification_report(labels[1], hard_pred, output_dict=True)

# Soft Voting

In [None]:
def generate_ensemble_pred_soft(ensemble):
    """Collect every model's class probabilities on the validation split.

    Each member is fitted on the SMOTE-oversampled training vectors returned by
    ``vectorize_helper`` and then calls ``predict_proba`` on the validation
    vectors. The module-level ``LR_BERT_VAL_PRED_SOFT`` supplies the first two
    columns.

    Args:
        ensemble: Iterable of ``[model_cls, model_params, vect_cls,
            vect_params, sentence_column]`` entries. ``model_params`` and
            ``vect_params`` are sequences of keyword dicts; when one holds
            several dicts the last one is used (unchanged behaviour).

    Returns:
        DataFrame: One row per validation sample and two columns per model,
        named ``<model>_0`` / ``<model>_1`` in ``predict_proba`` column order
        (assumes binary labels whose sorted order is 0 then 1 — confirm).

    Raises:
        ValueError: If a member has an empty parameter list, or if the
            collected probabilities do not amount to exactly ten columns
            (pandas rejects the column assignment).
    """
    member_probs = [LR_BERT_VAL_PRED_SOFT]
    for model_fn, model_param, vect_fn, vect_param, sentence_ver in ensemble:
        vect_settings = list(vect_param)
        model_settings = list(model_param)
        # Previously an empty list left `vect` / `model_tv` unbound, or silently
        # reused the object built for the previous member; fail loudly instead.
        if not vect_settings or not model_settings:
            raise ValueError(
                "each ensemble member needs at least one vectorizer and one model parameter set"
            )

        vect = vect_fn(**vect_settings[-1])
        # Only the oversampled training pair and the validation matrix are needed here.
        # Distinct local names keep the module-level `val` / `test` frames unshadowed.
        (X_train_os, y_train_os), X_val, _, _ = vectorize_helper(vect, sentence_ver)

        model = model_fn(**model_settings[-1])
        model.fit(X_train_os, y_train_os)
        member_probs.append(model.predict_proba(X_val))

    # Concatenate every collected block instead of a hard-coded range(0, 5), so an
    # unexpected member count surfaces as a column-length error rather than
    # being silently truncated.
    val_pred = pd.concat([pd.DataFrame(probs) for probs in member_probs], axis=1)
    cols = ['bert_lr_0', 'bert_lr_1',
            'lr_0', 'lr_1',
            'svm_0', 'svm_1',
            'nb_0', 'nb_1',
            'rf_0', 'rf_1'
           ]
    val_pred.columns = cols
    return val_pred

def sum_probs(df):
    """Sum the five models' probabilities per class.

    Note:
        ``df`` is modified in place: the columns '0_total' and '1_total' are
        added to it (or overwritten).

    Args:
        df: DataFrame holding the ``<model>_0`` / ``<model>_1`` probability
            columns of the five models.

    Returns:
        DataFrame: The two columns ``['0_total', '1_total']`` of ``df``.
    """
    totals = {
        '0_total': ['bert_lr_0', 'lr_0', 'svm_0', 'nb_0', 'rf_0'],
        '1_total': ['bert_lr_1', 'lr_1', 'svm_1', 'nb_1', 'rf_1'],
    }
    for total_col, member_cols in totals.items():
        first, *rest = member_cols
        # Add left to right, in the listed column order.
        running = df[first]
        for col in rest:
            running = running + df[col]
        df[total_col] = running
    return df[list(totals)]

def get_majority_pred_soft(df):
    """Return, for every row, the position of its largest value.

    Args:
        df: DataFrame with one row per sample; column position ``i`` holds
            the score of class ``i``.

    Returns:
        list[int]: Zero-based column position of each row's maximum, in row
        order; the first position wins ties.
    """
    def _argmax(values):
        return max(range(len(values)), key=values.__getitem__)

    return [_argmax(list(row)) for _, row in df.iterrows()]

In [None]:
# Per-model class probabilities on the validation set.
soft_df = generate_ensemble_pred_soft(ensemble_soft)
# Per-class totals; note that sum_probs also adds '0_total' / '1_total' to soft_df itself.
soft_sum_probs = sum_probs(soft_df)
# Predicted class = position of the larger total in each row.
all_pred_soft = get_majority_pred_soft(soft_sum_probs)

In [None]:
# Validation metrics of the soft-voting ensemble (labels[1] is val.relevance).
classification_report(labels[1], all_pred_soft, output_dict=True)