In [None]:
import pandas as pd
import numpy as np
from preprocessing import * 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from tqdm import tqdm
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled sentences; the path is relative to the notebook's working directory.
df1 = pd.read_csv(filepath_or_buffer='../data/all_labelled_17Oct.csv')

In [None]:
# Name the four columns, add a cleaned copy of every sentence, then keep only the
# rows that carry a carbon class and store that class as an integer.
# NOTE: this cell is not re-runnable on its own -- after one run df1 has five
# columns, so assigning four names fails; re-run the load cell first.
df1.columns = ['index', 'sentence', 'relevance', 'carbon_class']
df1 = (
    df1
    .assign(cleaned_sentence=lambda frame: frame['sentence'].apply(clean_sentence))
    .loc[lambda frame: frame['carbon_class'].notnull()]
    .astype({'carbon_class': int})
)

In [None]:
# Split labelled set: shuffle once with a fixed seed, then cut the shuffled rows
# 60% / 20% / 20% into train / val / test.
shuffled = df1.sample(frac=1, random_state=4103)
train_end = int(.6 * len(df1))
val_end = int(.8 * len(df1))

# Positional slices give the same three pieces as np.split(shuffled, [train_end, val_end])
# without routing a DataFrame through numpy (which depends on the deprecated
# DataFrame.swapaxes). .copy() makes each split an independent frame, so columns
# added to a split later are not written into a slice of `shuffled`.
train = shuffled.iloc[:train_end].copy()
val = shuffled.iloc[train_end:val_end].copy()
test = shuffled.iloc[val_end:].copy()
trainval = pd.concat([train, val])

# Label series indexed by position elsewhere in the notebook:
# 0 = train, 1 = val, 2 = test, 3 = train + val.
labels = [train.carbon_class, val.carbon_class, test.carbon_class, trainval.carbon_class]

# Initialise Best Models & Vectorizers

In [None]:
# Logistic regression member: estimator class, its hyper-parameter grid, the
# vectorizer class with its grid, and the dataframe column the vectorizer reads.
LR_MODEL = LogisticRegression
BEST_LR = list(ParameterGrid(dict(
    C=[0.1],
    class_weight=['balanced'],
    penalty=['l2'],
    solver=['lbfgs'],
)))
LR_VECT = TfidfVectorizer
BEST_LR_VECT = list(ParameterGrid(dict(
    analyzer=['word'],
    lowercase=[True],
    max_df=[0.25],
    min_df=[1],
    ngram_range=[(1, 1)],
)))
BEST_LR_SENTENCE = 'cleaned_sentence'
# Unpacked positionally by the ensemble functions:
# (model class, model grid, vectorizer class, vectorizer grid, text column).
LR = [LR_MODEL, BEST_LR, LR_VECT, BEST_LR_VECT, BEST_LR_SENTENCE]

# CatBoost member: same five-slot layout as the other ensemble members
# (model class, model grid, vectorizer class, vectorizer grid, text column).
CB_MODEL = CatBoostClassifier
BEST_CB = list(ParameterGrid(dict(
    depth=[1],
    iterations=[50],
    learning_rate=[1],
    verbose=[False],
)))
CB_VECT = TfidfVectorizer
BEST_CB_VECT = list(ParameterGrid(dict(
    analyzer=['word'],
    lowercase=[True],
    max_df=[0.25],
    min_df=[1],
    ngram_range=[(1, 3)],
)))
BEST_CB_SENTENCE = 'cleaned_sentence'
CB = [CB_MODEL, BEST_CB, CB_VECT, BEST_CB_VECT, BEST_CB_SENTENCE]

# SVM member, in two flavours that differ only in `probability`:
# the SOFT grid sets probability=True, the HARD grid leaves it unset.
SVM_MODEL = SVC
BEST_SVM_HARD = list(ParameterGrid(dict(
    C=[0.1],
    class_weight=['balanced'],
    gamma=['scale'],
    kernel=['sigmoid'],
)))
BEST_SVM_SOFT = list(ParameterGrid(dict(
    C=[0.1],
    class_weight=['balanced'],
    gamma=['scale'],
    kernel=['sigmoid'],
    probability=[True],
)))
SVM_VECT = TfidfVectorizer
BEST_SVM_VECT = list(ParameterGrid(dict(
    analyzer=['word'],
    lowercase=[True],
    max_df=[0.25],
    min_df=[1],
    ngram_range=[(1, 1)],
)))
# Unlike the other members, the SVM reads the raw 'sentence' column.
BEST_SVM_SENTENCE = 'sentence'
SVM_HARD = [SVM_MODEL, BEST_SVM_HARD, SVM_VECT, BEST_SVM_VECT, BEST_SVM_SENTENCE]
SVM_SOFT = [SVM_MODEL, BEST_SVM_SOFT, SVM_VECT, BEST_SVM_VECT, BEST_SVM_SENTENCE]

# Multinomial naive Bayes member: same five-slot layout as the other members
# (model class, model grid, vectorizer class, vectorizer grid, text column).
NB_MODEL = MultinomialNB
BEST_NB = list(ParameterGrid(dict(alpha=[1])))
NB_VECT = TfidfVectorizer
BEST_NB_VECT = list(ParameterGrid(dict(
    analyzer=['word'],
    lowercase=[True],
    max_df=[0.25],
    min_df=[1],
    ngram_range=[(1, 3)],
)))
BEST_NB_SENTENCE = 'cleaned_sentence'
NB = [NB_MODEL, BEST_NB, NB_VECT, BEST_NB_VECT, BEST_NB_SENTENCE]

# Random forest member: same five-slot layout as the other members
# (model class, model grid, vectorizer class, vectorizer grid, text column).
RF_MODEL = RandomForestClassifier
BEST_RF = list(ParameterGrid({
    'class_weight': ['balanced'],
    'criterion': ['entropy'],
    'min_samples_leaf': [1],
    'min_samples_split': [5],
    'max_features': ['log2'],
}))
RF_VECT = CountVectorizer
# max_df is the float 1.0 ("no upper limit on document frequency"). The integer 1
# is read by CountVectorizer as an absolute count and would discard every term that
# appears in more than one document, leaving only words seen in a single sentence.
BEST_RF_VECT = list(ParameterGrid({
    'analyzer': ['word'],
    'lowercase': [True],
    'max_df': [1.0],
    'min_df': [1],
    'ngram_range': [(1, 1)],
}))
BEST_RF_SENTENCE = 'cleaned_sentence'
RF = [RF_MODEL, BEST_RF, RF_VECT, BEST_RF_VECT, BEST_RF_SENTENCE]

In [None]:
# The two ensembles share every member except the SVM variant. Member order must
# stay LR, CB, SVM, NB, RF: the prediction functions name their output columns
# 'lr', 'cb', 'svm', 'nb', 'rf' by position.
ensemble_hard, ensemble_soft = (
    [LR, CB, svm_variant, NB, RF] for svm_variant in (SVM_HARD, SVM_SOFT)
)

In [None]:
def oversample_smote(X, y):
    """Resample ``(X, y)`` with SMOTE using a fixed random seed (4103).

    Returns the resampled features and labels as an ``(X, y)`` tuple.
    """
    resampler = SMOTE(random_state=4103)
    X_resampled, y_resampled = resampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def vectorize_helper(vect, sentence_version):
    """Vectorize the module-level splits with ``vect`` and oversample the training ones.

    ``vect`` is fitted on ``train[sentence_version]`` only; ``val``, ``test`` and
    ``trainval`` are transformed with that fitted vectorizer.

    Parameters
    ----------
    vect : an unfitted vectorizer exposing ``fit_transform`` / ``transform``.
    sentence_version : name of the text column to read from each split.

    Returns
    -------
    tuple
        ``(train_os, val_matrix, test_matrix, trainval_os)`` where ``train_os`` and
        ``trainval_os`` are the ``(X, y)`` pairs returned by ``oversample_smote``
        (paired with ``labels[0]`` and ``labels[3]``), and the other two are the
        transformed matrices.
    """
    train_matrix = vect.fit_transform(train[sentence_version])
    # Same transform order as the splits are returned in: val, test, trainval.
    val_matrix, test_matrix, trainval_matrix = (
        vect.transform(split[sentence_version]) for split in (val, test, trainval)
    )

    train_resampled = oversample_smote(train_matrix, labels[0])
    trainval_resampled = oversample_smote(trainval_matrix, labels[3])
    return train_resampled, val_matrix, test_matrix, trainval_resampled

# TF-IDF Heuristics

In [None]:
# Keyword lists for the rule-based classifier. All keywords are lower-case and are
# matched as substrings; the lists are checked in order and the first hit wins.
class_zero = ["emissions","footprint","ghg", "coal"]
class_one = ["energy","renewable","electricity","power", "solar", "kwh"]
class_two = ["waste","paper", "office","recycled","environmental"]
class_three = ["sustainable","investment","investments","bonds", "portfolio", "finance"]

# Alternative keyword sets, currently disabled:
# class_zero = ["emissions","footprint","ghg"]
# class_one = ["energy","renewable","electricity","power", "solar"]
# class_two = ["waste","recycled","environmental"]
# class_three = ["susstainable","investment","investments","bonds","finance"]

def carbon_class_filter(row):
    """Assign a carbon class to one row using keyword matching.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with a string under ``"sentence"``.

    Returns
    -------
    int
        0-3 for the first keyword list (``class_zero`` .. ``class_three``) that has
        a keyword occurring in the sentence, or 4 when none matches.
    """
    # The keywords are lower-case, so compare against a lower-cased sentence;
    # otherwise "GHG", "kWh" or a sentence-initial "Emissions" would never match.
    sentence = row["sentence"].lower()
    keyword_lists = (class_zero, class_one, class_two, class_three)
    for label, keywords in enumerate(keyword_lists):
        if any(keyword in sentence for keyword in keywords):
            return label
    return 4

In [None]:
# Keep only the rows marked relevant and attach the keyword-heuristic prediction
# as column `heu`. .assign builds a new frame, so no column is written into a
# filtered slice of df1 (the chained-assignment pattern pandas warns about).
df_all_cc = (
    df1[df1.relevance == 1]
    .assign(heu=lambda relevant: relevant.apply(carbon_class_filter, axis=1))
)

In [None]:
# Per-class metrics of the keyword heuristic against the true carbon class, as a dict.
classification_report(
    y_true=df_all_cc['carbon_class'],
    y_pred=df_all_cc['heu'],
    output_dict=True,
)

# Generate heuristic predictions for the test set

In [None]:
# Attach the keyword-heuristic prediction to the test split as column `heu`.
# .assign returns a new frame instead of writing a column into `test` in place,
# which avoids a chained-assignment write when `test` is a slice of another frame.
test = test.assign(heu=test.apply(carbon_class_filter, axis=1))

# Hard Voting

In [None]:
def generate_ensemble_pred_hard(ensemble, model_names=('lr', 'cb', 'svm', 'nb', 'rf')):
    """Train every ensemble member on train+val and predict labels for the test split.

    Parameters
    ----------
    ensemble : sequence of 5-item specs
        Each spec is ``[model class, model grid, vectorizer class, vectorizer grid,
        text column]``. A grid is a list of keyword-argument dicts; the last entry
        of each grid is the configuration that gets used.
    model_names : sequence of str, optional
        Output column name for each member, in the same order as ``ensemble``.

    Returns
    -------
    pandas.DataFrame
        One row per test sentence and one integer column per member holding that
        member's predicted class.

    Raises
    ------
    ValueError
        If ``ensemble`` and ``model_names`` differ in length.
    """
    if len(ensemble) != len(model_names):
        raise ValueError(
            "ensemble has {n} members but {m} model names were given".format(
                n=len(ensemble), m=len(model_names)))

    predictions = {}
    for name, (model_fn, model_param, vect_fn, vect_param, sentence_ver) in zip(model_names, ensemble):
        # Index the grid directly: an empty grid now fails here instead of silently
        # reusing the vectorizer/model left over from the previous member.
        vect = vect_fn(**vect_param[-1])
        _, _, test_matrix, trainval_os = vectorize_helper(vect, sentence_ver)
        trainval_matrix, trainval_label = trainval_os

        model_tv = model_fn(**model_param[-1])
        model_tv.fit(trainval_matrix, trainval_label)

        # Flatten to 1-D so a member that returns its predictions as an (n, 1)
        # column lines up with the members that return a flat array.
        predictions[name] = np.ravel(model_tv.predict(test_matrix)).astype(int)

    return pd.DataFrame(predictions, columns=list(model_names))

def get_majority_pred_hard(df):
    """Return, for each row of ``df``, the value that occurs most often in that row.

    When several values are tied, the one that iteration over ``set(row values)``
    yields first is returned.
    """
    majority = []
    for _, row in df.iterrows():
        votes = list(row)
        majority.append(max(set(votes), key=votes.count))
    return majority

In [None]:
# Collect each model's vote on the test split, add the keyword heuristic as one
# more voter, then take the most common vote per row.
hard_df = generate_ensemble_pred_hard(ensemble_hard)
hard_df['heu'] = test['heu'].tolist()
hard_pred = get_majority_pred_hard(hard_df)

In [None]:
# Per-class metrics of the hard-voting ensemble on the test labels, as a dict.
classification_report(
    y_true=labels[2],
    y_pred=hard_pred,
    output_dict=True,
)

# Soft Voting

In [None]:
def generate_ensemble_pred_soft(ensemble, model_names=('lr', 'cb', 'svm', 'nb', 'rf')):
    """Train every ensemble member on train+val and collect test-set class probabilities.

    Parameters
    ----------
    ensemble : sequence of 5-item specs
        Each spec is ``[model class, model grid, vectorizer class, vectorizer grid,
        text column]``. A grid is a list of keyword-argument dicts; the last entry
        of each grid is the configuration that gets used. Every model must expose
        ``predict_proba``.
    model_names : sequence of str, optional
        Column-name prefix for each member, in the same order as ``ensemble``.

    Returns
    -------
    pandas.DataFrame
        One row per test sentence. Columns are named ``<name>_<k>`` where ``k`` is
        the position of the column in that member's ``predict_proba`` output
        (``lr_0`` .. ``rf_4`` for five members with five probability columns each).

    Raises
    ------
    ValueError
        If ``ensemble`` and ``model_names`` differ in length.
    """
    if len(ensemble) != len(model_names):
        raise ValueError(
            "ensemble has {n} members but {m} model names were given".format(
                n=len(ensemble), m=len(model_names)))

    member_frames = []
    for name, (model_fn, model_param, vect_fn, vect_param, sentence_ver) in zip(model_names, ensemble):
        # Index the grid directly: an empty grid now fails here instead of silently
        # reusing the vectorizer/model left over from the previous member.
        vect = vect_fn(**vect_param[-1])
        _, _, test_matrix, trainval_os = vectorize_helper(vect, sentence_ver)
        trainval_matrix, trainval_label = trainval_os

        model_tv = model_fn(**model_param[-1])
        model_tv.fit(trainval_matrix, trainval_label)

        member_proba = pd.DataFrame(model_tv.predict_proba(test_matrix))
        # Name columns from the matrix actually returned rather than assuming
        # exactly five probability columns per member.
        member_proba.columns = [
            '{name}_{k}'.format(name=name, k=k) for k in range(member_proba.shape[1])
        ]
        member_frames.append(member_proba)

    return pd.concat(member_frames, axis=1)

def sum_probs(df, heu_preds):
    """Sum per-class probabilities over the five models and add the heuristic's vote.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns ``<model>_<k>`` for model in lr, cb, svm, nb, rf and
        k in 0..4. As a side effect, columns ``0_total`` .. ``4_total`` (the plain
        sums, without the heuristic bonus) are added to ``df``.
    heu_preds : sequence of int
        Heuristic class for each row of ``df``, in row order.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``0_total`` .. ``4_total`` and the same index as
        ``df``; in every row the column of the heuristic's class is increased by 1.
    """
    model_names = ('lr', 'cb', 'svm', 'nb', 'rf')
    total_cols = []
    for cls in range(5):
        total_col = '{cls}_total'.format(cls=cls)
        df[total_col] = sum(
            df['{model}_{cls}'.format(model=model, cls=cls)] for model in model_names
        )
        total_cols.append(total_col)

    # Work on an explicit copy so the bonus never touches `df` itself.
    probs = df[total_cols].copy()
    for row_pos, pred in enumerate(heu_preds):
        col_pos = probs.columns.get_loc('{pred}_total'.format(pred=pred))
        # Address rows by position: heu_preds is in row order, and the index of
        # `df` is not guaranteed to be 0..n-1.
        probs.iloc[row_pos, col_pos] += 1
    return probs

def get_majority_pred_soft(df):
    """Return, for each row of ``df``, the position of its largest value.

    On ties the first (left-most) position holding the maximum is returned.
    """
    winners = []
    for _, row in df.iterrows():
        scores = list(row)
        winners.append(scores.index(max(scores)))
    return winners

In [None]:
# Collect every model's class probabilities on the test split, sum them per class
# with the keyword heuristic counted as one extra vote, then pick the top class per row.
soft_df = generate_ensemble_pred_soft(ensemble_soft)
soft_sum_probs = sum_probs(soft_df, test['heu'].tolist())
all_pred_soft = get_majority_pred_soft(soft_sum_probs)

In [None]:
# Per-class metrics of the soft-voting ensemble on the test labels, as a dict.
classification_report(
    y_true=labels[2],
    y_pred=all_pred_soft,
    output_dict=True,
)