## Read in data

In [4]:
import os
import json
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler,SMOTE

from sklearn.model_selection import ParameterGrid
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier


from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")


In [5]:
# Load the labelled sentences and give the four columns fixed, readable names.
DATA_PATH = "../data/labelled_data/all_labelled_17Oct.csv"
COLUMN_NAMES = ['index', 'sentence', 'relevance', 'carbon_class']

data = pd.read_csv(DATA_PATH)
data.columns = COLUMN_NAMES


In [7]:
# Class balance of the relevance label over the whole labelled set.
data['relevance'].value_counts()

0    4775
1     464
Name: relevance, dtype: int64

### Preprocess text before creating BERT word embeddings
1. ONLY remove punctuation, keeping %, $ and &
2. DO NOT remove numbers because we want to capture numbers
3. DO NOT lemmatize/lowercase etc. because BERT does not require it

In [None]:
def remove_numbers(string):
    """Return `string` with every character for which `str.isdigit()` is true removed."""
    kept_chars = [ch for ch in string if not ch.isdigit()]
    return ''.join(kept_chars)

def remove_punc(s):
    """Strip ASCII punctuation from `s`, keeping '%', '$' and '&'.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        `s` without any character of `string.punctuation` other than
        '%', '$' and '&'. All other characters (letters, digits,
        whitespace, non-ASCII symbols) are left untouched.
    """
    import string  # local import: the notebook's import cell does not import `string`

    keep = '%$&'
    drop = ''.join(ch for ch in string.punctuation if ch not in keep)
    # A deletion table lets str.translate drop the characters in one pass
    # instead of testing every character against a freshly built list.
    return s.translate(str.maketrans('', '', drop))


In [None]:
# Clean every sentence with remove_punc (punctuation stripped, %, $ and & kept).
data['sentence'] = data['sentence'].map(remove_punc)
#data = data.loc[data.carbon_class.notnull()] # comment out for relevance

In [13]:
# Shuffle once with a fixed seed, then cut the rows 60% / 20% / 20% into
# train / val / test. Plain .iloc slices give the same partitions as
# np.split(DataFrame, [a, b]) without going through np.split on a DataFrame,
# which recent pandas versions deprecate (and this notebook silences warnings).
shuffled = data.sample(frac=1, random_state=4103)
train_end = int(.6 * len(data))
val_end = int(.8 * len(data))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

trainval = pd.concat([train, val])
# Order matters: later cells read labels[0], labels[1], labels[2], labels[3] positionally.
labels = [train.relevance, val.relevance, test.relevance, trainval.relevance]

In [23]:
# Share of each relevance class in the test split. normalize=True divides by
# the actual number of rows instead of a hand-typed total (943+105).
test.relevance.value_counts(normalize=True)

0    0.899809
1    0.100191
Name: relevance, dtype: float64

### BERT embeddings


In [None]:
%tensorflow_version 1.x

!pip install bert-serving-client
!pip install -U bert-serving-server[http]

!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
!nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 > out.file 2>&1 &


!ls  # you should see uncased_something_.zip



In [None]:
# Only one sentence-vectorisation strategy is searched: BERT-as-a-service embeddings.
vect_methods = dict(type=['bert_as_a_service'])
vect_methods_paramgrid = [choice for choice in ParameterGrid(vect_methods)]

### Grid Search

In [None]:
# Logistic regression search space: 5 x 2 x 2 x 2 = 40 combinations.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = [*ParameterGrid(logreg_params)]


# SVM search space: 5 x 3 x 2 x 2 = 60 combinations.
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = [*ParameterGrid(svm_params)]

In [None]:
# Oversampling helpers
def oversample_ros(X,y):
    """Resample (X, y) with imblearn's RandomOverSampler.

    The sampler is built with sampling_strategy=1.0 and a fixed
    random_state of 4103, so repeated calls on the same input give the
    same result.

    Returns
    -------
    tuple
        (X_resampled, y_resampled) as produced by `fit_resample`.
    """
    sampler = RandomOverSampler(sampling_strategy=1.0, random_state=4103)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

def oversample_smote(X,y):
    """Resample (X, y) with imblearn's SMOTE using a fixed random_state of 4103.

    Returns
    -------
    tuple
        (X_resampled, y_resampled) as produced by `fit_resample`.
    """
    sampler = SMOTE(random_state=4103)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def _vectorize_dataset(oversample_fn):
    """Encode every split with BERT-as-a-service and oversample the fitting splits.

    Shared implementation behind `vectorize_dataset_ros` and
    `vectorize_dataset_smote`, which previously were copies of each other
    differing only in the oversampler they called.

    Reads the notebook globals `train`, `val`, `test`, `trainval` and `labels`.

    Parameters
    ----------
    oversample_fn : callable
        Called as `oversample_fn(X, y)`; must return `(X_resampled, y_resampled)`.

    Returns
    -------
    dict
        `{'bert_as_a_service': [train, val, test, trainval]}` where the
        `train` and `trainval` entries are `(X, y)` tuples returned by
        `oversample_fn` and the `val` and `test` entries are the raw
        encodings returned by the BERT client.
    """
    # BERT as a Service (imported locally, as in the rest of this notebook)
    from bert_serving.client import BertClient
    bc = BertClient(check_length=False)

    # Encoded in the same order as before: train, val, test, trainval.
    bert_train, bert_val, bert_test, bert_trainval = (
        bc.encode(list(split.sentence)) for split in (train, val, test, trainval)
    )

    # oversample minority class -- only on the splits that models are fitted on;
    # val and test are returned un-resampled.
    bert_train_oversampled = oversample_fn(bert_train, labels[0])
    bert_trainval_oversampled = oversample_fn(bert_trainval, labels[3])

    return {'bert_as_a_service': [bert_train_oversampled, bert_val, bert_test, bert_trainval_oversampled]}


def vectorize_dataset_ros():
    """BERT-encode all splits; oversample train and trainval with `oversample_ros`."""
    return _vectorize_dataset(oversample_ros)


def vectorize_dataset_smote():
    """BERT-encode all splits; oversample train and trainval with `oversample_smote`."""
    return _vectorize_dataset(oversample_smote)


In [None]:
def _score_predictions(y_true, y_pred, prefix):
    """Return weighted/per-class F1 and accuracy keyed as `<prefix>_<metric>`."""
    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        prefix + "_f1_weighted": report["weighted avg"]["f1-score"],
        prefix + "_f1_zero": report["0"]["f1-score"],
        prefix + "_f1_one": report["1"]["f1-score"],
        prefix + "_accuracy": report["accuracy"],
    }


def hyperparam_search(dataset,model, model_function, model_grid):
    """Evaluate every parameter combination on the validation and test splits.

    For each vectoriser in the global `vect_methods_paramgrid` and each entry
    of `model_grid`, one estimator is fitted on the (oversampled) train split
    and scored on validation, and a fresh estimator is fitted on the
    (oversampled) train+val split and scored on test. Validation and test
    labels come from the global `labels` list (`labels[1]`, `labels[2]`).

    Parameters
    ----------
    dataset : dict
        Maps a vectoriser name to `[(X_train, y_train), X_val, X_test,
        (X_trainval, y_trainval)]`, as returned by the `vectorize_dataset_*`
        functions.
    model : str
        Label for the model family; stored in the `model_name` column.
    model_function : callable
        Estimator class/factory, called as `model_function(**params)`.
    model_grid : iterable of dict
        Parameter combinations to try.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the estimator fitted on train+val
        (`model`), the parameters, and `val_*` / `test_*` scores. Rows are
        sorted by `val_f1_one` then `test_f1_one`, descending, and the index
        is reset so that position 0 *and* label 0 both refer to the best row.
        An empty frame is returned when nothing was evaluated.
    """
    gridsearch_results = []
    for vect_choice in vect_methods_paramgrid:
        vect = vect_choice['type']
        splits = dataset[vect]
        train_hp, train_label = splits[0][0], splits[0][1]
        val_hp = splits[1]
        test_hp = splits[2]
        trainval_hp, trainval_label = splits[3][0], splits[3][1]

        val_label = labels[1]
        test_label = labels[2]

        for model_param in model_grid:
            # fit on train, score on validation
            val_clf = model_function(**model_param)
            val_clf.fit(train_hp, train_label)
            val_scores = _score_predictions(val_label, val_clf.predict(val_hp), "val")

            # refit a fresh estimator on train+val, score on test
            final_clf = model_function(**model_param)
            final_clf.fit(trainval_hp, trainval_label)
            test_scores = _score_predictions(test_label, final_clf.predict(test_hp), "test")

            # `model` keeps holding the fitted estimator (callers read `.model`);
            # the family label passed in goes to its own column.
            results = {"model": final_clf, "model_name": model}
            results.update(vect_choice)
            results.update(model_param)
            results.update(val_scores)
            results.update(test_scores)
            gridsearch_results.append(results)

    final_results = pd.DataFrame.from_records(gridsearch_results)
    if final_results.empty:
        # Nothing to rank; sort_values would fail on the missing score columns.
        return final_results

    # NOTE(review): test_f1_one is the tie-breaker, so the test split has a
    # (small) influence on the ranking -- confirm this is acceptable.
    final_results = final_results.sort_values(by=["val_f1_one", "test_f1_one"], ascending=False)
    # Without the reset, label-based access such as `.model[0]` returns the
    # first grid entry rather than the best-ranked row.
    return final_results.reset_index(drop=True)

# Log Reg

SMOTE performs better than ROS


In [None]:
# Build both oversampled embedding sets once; the SVM search further down reuses them.
bert_embeddings_ros = vectorize_dataset_ros()
bert_embeddings_smote = vectorize_dataset_smote()

# Same logistic-regression grid on each oversampling variant.
final_lr_results_ros = hyperparam_search(
    dataset=bert_embeddings_ros,
    model="log_reg",
    model_function=LogisticRegression,
    model_grid=logreg_paramgrid,
)
final_lr_results_smote = hyperparam_search(
    dataset=bert_embeddings_smote,
    model="log_reg",
    model_function=LogisticRegression,
    model_grid=logreg_paramgrid,
)

#final_lr_results.to_csv("model_results/bowtfidf/logreg.csv", index=False)

In [None]:
# Best-ranked estimator. .iloc[0] is positional (first row of the sorted
# results); .model[0] is label-based and picks the row labelled 0 instead.
final_lr_results_ros.model.iloc[0]

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Best-ranked estimator. .iloc[0] is positional (first row of the sorted
# results); .model[0] is label-based and picks the row labelled 0 instead.
final_lr_results_smote.model.iloc[0]

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Save best model

In [None]:
# Refit the chosen logistic-regression configuration on ALL labelled
# sentences (SMOTE-oversampled) and pickle it.
from bert_serving.client import BertClient

#best params
lr_params = {
    "C": [0.1],
    'class_weight': ['balanced'],
    'penalty':['l2'],
    'solver':['lbfgs']
}
lr_paramgrid = list(ParameterGrid(lr_params))

bc = BertClient(check_length=False)
bert_train = bc.encode(list(data.sentence))
#oversample
bert_train_smote,bert_train_smote_y = oversample_smote(bert_train,data.relevance)

final_model = LogisticRegression(**lr_paramgrid[0])
final_model.fit(bert_train_smote, bert_train_smote_y)

DATA_FOLDER = "../data/"
model_pkl_filename = DATA_FOLDER + "saved_models/relevance_models/model_LR_BERT.pkl"
# Create the target directory first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(model_pkl_filename), exist_ok=True)
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)


# SVM

SMOTE performs better than ROS


In [None]:
# SVM grid search on the SMOTE-oversampled embeddings.
final_svm_results_smote = hyperparam_search(
    dataset=bert_embeddings_smote,
    model="svm",
    model_function=SVC,
    model_grid=svm_paramgrid,
)

In [None]:
# Best-ranked estimator. .iloc[0] is positional (first row of the sorted
# results); .model[0] is label-based and picks the row labelled 0 instead.
final_svm_results_smote.model.iloc[0]

SVC(C=0.1, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Full top-ranked row (parameters and scores). Indexing the frame with [0]
# looks up a column named 0, which does not exist; .iloc[0] selects the row.
final_svm_results_smote.iloc[0]

In [None]:
# SVM grid search on the randomly-oversampled embeddings.
final_svm_results_ros = hyperparam_search(bert_embeddings_ros,"svm", SVC, svm_paramgrid)

# Best-ranked estimator: .iloc[0] is positional, whereas .model[0] is
# label-based and picks the row labelled 0.
final_svm_results_ros.model.iloc[0]

In [None]:
# Full top-ranked row (parameters and scores). Indexing the frame with [0]
# looks up a column named 0, which does not exist; .iloc[0] selects the row.
final_svm_results_ros.iloc[0]

### Save best model

In [None]:
# Refit the chosen SVM configuration on ALL labelled sentences
# (SMOTE-oversampled) and pickle it.
from bert_serving.client import BertClient

# Selected configuration. Kept under its own names so the search grid in
# `svm_params` / `svm_paramgrid` is not overwritten if earlier cells are re-run.
best_svm_params = {
    "C": [0.1],
    "kernel": ["poly"],
    "gamma": ["scale"],
    "class_weight": ['balanced'],
    "probability": [True]
}
best_svm_paramgrid = list(ParameterGrid(best_svm_params))

bc = BertClient(check_length=False)

bert_train = bc.encode(list(data.sentence))
#oversample
bert_train_smote,bert_train_smote_y = oversample_smote(bert_train,data.relevance)

final_model = SVC(**best_svm_paramgrid[0])
final_model.fit(bert_train_smote, bert_train_smote_y)

DATA_FOLDER = "../data/"
model_pkl_filename = DATA_FOLDER + "saved_models/relevance_models/model_SVM_BERT.pkl"
# Create the target directory first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(model_pkl_filename), exist_ok=True)
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)


## Generate Fold Predictions for Stacking

In [None]:
def custom_k_fold(model_grid, column, data, model_name, model_fn, n_folds=5):
    """Fit one model per (train, predict_on) pair and save its class probabilities.

    For every pair in `data` the text column is BERT-encoded, the training
    part is SMOTE-oversampled, a fresh model is fitted, and `predict_proba`
    on the held-out part is written to CSV under
    `DATA_FOLDER/fold_predictions/<model_name>/`. Uses the notebook global
    `DATA_FOLDER`.

    Parameters
    ----------
    model_grid : sequence of dict
        Parameter combinations. Only the LAST entry is used, which is what
        the previous loop-then-fit code effectively did; pass a
        single-element grid.
    column : str
        Name of the text column to encode.
    data : iterable of (DataFrame, DataFrame)
        (train, predict_on) pairs; both frames need `column`, and the train
        frame needs a `relevance` column.
    model_name : str
        Used for the output sub-directory, file names and column prefixes.
    model_fn : callable
        Estimator class/factory; the estimator must provide `predict_proba`.
    n_folds : int, default 5
        Pairs 1..n_folds are written as `<model_name>_fold<i>.csv`; every
        later pair is written to `<model_name>_test.csv`.

    Raises
    ------
    ValueError
        If `model_grid` is empty.
    """
    from bert_serving.client import BertClient

    model_grid = list(model_grid)
    if not model_grid:
        raise ValueError("model_grid must contain at least one parameter combination")
    model_params = model_grid[-1]

    out_dir = DATA_FOLDER + "fold_predictions/" + model_name + "/"
    os.makedirs(out_dir, exist_ok=True)

    # One client for all folds instead of opening a new connection per fold.
    bc = BertClient(check_length=False)

    # Generate fold predictions
    for fold_num, tf_combi in enumerate(data, start=1):
        train = tf_combi[0]
        predict_on = tf_combi[1]

        # Vectorise Data
        vec_train = bc.encode(list(train[column]))
        vec_predict_on = bc.encode(list(predict_on[column]))

        # Oversample the training part only
        vec_train_over, train_label_over = oversample_smote(vec_train, train.relevance)

        # Fit Model
        model = model_fn(**model_params)
        model.fit(vec_train_over, train_label_over)
        predictions = model.predict_proba(vec_predict_on)

        # Create Dataframe and output (two columns: assumes a binary problem)
        df = pd.DataFrame(data=predictions, columns = [model_name+'_prob_0', model_name+'_prob_1'])

        if fold_num <= n_folds:
            path = out_dir + model_name + '_fold' + str(fold_num) +'.csv'
        else:
            path = out_dir + model_name + '_test.csv'

        df.to_csv(path, index=False)

In [None]:
from sklearn.model_selection import KFold

# Five shuffled folds with a fixed seed.
# NOTE(review): the seed here is 4013 while other cells use 4103 -- confirm this is intentional.
kf = KFold(n_splits=5, shuffle=True, random_state=4013)

In [None]:
# Write each CV fold's train / validation rows to CSV for the stacking step.
# Fixes over the previous version of this cell:
#   * the fold counter now advances (every fold used to overwrite the *_1 files),
#   * the global `val` split is no longer overwritten by the loop variable,
#   * files go to DATA_FOLDER + 'folds/', where the loading cell reads them.
# NOTE(review): the loading cell also reads folds/trainval.csv and
# folds/test.csv, which this cell does not produce.
DATA_FOLDER = "../data/"
folds_dir = DATA_FOLDER + 'folds/'
os.makedirs(folds_dir, exist_ok=True)

trainval_reset = trainval.reset_index()
for ind, (train_index, test_index) in enumerate(kf.split(trainval_reset), start=1):
    fold_train = trainval_reset.iloc[train_index]
    fold_val = trainval_reset.iloc[test_index]
    fold_train.to_csv(folds_dir + 'train_folds_{ind}.csv'.format(ind=ind), index=False)
    fold_val.to_csv(folds_dir + 'val_folds_{ind}.csv'.format(ind=ind), index=False)

In [3]:
# Load the five CV folds plus the full train+val / test pair from disk.
DATA_FOLDER = "../data/"

train1, train2, train3, train4, train5 = [
    pd.read_csv(DATA_FOLDER + 'folds/train_folds_{}.csv'.format(fold_id))
    for fold_id in range(1, 6)
]

fold1, fold2, fold3, fold4, fold5 = [
    pd.read_csv(DATA_FOLDER + 'folds/val_folds_{}.csv'.format(fold_id))
    for fold_id in range(1, 6)
]

train_all = pd.read_csv(DATA_FOLDER + 'folds/trainval.csv')
testset = pd.read_csv(DATA_FOLDER + 'folds/test.csv')

# (train, predict_on) pairs: the five folds first, then train+val vs. test.
# Note: this rebinds `data`, which held the labelled DataFrame in earlier cells.
data = list(zip(
    (train1, train2, train3, train4, train5, train_all),
    (fold1, fold2, fold3, fold4, fold5, testset),
))

## LR

In [None]:
# Single-combination grid: the logistic-regression setting chosen for stacking.
lr_params = dict(
    C=[0.1],
    class_weight=['balanced'],
    penalty=['l2'],
    solver=['lbfgs'],
)

lr_paramgrid = [*ParameterGrid(lr_params)]

# Text column fed to the encoder
column = 'sentence'

# Estimator class used for the fold predictions
model_fn = LogisticRegression


In [None]:
# Write per-fold and test-set probabilities for the logistic-regression model.
custom_k_fold(
    model_grid=lr_paramgrid,
    column=column,
    data=data,
    model_name="LR_BERT",
    model_fn=model_fn,
)

## SVM

Not used for stacking

In [None]:
# Single-combination grid: the SVM setting chosen for the fold predictions.
svm_params = dict(
    C=[0.1],
    kernel=["poly"],
    gamma=["scale"],
    class_weight=['balanced'],
    probability=[True],  # must be true
)

svm_paramgrid = [*ParameterGrid(svm_params)]


# Text column fed to the encoder
column = 'sentence'

# Estimator class used for the fold predictions
model_fn = SVC

In [None]:
# Write per-fold and test-set probabilities for the SVM model.
custom_k_fold(
    model_grid=svm_paramgrid,
    column=column,
    data=data,
    model_name="SVM_BERT",
    model_fn=model_fn,
)