## Read in data

In [None]:
import os
import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler 

from sklearn.model_selection import ParameterGrid
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier


from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Labelled sentences, addressed relative to this notebook.
DATA_PATH = "../Data/labelled_data/all_labelled_17Oct.csv"
# Column names used by every later cell (replaces whatever header the CSV ships with).
COLUMN_NAMES = ['index', 'sentence', 'relevance', 'carbon_class']

# Disabled alternative source: a Google Sheets document published as CSV.
#data= pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSmEIfzUZLe4-7MyoUVW-NP0kndzRgsBQWe3WwBOxmU5wIQ75oRg1li3TImN1RFSSdxIO5K8T4h2n4E/pub?gid=0&single=true&output=csv")
data = pd.read_csv(DATA_PATH)
data.columns = COLUMN_NAMES


In [None]:
# Print the frame's column dtypes and non-null counts as a quick sanity check.
data.info()

In [None]:
# Class balance of the relevance label.
data["relevance"].value_counts()

In [None]:
# Distribution of the carbon_class label.
data["carbon_class"].value_counts()

### Preprocess text before embedding it
1. Don't remove numbers, because we want to capture them.
2. Don't lemmatize etc., because BERT does not need it.
3. ONLY remove punctuation, keeping %, $ and &.
- Upper/lower casing doesn't affect the embeddings.

In [None]:
def remove_numbers(string):
    """Return ``string`` with every digit character stripped out."""
    kept = [ch for ch in string if not ch.isdigit()]
    return ''.join(kept)

def remove_punc(s):
    """Strip ASCII punctuation from ``s`` while keeping ``%``, ``$`` and ``&``."""
    import string

    keep = ('%', '$', '&')
    # Every punctuation mark that must be dropped from the text.
    banned = [mark for mark in string.punctuation if mark not in keep]

    cleaned = []
    for ch in s:
        if ch in banned:
            continue
        cleaned.append(ch)
    return ''.join(cleaned)


In [None]:
# Overwrite each sentence with its punctuation-stripped form (%, $ and & are kept).
data["sentence"] = data["sentence"].map(remove_punc)

In [None]:
# Shuffle once with a fixed seed, then cut the rows 60% / 20% / 20% into
# train / val / test. Positional slicing yields the same partitions as
# np.split(shuffled, [train_end, val_end]) but does not route the DataFrame
# through NumPy (np.split relies on DataFrame.swapaxes, which newer pandas
# releases deprecate).
shuffled = data.sample(frac=1, random_state=4103)
train_end = int(.6 * len(data))
val_end = int(.8 * len(data))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]
assert len(train) + len(val) + len(test) == len(data)

# train followed by val: used to refit a model before scoring on test.
trainval = pd.concat([train, val])
# Relevance labels in the order train, val, test, train+val.
labels = [train.relevance, val.relevance, test.relevance, trainval.relevance]

### BERT embeddings


In [None]:
# NOTE(review): %tensorflow_version looks like a Colab-only magic — confirm this
# cell is only ever run on Colab.
%tensorflow_version 1.x

# Install the bert-as-service client and server packages (versions are not pinned).
!pip install bert-serving-client
!pip install -U bert-serving-server[http]

# Download and unpack the uncased L-12 / H-768 / A-12 BERT checkpoint, then start
# the server in the background; its stdout and stderr are redirected to out.file.
# NOTE(review): the server presumably needs some start-up time before it accepts
# requests — check out.file if later encode calls hang.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
!nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 > out.file 2>&1 &


!ls  # you should see uncased_something_.zip



In [None]:
# Only one embedding backend is searched, so this grid has a single entry.
vect_methods = dict(type=['bert_as_a_service'])
vect_methods_paramgrid = [choice for choice in ParameterGrid(vect_methods)]

In [None]:
# Logistic regression search space.
# NOTE(review): the string "none" for `penalty` is only accepted by older
# scikit-learn releases — confirm against the installed version.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = [combo for combo in ParameterGrid(logreg_params)]

# Naive Bayes search space (smoothing strength only).
nb_params = dict(alpha=[0, 0.001, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = [combo for combo in ParameterGrid(nb_params)]

# Support vector machine search space.
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)
svm_paramgrid = [combo for combo in ParameterGrid(svm_params)]

# Random forest search space.
# "auto" was dropped from max_features: for RandomForestClassifier it is the
# same setting as "sqrt", so keeping both fitted every configuration twice, and
# newer scikit-learn releases reject "auto" outright.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# Dummy classifier: a single "prior" strategy used as the baseline.
dummy_params = { "strategy": ["prior"] }
dummy_paramgrid = list(ParameterGrid(dummy_params))

### Grid Search

In [None]:
#Oversampling the data
def oversample(X,y):
    """Randomly over-sample ``X``/``y`` with a fixed seed.

    Uses ``RandomOverSampler`` with ``sampling_strategy=1.0`` and returns the
    resampled ``(X, y)`` pair.
    """
    sampler = RandomOverSampler(sampling_strategy=1.0, random_state=4103)
    X_balanced, y_balanced = sampler.fit_resample(X, y)
    return X_balanced, y_balanced

In [None]:
def vectorize_dataset():
    """Embed the train/val/test sentences with bert-as-service.

    Reads the notebook-level ``train``, ``val``, ``test``, ``trainval`` frames
    and the ``labels`` list. A bert-serving server must already be running.

    Returns:
        dict: ``{'bert_as_a_service': [train, val, test, trainval]}`` where
        ``train`` and ``trainval`` are oversampled ``(X, y)`` tuples and
        ``val`` / ``test`` are the raw embedding arrays.
    """
    # Imported here because the package is installed by an earlier notebook cell.
    from bert_serving.client import BertClient

    bc = BertClient(check_length=False)
    try:
        bert_train = bc.encode(list(train.sentence))
        bert_val = bc.encode(list(val.sentence))
        bert_test = bc.encode(list(test.sentence))
    finally:
        # Release the client's connections even if encoding fails.
        bc.close()

    # trainval is train followed by val, so its embeddings are the two blocks
    # stacked in that order; this avoids sending the same sentences through the
    # slow encoder a second time.
    bert_trainval = np.concatenate([bert_train, bert_val])
    assert len(bert_trainval) == len(trainval)

    # oversample minority class (training data only; val/test stay untouched)
    bert_train_oversampled = oversample(bert_train, labels[0])
    bert_trainval_oversampled = oversample(bert_trainval, labels[3])

    return {'bert_as_a_service': [bert_train_oversampled, bert_val, bert_test, bert_trainval_oversampled]}



In [None]:
def _f1_summary(y_true, y_pred, prefix):
    """Return weighted / per-class F1 and accuracy for one split, keyed by ``prefix``."""
    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        f"{prefix}_f1_weighted": report["weighted avg"]["f1-score"],
        f"{prefix}_f1_zero": report["0"]["f1-score"],
        f"{prefix}_f1_one": report["1"]["f1-score"],
        f"{prefix}_accuracy": report["accuracy"],
    }


def hyperparam_search(dataset,model, model_function, model_grid):
    """Fit and score every hyperparameter combination of one model family.

    For each entry of the notebook-level ``vect_methods_paramgrid`` and each
    combination in ``model_grid`` the estimator is fitted twice: on the train
    split (scored on validation) and on the train+val split (scored on test).

    Args:
        dataset: dict mapping a vectoriser name to
            ``[(X_train, y_train), X_val, X_test, (X_trainval, y_trainval)]``.
        model: short label of the model family; informational only, it does
            not influence the search.
        model_function: estimator class, called as ``model_function(**params)``.
        model_grid: iterable of keyword-argument dicts for ``model_function``.

    Returns:
        pandas.DataFrame: one row per combination holding the estimator fitted
        on train+val (column ``model``), the vectoriser choice, the
        hyperparameters and the val/test metrics. Rows are sorted best-first by
        ``val_f1_one`` then ``test_f1_one`` and re-indexed from 0, so both
        ``.iloc[0]`` and label ``0`` refer to the top-ranked row. An empty
        frame is returned when nothing was evaluated.

    Notes:
        The per-class metrics are read from the report keys ``"0"`` and
        ``"1"``, so the labels must stringify to exactly those values.
        NOTE(review): the test F1 is used as a tie-breaker when ranking —
        confirm that this use of the test split is intended.
    """
    gridsearch_results = []
    for vect_choice in vect_methods_paramgrid:
        vect = vect_choice['type']
        splits = dataset[vect]
        train_hp, train_label = splits[0][0], splits[0][1]
        val_hp, val_label = splits[1], labels[1]
        test_hp, test_label = splits[2], labels[2]
        trainval_hp, trainval_label = splits[3][0], splits[3][1]

        for model_param in model_grid:
            # fit on the train split, score on validation
            val_estimator = model_function(**model_param)
            val_estimator.fit(train_hp, train_label)
            val_scores = _f1_summary(val_label, val_estimator.predict(val_hp), "val")

            # refit a fresh estimator on train+val, score on test
            final_estimator = model_function(**model_param)
            final_estimator.fit(trainval_hp, trainval_label)
            test_scores = _f1_summary(test_label, final_estimator.predict(test_hp), "test")

            results = {"model": final_estimator}
            results.update(vect_choice)
            results.update(model_param)
            results.update(val_scores)
            results.update(test_scores)
            gridsearch_results.append(results)

    final_results = pd.DataFrame.from_records(gridsearch_results)
    if final_results.empty:
        # Nothing to rank; sorting would fail on the missing metric columns.
        return final_results
    final_results = final_results.sort_values(by=["val_f1_one", "test_f1_one"], ascending=False)
    # Drop the grid-order index so that row 0 is the best configuration.
    return final_results.reset_index(drop=True)

# Log Reg
**Note:** encoding the sentences with bert-as-service takes a long time.

In [None]:
# Encode once (slow) and reuse the embeddings for every model family below.
bert_embeddings = vectorize_dataset()
final_lr_results = hyperparam_search(
    dataset=bert_embeddings,
    model="log_reg",
    model_function=LogisticRegression,
    model_grid=logreg_paramgrid,
)
#final_lr_results.to_csv("model_results/bowtfidf/logreg.csv", index=False)

In [None]:
# Positional lookup: the results are sorted best-first, so the first row holds
# the top-ranked model. A label lookup (`.model[0]`) would instead return the
# row whose index label is 0, which need not be the best one after sorting.
final_lr_results.model.iloc[0]

# Naive Bayes 
- BERT embeddings contain negative values, which `MultinomialNB` cannot handle.

scikit-learn rejects them with the error: `Negative values in data passed to MultinomialNB (input X)`

In [None]:
# The embeddings must be passed as the first argument (it was missing, so the
# call failed on its signature before Naive Bayes was ever fitted).
# MultinomialNB is expected to reject negative feature values with a
# ValueError; catch it so the notebook still runs top to bottom.
try:
    final_nb_results = hyperparam_search(bert_embeddings, "nb", MultinomialNB, nb_paramgrid)
except ValueError as err:
    final_nb_results = None
    print(f"Naive Bayes skipped: {err}")
#final_nb_results.to_csv("model_results/bowtfidf/nb.csv", index=False)

# SVM

In [None]:
# Grid search over the SVM settings on the precomputed BERT embeddings.
final_svm_results = hyperparam_search(
    dataset=bert_embeddings,
    model="svm",
    model_function=SVC,
    model_grid=svm_paramgrid,
)
#final_svm_results.to_csv("model_results/bowtfidf/svm.csv", index=False)
final_svm_results

In [None]:
# The full table is already displayed by the cell that builds it; show only the
# ten best-ranked SVM configurations here instead of dumping everything again.
final_svm_results.head(10)

# RF

In [None]:
# Grid search over the random forest settings on the precomputed BERT embeddings.
final_rf_results = hyperparam_search(
    dataset=bert_embeddings,
    model="rf",
    model_function=RandomForestClassifier,
    model_grid=rf_paramgrid,
)
#final_rf_results.to_csv("model_results/bowtfidf/rf.csv", index=False)

# Base Classifier

In [None]:
# Baseline: the dummy classifier evaluated through the same search routine.
final_dummy_results = hyperparam_search(
    dataset=bert_embeddings,
    model="dummy",
    model_function=DummyClassifier,
    model_grid=dummy_paramgrid,
)
#final_dummy_results.to_csv("model_results/bowtfidf/dummy.csv", index=False)

# Stacking

In [None]:
# NOTE(review): the hyperparameters below equal the first value of every list in
# the logistic-regression and SVM search grids — verify that they really are the
# top-ranked configurations of the grid search.
def _best_logreg():
    """Build the logistic regression used both standalone and inside the stack."""
    return LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                              fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                              max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                              random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                              warm_start=False)


def _best_svc():
    """Build the SVM used both standalone and inside the stack."""
    return SVC(C=0.1, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
               max_iter=-1, probability=False, random_state=None, shrinking=True,
               tol=0.001, verbose=False)


# get a stacking ensemble of models
def get_stacking():
    """Return a stacking classifier: LR and SVM base models, LR meta-learner, 5-fold CV."""
    # define the base models
    level0 = [('lr', _best_logreg()), ('svm', _best_svc())]
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)


# get a list of best models to evaluate on
def get_models():
    """Return the models to compare, keyed by name: ``lr``, ``svm`` and ``stacking``."""
    return {
        'lr': _best_logreg(),
        'svm': _best_svc(),
        'stacking': get_stacking(),
    }


# evaluate a given model using cross-validation
def evaluate_model(model, X, y, n_splits=10, n_repeats=3, random_state=1):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Args:
        model: unfitted scikit-learn estimator.
        X: feature matrix.
        y: binary labels, with ``1`` as the positive class.
        n_splits: number of folds per repetition.
        n_repeats: number of repetitions.
        random_state: seed for the fold assignment.

    Returns:
        Array with one F1 score (positive class) per fold and repetition.

    Raises:
        Any error raised while fitting or scoring (``error_score='raise'``).
    """
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    # "f1" is scikit-learn's built-in binary F1 scorer with positive label 1, so
    # this matches make_scorer(f1_score, pos_label=1) without relying on a
    # make_scorer import that only happens in a later cell.
    return cross_val_score(model, X, y, scoring="f1", cv=cv, n_jobs=-1, error_score='raise')

In [None]:
from numpy import mean,std
from sklearn.metrics import make_scorer

# Cross-validate each candidate on the oversampled train+val embeddings.
# NOTE(review): oversampling happened before the folds are drawn, so duplicated
# rows can presumably land in both the fitting and the scoring part of a fold —
# confirm whether these scores are meant to be read as optimistic.
trainval_split = bert_embeddings["bert_as_a_service"][3]
X, y = trainval_split[0], trainval_split[1]
models = get_models()

# Per-model score arrays and their names, kept in evaluation order.
results, names = [], []
for name in models:
    model = models[name]
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))


### carbon classes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# For each carbon class, print the ten terms with the largest summed TF-IDF weight.
tfidf = TfidfVectorizer(analyzer='word', stop_words = 'english')
for i in range(5):
    print(f"Class {i}")
    X_train = data.loc[data.carbon_class == float(i)].sentence
    if X_train.empty:
        # The vectoriser cannot be fitted on zero documents; skip this class.
        print("no sentences in this class")
        continue

    # refit the vectoriser on this class's sentences only
    X_traintfidf = tfidf.fit_transform(X_train)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    if hasattr(tfidf, "get_feature_names_out"):
        terms = tfidf.get_feature_names_out()
    else:
        terms = tfidf.get_feature_names()

    # sum the tfidf weight of each term over all documents of the class
    sums = np.asarray(X_traintfidf.sum(axis=0)).ravel()

    # pair every term with its summed weight and show the top ten
    ranking = pd.DataFrame({'term': terms, 'rank': sums})
    print(ranking.sort_values('rank', ascending=False)[:10])