In [2]:
import pandas as pd
import numpy as np
from preprocessing import *
import re
from tqdm import tqdm

from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_validate

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

import pickle


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinminaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xinminaw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#df1 = pd.read_csv('../data/all_labelled_17Oct.csv')
df1 = pd.read_csv('../data/labelled_data/all_labelled_17Oct.csv')

# Preprocess

In [4]:
df1.columns = ['index', 'sentence', 'relevance', 'carbon_class']
df1['cleaned_sentence'] = df1['sentence'].apply(clean_sentence)

# Split labelled set

In [5]:
train, val, test = \
              np.split(df1.sample(frac=1, random_state=4103), 
                       [int(.6*len(df1)), int(.8*len(df1))])
trainval =pd.concat([train, val])# Split labelled set
labels = [train.relevance, val.relevance, test.relevance, trainval.relevance]

# Parameters

In [6]:
vect_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1), (1,2), (1,3)],
    "max_df": [0.25, 0.5, 1.0],
    "min_df": [1, 10, 20]
}
vect_paramgrid = list(ParameterGrid(vect_params))

In [7]:
vect_methods = {'type': ['bow_raw', 'tfidf_clean']} #, 'processing': ['clean','raw']}
vect_methods_paramgrid = list(ParameterGrid(vect_methods))

In [8]:
# logistic regression
logreg_params = { "C": [0.1, 0.5, 1.0, 5], 
                 "solver": ["lbfgs", "newton-cg"], 
                 "penalty": ["l2", "none"],
                 "class_weight": ["balanced", None]}
logreg_paramgrid = list(ParameterGrid(logreg_params))

# naive bayes
nb_params = {"alpha": [0, 0.001, 0.01, 0.1, 0.25, 0.5, 1]}
nb_paramgrid = list(ParameterGrid(nb_params))

# svm
svm_params = { "C": [0.1, 0.5, 1, 5],
    "kernel": ["poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
    "class_weight": ["balanced", None] 
}

svm_paramgrid = list(ParameterGrid(svm_params))

# rf
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# dummy classifier
dummy_params = { "strategy": ["prior"] }
dummy_paramgrid = list(ParameterGrid(dummy_params))

# Grid Search

In [9]:
#Oversampling the data
def oversample_ros(X,y):
    ros = RandomOverSampler(random_state = 4103)
    X, y = ros.fit_resample(X, y)
    return X,y

def oversample_smote(X,y):
    smote = SMOTE(random_state = 4103)
    X, y = smote.fit_resample(X, y)
    return X,y

In [10]:
def vectorize_helper(vect, sentence_version):
    vec_train = vect.fit_transform(train[sentence_version])
    vec_val = vect.transform(val[sentence_version])
    vec_test = vect.transform(test[sentence_version])
    vec_trainval = vect.transform(trainval[sentence_version])
    
    vec_train_oversampled = oversample_smote(vec_train, labels[0])
    vec_trainval_oversampled = oversample_smote(vec_trainval, labels[3])
    return vec_train_oversampled, vec_val, vec_test, vec_trainval_oversampled

In [11]:
def vectorize_dataset(vect_param):
    # bag of words
    
    # raw
    bow = CountVectorizer(**vect_param)
    bow_train_oversampled, bow_val, bow_test, bow_trainval_oversampled = vectorize_helper(bow, 'sentence')

#     # cleaned
#     bow = CountVectorizer(**vect_param)
#     bow_train_clean_oversampled, bow_clean_val, bow_clean_test, bow_trainval_clean_oversampled = vectorize_helper(bow, 'cleaned_sentence')
    
    # tfidf
    
#     # raw
#     tfidf = TfidfVectorizer(**vect_param)
#     tfidf_train_oversampled, tfidf_val, tfidf_test, tfidf_trainval_oversampled = vectorize_helper(tfidf, 'sentence')
    
    # cleaned
    tfidf = TfidfVectorizer(**vect_param)    
    tfidf_train_clean_oversampled, tfidf_clean_val, tfidf_clean_test, tfidf_trainval_clean_oversampled = vectorize_helper(tfidf, 'cleaned_sentence')

    return {'bow_raw': [bow_train_oversampled, bow_val, bow_test, bow_trainval_oversampled], #{'clean': [bow_train_clean_oversampled, bow_clean_val, bow_clean_test, bow_trainval_clean_oversampled],
                    #'raw': [bow_train_oversampled, bow_val, bow_test, bow_trainval_oversampled]},
            'tfidf_clean': [tfidf_train_clean_oversampled, tfidf_clean_val, tfidf_clean_test, tfidf_trainval_clean_oversampled] #{'clean': [tfidf_train_clean_oversampled, tfidf_clean_val, tfidf_clean_test, tfidf_trainval_clean_oversampled],
                     #'raw':[tfidf_train_oversampled, tfidf_val, tfidf_test, tfidf_trainval_oversampled]}
           }

In [12]:
def hyperparam_search(model, model_function, model_grid):
    ind = 0 
    gridsearch_results = []
    for vect_param in tqdm(vect_paramgrid):
        dataset = vectorize_dataset(vect_param)
        for vect_choice in vect_methods_paramgrid:
            vect_type  = vect_choice['type']
#             sentence_proc = vect_choice['processing']
            train_hp = dataset[vect_type][0][0]
            val_hp = dataset[vect_type][1]
            test_hp = dataset[vect_type][2]
            trainval_hp = dataset[vect_type][3][0]

            train_label = dataset[vect_type][0][1]
            val_label = labels[1]
            test_label = labels[2]
            trainval_label = dataset[vect_type][3][1]

            for model_param in model_grid:
                # fit model on train set
                model = model_function(**model_param)
                model.fit(train_hp, train_label)
                val_pred = model.predict(val_hp)

                # scoring
                val_metrics = classification_report(val_label, val_pred, output_dict=True)
                val_accuracy = val_metrics["accuracy"]
                val_f1_weighted = val_metrics["weighted avg"]["f1-score"]
                val_f1_zero = val_metrics["0"]["f1-score"]
                val_f1_one = val_metrics["1"]["f1-score"]

                # fit model on trainval set
                model = model_function(**model_param)
                model.fit(trainval_hp, trainval_label)
                test_pred = model.predict(test_hp)

                # scoring
                test_metrics = classification_report(test_label, test_pred, output_dict=True)
                test_accuracy = test_metrics["accuracy"]
                test_f1_weighted = test_metrics["weighted avg"]["f1-score"]
                test_f1_zero = test_metrics["0"]["f1-score"]
                test_f1_one = test_metrics["1"]["f1-score"]

                results = {"model": model}
                results.update(vect_param)
                results.update(vect_choice)
                results.update(model_param)
                results.update({"val_f1_weighted": val_f1_weighted,  
                                "val_f1_zero": val_f1_zero,
                                "val_f1_one": val_f1_one,
                                "val_accuracy": val_accuracy})
                results.update({"test_f1_weighted": test_f1_weighted, 
                                "test_f1_zero": test_f1_zero,
                                "test_f1_one": test_f1_one,
                                "test_accuracy": test_accuracy})
                gridsearch_results.append(results)
                ind += 1
    final_results = pd.DataFrame.from_records(gridsearch_results)
    final_results = final_results.sort_values(by=["val_f1_one", "test_f1_one"], ascending=False)
#    final_results = final_results.sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)

    return final_results

# Log Reg

In [None]:
final_lr_results = hyperparam_search("log_reg", LogisticRegression, logreg_paramgrid)
final_lr_results.to_csv("model_results/bowtfidf/logreg_smote.csv", index=False)


# Naive Bayes

In [None]:
final_nb_results = hyperparam_search("nb", MultinomialNB, nb_paramgrid)
final_nb_results.to_csv("model_results/bowtfidf/nb.csv", index=False)

# SVM

In [None]:
final_svm_results = hyperparam_search("svm", SVC, svm_paramgrid)
final_svm_results.to_csv("model_results/bowtfidf/svm_smote.csv", index=False)

# RF

In [None]:
final_rf_results = hyperparam_search("rf", RandomForestClassifier, rf_paramgrid)
final_rf_results.to_csv("model_results/bowtfidf/rf.csv", index=False)

# Base Classifier

In [None]:
final_dummy_results = hyperparam_search("dummy", DummyClassifier, dummy_paramgrid)
final_dummy_results.to_csv("model_results/bowtfidf/dummy_ros.csv", index=False)

# Generate Fold Predictions

In [35]:
def custom_k_fold(model_grid, vectorizer_grid, column, data, model_name, vectFunc, model_fn):
    
    # Generate fold predictions
    fold_num = 1
    for tf_combi in data:
        train = tf_combi[0]
        predict_on = tf_combi[1]
        predict_label = predict_on.relevance
        
        # Vectorise Data
        for v in vectorizer_grid:
            vectorizer = vectFunc(**v)
        vec_train = vectorizer.fit_transform(train[column])
        vec_predict_on = vectorizer.transform(predict_on[column])
        
        # Get Labels
        train_label = train.relevance
        
        # Oversample
        vec_train_over, train_label_over = oversample_smote(vec_train, train_label)
        
        # Fit Model
        for m in model_grid:
            model = model_fn(**m)
        model.fit(vec_train_over, train_label_over)
        predictions = model.predict_proba(vec_predict_on)
        
        # Create Dataframe and output
        df = pd.DataFrame(data=predictions, columns = [model_name+'_prob_0', model_name+'_prob_1'])
        if model_name == 'SVM':
            df['relevance'] = predict_label
            
        if fold_num <=5:
            path = DATA_FOLDER + "fold_predictions/" + model_name + "/" + model_name + '_fold' + str(fold_num) +'.csv'
        else:
            path = DATA_FOLDER +"fold_predictions/" + model_name + "/" + model_name + '_test.csv'
        
        df.to_csv(path, index=False)
        
        fold_num +=1

In [30]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=4013, shuffle=True)

In [31]:
ind=1
for train_index, test_index in kf.split(trainval.reset_index()):
    tr = trainval.reset_index().iloc[train_index]
    val = trainval.reset_index().iloc[test_index]
    tr.to_csv('folds/train_folds_{ind}.csv'.format(ind=ind), index=False)
    val.to_csv('folds/val_folds_{ind}.csv'.format(ind=ind), index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'folds/train_folds_1.csv'

In [32]:
# Import Data
DATA_FOLDER = "../data/"
train1 = pd.read_csv(DATA_FOLDER + 'folds/train_folds_1.csv')
train2 = pd.read_csv(DATA_FOLDER + 'folds/train_folds_2.csv')
train3 = pd.read_csv(DATA_FOLDER +'folds/train_folds_3.csv')
train4 = pd.read_csv(DATA_FOLDER +'folds/train_folds_4.csv')
train5 = pd.read_csv(DATA_FOLDER +'folds/train_folds_5.csv')

fold1 = pd.read_csv(DATA_FOLDER +'folds/val_folds_1.csv')
fold2 = pd.read_csv(DATA_FOLDER +'folds/val_folds_2.csv')
fold3 = pd.read_csv(DATA_FOLDER +'folds/val_folds_3.csv')
fold4 = pd.read_csv(DATA_FOLDER +'folds/val_folds_4.csv')
fold5 = pd.read_csv(DATA_FOLDER +'folds/val_folds_5.csv')

train_all = pd.read_csv(DATA_FOLDER +'folds/trainval.csv')
testset = pd.read_csv(DATA_FOLDER +'folds/test.csv')

# store in suitable data structure
data = [(train1, fold1), (train2, fold2),(train3, fold3), (train4, fold4), (train5, fold5), (train_all, testset)]

In [None]:
data

# RF

In [47]:
# Instantiate model grid that gives highest validtion weighted F1 (Class 1)
rf_params = [
    {
        "criterion": ["entropy"],
        "min_samples_split": [5],
        "class_weight": ['balanced'],
        "max_features": ["log2"],
        "min_samples_leaf": [2]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# Instantiate Vectorizer grid with Params giving highest validation weighted F1 (Class 1)
rf_vect_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,2)],
    "max_df": [0.25],
    "min_df": [10]
}
rf_vect_paramgrid = list(ParameterGrid(rf_vect_params))

# Best text processing
column = 'cleaned_sentence'

# Model Function
model_fn = RandomForestClassifier

In [38]:
custom_k_fold(rf_paramgrid, rf_vect_paramgrid, column, data, "RF", TfidfVectorizer, model_fn)

In [59]:
tfidf = TfidfVectorizer(**rf_vect_paramgrid[0])
tfidf_train = tfidf.fit_transform(df1.cleaned_sentence)

final_model = RandomForestClassifier(**rf_paramgrid[0])

# oversample
tfidf_train_over, tfidf_label_over = oversample_smote(tfidf_train, df1.relevance)

final_model.fit(tfidf_train_over, tfidf_label_over)

vect_pkl_filename = DATA_FOLDER + "saved_models/model_RF_vectorizer.pkl"
model_pkl_filename = DATA_FOLDER + "saved_models/model_RF.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
with open(vect_pkl_filename, 'wb') as file:
    pickle.dump(tfidf, file)

# SVM

In [45]:
# Instantiate model grid that gives highest validtion weighted F1 (Class 1)
svm_params = {
    "C": [0.5],
    "kernel": ["sigmoid"],
    "gamma": ["scale"],
    "class_weight": ['balanced'],
    "probability": [True]
}

svm_paramgrid = list(ParameterGrid(svm_params))

# Instantiate Vectorizer grid with Params giving highest validation weighted F1 (Class 1)
svm_vect_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,2)],
    "max_df": [0.25],
    "min_df": [1]
}
svm_vect_paramgrid = list(ParameterGrid(svm_vect_params))

#Best text processing
column = 'cleaned_sentence'

# Model Function
model_fn = SVC

In [46]:
custom_k_fold(svm_paramgrid, svm_vect_paramgrid, column, data, "SVM", TfidfVectorizer, model_fn)

In [56]:
tfidf = TfidfVectorizer(**svm_vect_paramgrid[0])
tfidf_train = tfidf.fit_transform(df1.cleaned_sentence)

final_model = SVC(**svm_paramgrid[0])

# oversample
tfidf_train_over, tfidf_label_over = oversample_smote(tfidf_train, df1.relevance)

final_model.fit(tfidf_train_over, tfidf_label_over)

vect_pkl_filename = DATA_FOLDER + "saved_models/model_SVM_vectorizer.pkl"
model_pkl_filename = DATA_FOLDER + "saved_models/model_SVM.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
with open(vect_pkl_filename, 'wb') as file:
    pickle.dump(tfidf, file)

# NB

In [41]:
# Instantiate model grid that gives highest validtion weighted F1 (Class 1)
nb_params = {
    "alpha": [0.5]
}

nb_paramgrid = list(ParameterGrid(nb_params))

# Instantiate Vectorizer grid with Params giving highest validation weighted F1 (Class 1)
nb_vect_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1)],
    "max_df": [0.25],
    "min_df": [1]
}
nb_vect_paramgrid = list(ParameterGrid(nb_vect_params))

# Best text processing
column = 'sentence'

# Model Function
model_fn = MultinomialNB

In [42]:
custom_k_fold(nb_paramgrid, nb_vect_paramgrid, column, data, "NB", CountVectorizer, model_fn)

In [57]:
bow = CountVectorizer(**nb_vect_paramgrid[0])
bow_train = bow.fit_transform(df1.cleaned_sentence)

final_model = MultinomialNB(**nb_paramgrid[0])

# oversample
bow_train_over, bow_label_over = oversample_smote(bow_train, df1.relevance)

final_model.fit(bow_train_over, bow_label_over)


vect_pkl_filename = DATA_FOLDER + "saved_models/model_NB_vectorizer.pkl"
model_pkl_filename = DATA_FOLDER + "saved_models/model_NB.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
with open(vect_pkl_filename, 'wb') as file:
    pickle.dump(bow, file)

# LogReg

In [43]:
# Instantiate model grid that gives highest validtion weighted F1 (Class 1)
lr_params = {
    "C": [1],
    'class_weight': ['balanced'],
    'penalty':['l2'],
    'solver':['lbfgs']
}

lr_paramgrid = list(ParameterGrid(lr_params))

# Instantiate Vectorizer grid with Params giving highest validation weighted F1 (Class 1)
lr_vect_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1)],
    "max_df": [0.25],
    "min_df": [1]
}
lr_vect_paramgrid = list(ParameterGrid(lr_vect_params))

# Best text processing
column = 'cleaned_sentence'

# Model Function
model_fn = LogisticRegression

In [44]:
custom_k_fold(lr_paramgrid, lr_vect_paramgrid, column, data, "LR", TfidfVectorizer, model_fn)

In [58]:
tfidf = TfidfVectorizer(**lr_vect_paramgrid[0])
tfidf_train = tfidf.fit_transform(df1.cleaned_sentence)

final_model = LogisticRegression(**lr_paramgrid[0])

# oversample
tfidf_train_over, tfidf_label_over = oversample_smote(tfidf_train, df1.relevance)

final_model.fit(tfidf_train_over, tfidf_label_over)

vect_pkl_filename = DATA_FOLDER + "saved_models/model_LR_vectorizer.pkl"
model_pkl_filename = DATA_FOLDER + "saved_models/model_LR.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
with open(vect_pkl_filename, 'wb') as file:
    pickle.dump(tfidf, file)