In [50]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer 
import re
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler 

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Split labelled set

In [97]:
# Load the labelled data, shuffle it reproducibly, and cut it 60/20/20 into
# train / validation / test partitions.
df1 = pd.read_csv('../data/all_labelled_7Oct.csv')

# Positional slicing is used instead of np.split(): splitting a DataFrame with
# np.split() goes through DataFrame.swapaxes, which pandas deprecated in 2.1.
shuffled = df1.sample(frac=1, random_state=4103)
train_end, val_end = int(.6*len(df1)), int(.8*len(df1))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# Train and validation rows stacked together (train rows first).
trainval = pd.concat([train, val])

In [98]:
# For every partition, take the `relevant_sentences` column as the feature
# and the `relevant` column as the label.
X_train, y_train = train["relevant_sentences"], train["relevant"]
X_val, y_val = val["relevant_sentences"], val["relevant"]
X_trainval, y_trainval = trainval["relevant_sentences"], trainval["relevant"]
X_test, y_test = test["relevant_sentences"], test["relevant"]

# Vectorize and oversample

In [99]:
# Learn the bag-of-words vocabulary from the training sentences only, then
# encode every partition against that vocabulary.
#
# The text is read from the split frames (train / val / trainval / test)
# rather than from X_train etc.: those names are rebound to arrays below, so
# reading from them would make a second run of this cell fail on `.values`.
vectorizer = CountVectorizer()
vectorizer.fit(train.relevant_sentences.values.ravel())

# .toarray() turns each sparse count matrix into a dense ndarray.
X_train = vectorizer.transform(train.relevant_sentences.values.ravel()).toarray()
X_val = vectorizer.transform(val.relevant_sentences.values.ravel()).toarray()
X_trainval = vectorizer.transform(trainval.relevant_sentences.values.ravel()).toarray()
X_test = vectorizer.transform(test.relevant_sentences.values.ravel()).toarray()

In [100]:
#Oversampling the data
def oversample(X, y, random_state=4103):
    """Resample ``X`` and ``y`` with imblearn's ``RandomOverSampler``.

    Parameters
    ----------
    X : array-like
        Feature matrix to resample.
    y : array-like
        Class labels aligned with the rows of ``X``.
    random_state : int, default 4103
        Seed passed to ``RandomOverSampler`` so the resampling is repeatable.
        The default keeps the seed this function previously hard-coded.

    Returns
    -------
    tuple
        ``(df_oversampled, y)``: the resampled features wrapped in a
        ``pandas.DataFrame`` and the matching resampled labels.
    """
    ros = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = ros.fit_resample(X, y)

    # Wrap the resampled feature matrix in a DataFrame before returning it.
    df_oversampled = pd.DataFrame(X_resampled)
    return df_oversampled, y_resampled

In [101]:
# Oversample using the untouched label columns of the split frames rather than
# y_train / y_trainval: those names are rebound here, so feeding them back in
# would make a second run of this cell pair already-resampled labels with the
# original-size feature matrices.
df_oversample_train, y_train = oversample(X_train, train.relevant)
df_oversample_trainval, y_trainval = oversample(X_trainval, trainval.relevant)

# Logistic Regression

In [115]:
# Train logistic regression on the oversampled train+validation data and
# predict the held-out test set.
logreg = LogisticRegression().fit(df_oversample_trainval, y_trainval)
test_pred = logreg.predict(pd.DataFrame(X_test))

# Score the predictions: overall accuracy plus weighted and per-class F1.
test_metrics_lr = classification_report(y_test, test_pred, output_dict=True)
test_accuracy_lr = test_metrics_lr["accuracy"]
test_f1_weighted_lr = test_metrics_lr["weighted avg"]["f1-score"]
test_f1_zero_lr, test_f1_pos_lr = (
    test_metrics_lr[label]["f1-score"] for label in ("0", "1")
)

# Naive Bayes

In [118]:
# Train multinomial naive Bayes on the oversampled train+validation data and
# predict the held-out test set.
nb = MultinomialNB().fit(df_oversample_trainval, y_trainval)
test_pred = nb.predict(pd.DataFrame(X_test))

# Score the predictions: overall accuracy plus weighted and per-class F1.
test_metrics_nb = classification_report(y_test, test_pred, output_dict=True)
test_accuracy_nb = test_metrics_nb["accuracy"]
test_f1_weighted_nb = test_metrics_nb["weighted avg"]["f1-score"]
test_f1_zero_nb, test_f1_pos_nb = (
    test_metrics_nb[label]["f1-score"] for label in ("0", "1")
)

# SVM

In [123]:
# Train a support vector classifier on the oversampled train+validation data
# and predict the held-out test set.
svc = SVC().fit(df_oversample_trainval, y_trainval)
test_pred = svc.predict(pd.DataFrame(X_test))

# Score the predictions: overall accuracy plus weighted and per-class F1.
test_metrics_svc = classification_report(y_test, test_pred, output_dict=True)
test_accuracy_svc = test_metrics_svc["accuracy"]
test_f1_weighted_svc = test_metrics_svc["weighted avg"]["f1-score"]
test_f1_zero_svc, test_f1_pos_svc = (
    test_metrics_svc[label]["f1-score"] for label in ("0", "1")
)

# Random Forest

In [127]:
# Train a random forest on the oversampled train+validation data and predict
# the held-out test set.
# The forest is seeded (same seed as the shuffle and the oversampler) so the
# reported metrics are reproducible; an unseeded forest gives different
# numbers on every run.
rf = RandomForestClassifier(random_state=4103)
rf.fit(df_oversample_trainval, y_trainval)
test_pred = rf.predict(pd.DataFrame(X_test))

# Score the predictions: overall accuracy plus weighted and per-class F1.
test_metrics_rf = classification_report(y_test, test_pred, output_dict=True)
test_accuracy_rf = test_metrics_rf["accuracy"]
test_f1_weighted_rf = test_metrics_rf["weighted avg"]["f1-score"]
test_f1_zero_rf = test_metrics_rf["0"]["f1-score"]
test_f1_pos_rf = test_metrics_rf["1"]["f1-score"]

# Baseline (Dummy) Classifier

In [132]:
# Fit scikit-learn's DummyClassifier on the same oversampled train+validation
# data and predict the held-out test set.
dummy = DummyClassifier().fit(df_oversample_trainval, y_trainval)
test_pred = dummy.predict(pd.DataFrame(X_test))

# Score the predictions: overall accuracy plus weighted and per-class F1.
test_metrics_dummy = classification_report(y_test, test_pred, output_dict=True)
test_accuracy_dummy = test_metrics_dummy["accuracy"]
test_f1_weighted_dummy = test_metrics_dummy["weighted avg"]["f1-score"]
test_f1_zero_dummy, test_f1_pos_dummy = (
    test_metrics_dummy[label]["f1-score"] for label in ("0", "1")
)

In [136]:
# Gather the headline test metrics in a fixed model order:
# dummy, logistic regression, naive Bayes, SVM, random forest.
accuracy = [
    test_accuracy_dummy,
    test_accuracy_lr,
    test_accuracy_nb,
    test_accuracy_svc,
    test_accuracy_rf,
]
weight_f1 = [
    test_f1_weighted_dummy,
    test_f1_weighted_lr,
    test_f1_weighted_nb,
    test_f1_weighted_svc,
    test_f1_weighted_rf,
]