In [144]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix
from top2vec import Top2Vec
from lib.classifiers import logistic_regression, sgd_classifier, sgd_huber_classifier
from lib.data import training_data, test_data
from lib.vectors import bert_vectors, gensim_vectors, top2vec_vectors

In [145]:
# Load the labelled train and test sets.
#
# The loader functions imported from lib.data have the same names as the
# variables bound here. After the first run the names point at the loaded
# data, so calling them again on a re-run would raise "object is not
# callable". The callable() guards make this cell safe to execute repeatedly
# without restarting the kernel.
# Assumes the loaded objects themselves are not callable (later cells use
# them like pandas DataFrames) -- TODO confirm against lib.data.
# TODO(review): importing the loaders under distinct names would remove the
# shadowing altogether, but requires touching every cell that uses the data.
if callable(training_data):
    training_data = training_data()
if callable(test_data):
    test_data = test_data()

In [146]:
# Fetch the document vectors for each representation. Every helper's result
# is unpacked into a (train, test) pair.
# NOTE(review): presumably these are precomputed embeddings aligned row-wise
# with training_data / test_data -- confirm in lib.vectors.
# Only the Top2Vec pair feeds the active cells below; the BERT and gensim
# pairs are referenced solely by the commented-out alternative cells.
bert_train_vecs, bert_test_vecs = bert_vectors()
gensim_train_vecs, gensim_test_vecs = gensim_vectors()
t2v_train_vecs, t2v_test_vecs = top2vec_vectors()

In [147]:
# Feature matrix and labels for the Top2Vec representation (training side).
X = np.array(t2v_train_vecs)
y = np.array(training_data.target)

# The label vector is left-padded with zeros so that it has one entry per row
# of X. The pad width used to be the hard-coded constant 463; it is now
# derived from the actual shapes so the cell keeps working if either input
# changes size.
# NOTE(review): this padding assigns label 0 to every leading row of X that
# has no label in training_data. Presumably those rows are documents without
# a target -- confirm, and consider dropping them from X instead of
# fabricating labels for them.
n_missing_labels = len(X) - len(y)
if n_missing_labels < 0:
    raise ValueError(
        f"training labels ({len(y)}) outnumber training vectors ({len(X)}); "
        "cannot align y with X by padding"
    )
y = np.pad(y, pad_width=(n_missing_labels, 0), mode='constant')

# Test-side features and labels; no padding is applied to these.
X_test = np.array(t2v_test_vecs)
y_test = np.array(test_data.target)

In [148]:
# DISABLED ALTERNATIVE: use the gensim vectors instead of the Top2Vec ones.
# When uncommented, this rebinds X / X_test / y / y_test, appending 10 zero
# columns to both feature matrices and front-padding y with 463 zeros.
# NOTE(review): the reason for the 10 extra columns is not visible here --
# document it or delete this cell.
# X_test = np.array(gensim_test_vecs)
# y_test = np.array(test_data.target)
# X_test = np.pad(X_test, pad_width=((0,0),(0,10)), mode='constant')
# X = np.array(gensim_train_vecs)
# X = np.pad(X, pad_width=((0,0),(0,10)), mode='constant')

# y = np.array(training_data.target)
# y = np.pad(y, pad_width=((463,0)), mode='constant')

In [149]:
# DISABLED ALTERNATIVE: use the BERT vectors instead of the Top2Vec ones.
# When uncommented, this rebinds X / X_test / y / y_test and front-pads y
# with 463 zeros, exactly as the active Top2Vec cell does.
# X_test = np.array(bert_test_vecs)
# y_test = np.array(test_data.target)
# X = np.array(bert_train_vecs)

# y = np.array(training_data.target)
# y = np.pad(y, pad_width=((463,0)), mode='constant')

In [150]:
# Validation F1 for the three classifiers built from the training vectors.
#
# Each helper's result is unpacked as (model, validation labels, validation
# features, scaler). The validation features are passed through the returned
# scaler, the model predicts on them, and the weighted F1 is stored in a
# one-element list.
# NOTE(review): the `X_val_scale_*` names suggest the features may already be
# scaled, yet they are transformed again here -- confirm in lib.classifiers.
# NOTE(review): every list holds exactly one score, so the "+- std" part of
# each printed line is always 0.000.
# NOTE(review): the printed labels ("Logisitic", "SVM Huber") are kept
# byte-for-byte so the output is unchanged; consider correcting them.
cv_lr_f1_train = []
cv_lrsgd_f1_train = []
cv_svcsgd_f1_train = []

# -- logistic regression --
lr_train, y_val_lr_train, X_val_scale_lr_train, lr_scaler_train = logistic_regression(X, y)
X_val_lr_train_scaled = lr_scaler_train.transform(X_val_scale_lr_train)
y_pred_lr_train = lr_train.predict(X_val_lr_train_scaled)
lr_f1_train = f1_score(y_val_lr_train, y_pred_lr_train, average='weighted')
cv_lr_f1_train.append(lr_f1_train)

# -- SGD classifier --
sgd_train, y_val_sgd_train, X_val_scale_sgd_train, sgd_scaler_train = sgd_classifier(X, y)
X_val_sgd_train_scaled = sgd_scaler_train.transform(X_val_scale_sgd_train)
y_pred_sgd_train = sgd_train.predict(X_val_sgd_train_scaled)
sgd_f1_train = f1_score(y_val_sgd_train, y_pred_sgd_train, average="weighted")
cv_lrsgd_f1_train.append(sgd_f1_train)

# -- SGD classifier, Huber variant --
sgd_huber_train, y_val_sgd_huber_train, X_val_scale_sgd_huber_train, sgd_huber_scaler_train = sgd_huber_classifier(X, y)
X_val_sgd_huber_train_scaled = sgd_huber_scaler_train.transform(X_val_scale_sgd_huber_train)
y_pred_sgd_huber_train = sgd_huber_train.predict(X_val_sgd_huber_train_scaled)
sgd_huber_f1_train = f1_score(y_val_sgd_huber_train, y_pred_sgd_huber_train, average="weighted")
cv_svcsgd_f1_train.append(sgd_huber_f1_train)

# -- report --
print(f'Train Logistic Regression Val f1: {np.mean(cv_lr_f1_train):.3f} +- {np.std(cv_lr_f1_train):.3f}')
print(f'Train Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1_train):.3f} +- {np.std(cv_lrsgd_f1_train):.3f}')
print(f'Train SVM Huber Val f1: {np.mean(cv_svcsgd_f1_train):.3f} +- {np.std(cv_svcsgd_f1_train):.3f}')



Train Logistic Regression Val f1: 0.587 +- 0.000
Train Logisitic Regression SGD Val f1: 0.590 +- 0.000
Train SVM Huber Val f1: 0.642 +- 0.000


In [151]:
# Same procedure as the training cell, but the helpers are called with the
# test vectors and test labels.
#
# NOTE(review): because X_test / y_test are handed to the same helpers used
# on the training data, the scores printed here presumably come from fresh
# models fitted on (part of) the test set and scored on a validation split of
# it -- they are not an evaluation of the `*_train` models. Confirm that this
# is intended; a held-out evaluation would score the `*_train` models on
# X_test instead.
# NOTE(review): every list holds exactly one score, so the "+- std" part of
# each printed line is always 0.000.
cv_lr_f1_test = []
cv_lrsgd_f1_test = []
cv_svcsgd_f1_test = []

# -- logistic regression --
lr_test, y_val_lr_test, X_val_scale_lr_test, lr_scaler_test = logistic_regression(X_test, y_test)
X_val_lr_test_scaled = lr_scaler_test.transform(X_val_scale_lr_test)
y_pred_lr_test = lr_test.predict(X_val_lr_test_scaled)
lr_f1_test = f1_score(y_val_lr_test, y_pred_lr_test, average='weighted')
cv_lr_f1_test.append(lr_f1_test)

# -- SGD classifier --
sgd_test, y_val_sgd_test, X_val_scale_sgd_test, sgd_scaler_test = sgd_classifier(X_test, y_test)
X_val_sgd_test_scaled = sgd_scaler_test.transform(X_val_scale_sgd_test)
y_pred_sgd_test = sgd_test.predict(X_val_sgd_test_scaled)
sgd_f1_test = f1_score(y_val_sgd_test, y_pred_sgd_test, average="weighted")
cv_lrsgd_f1_test.append(sgd_f1_test)

# -- SGD classifier, Huber variant --
sgd_huber_test, y_val_sgd_huber_test, X_val_scale_sgd_huber_test, sgd_huber_scaler_test = sgd_huber_classifier(X_test, y_test)
X_val_sgd_huber_test_scaled = sgd_huber_scaler_test.transform(X_val_scale_sgd_huber_test)
y_pred_sgd_huber_test = sgd_huber_test.predict(X_val_sgd_huber_test_scaled)
sgd_huber_f1_test = f1_score(y_val_sgd_huber_test, y_pred_sgd_huber_test, average="weighted")
cv_svcsgd_f1_test.append(sgd_huber_f1_test)

# -- report --
print(f'Test Logistic Regression Val f1: {np.mean(cv_lr_f1_test):.3f} +- {np.std(cv_lr_f1_test):.3f}')
print(f'Test Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1_test):.3f} +- {np.std(cv_lrsgd_f1_test):.3f}')
print(f'Test SVM Huber Val f1: {np.mean(cv_svcsgd_f1_test):.3f} +- {np.std(cv_svcsgd_f1_test):.3f}')

Test Logistic Regression Val f1: 0.538 +- 0.000
Test Logisitic Regression SGD Val f1: 0.527 +- 0.000
Test SVM Huber Val f1: 0.583 +- 0.000


In [152]:
# Attach predictions from the three training-fitted classifiers to test_data:
# for each model, one column with the predicted label and one with the second
# column of predict_proba (index 1).
# Each model's own scaler is applied to X_test once and the result is reused
# for both columns; this assumes transform() leaves X_test itself unmodified.

# -- logistic regression --
X_test_for_lr = lr_scaler_train.transform(X_test)
test_data["lr_scores"] = lr_train.predict(X_test_for_lr)
test_data["lr_scores_prob_1"] = lr_train.predict_proba(X_test_for_lr)[:, 1]

# -- SGD classifier --
X_test_for_sgd = sgd_scaler_train.transform(X_test)
test_data["sgd_scores"] = sgd_train.predict(X_test_for_sgd)
test_data["sgd_scores_prob_1"] = sgd_train.predict_proba(X_test_for_sgd)[:, 1]

# -- SGD classifier, Huber variant --
X_test_for_sgd_huber = sgd_huber_scaler_train.transform(X_test)
test_data["sgd_huber_scores"] = sgd_huber_train.predict(X_test_for_sgd_huber)
test_data["sgd_huber_scores_prob_1"] = sgd_huber_train.predict_proba(X_test_for_sgd_huber)[:, 1]

In [153]:
# Keep only the test rows whose true target, compared as a string, is "0".
is_target_zero = test_data["target"].astype(str).isin(["0"])
test_pred_df = test_data[is_target_zero]

# Share of each logistic-regression predicted label among those rows
# (last expression of the cell, so the notebook displays it).
test_pred_df["lr_scores"].value_counts(normalize=True)


0    0.581907
1    0.418093
Name: lr_scores, dtype: float64

In [154]:
# Confusion matrix of the training-fitted logistic regression on the test set.
#
# The predictions compared against y_test must be predictions for the test
# rows themselves. This cell used to take the validation-split predictions
# from the training cell, left-pad them with 155 zeros to reach len(y_test),
# and compare those -- i.e. predictions for unrelated samples. It also
# rebound y_pred_lr_train from itself, so running the cell a second time
# padded it again and broke the length match.
# Predicting directly on X_test fixes both problems and leaves
# y_pred_lr_train untouched.
y_pred_lr_on_test = lr_train.predict(lr_scaler_train.transform(X_test))

# ravel() flattens the matrix; unpacking into four names relies on the labels
# being binary, as the original cell already did.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr_on_test).ravel()

270