In [51]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from utils import load_dataset_split
from feature_extractor import FeatureExtractor
from log_regression import LogReg
from mlp import MLP


train_x, train_y, test_x, test_y = load_dataset_split()

feature_list = {"pos_tag", "word_vector", "glove_embedding"}
vect = "count"
vect_pca = True
vect_filter = True

feature_ext = FeatureExtractor(feature_list=feature_list, word_vectorizer=vect, vector_pca=vect_pca, vector_filter=vect_filter)

train_feat = feature_ext.extract_features(train_x, train=True)
test_feat = feature_ext.extract_features(test_x, train=False)

## Log Reg

In [52]:
model_lr = LogReg()

model_lr.train(train_feat, train_y)
lr_test_pred = model_lr.predict(test_feat)

print(f"Model: LogReg", f", Features: {feature_list}")
if "word_vector" in feature_list:
    print(f"Vectorizer: {vect}") 
    print(f"Vector PCA: {vect_pca}")
    print(f"Vector Remove Stop Words / Numbers: {vect_filter}")
print("Accuracy: ", accuracy_score(test_y, lr_test_pred))
print("F1 Score: ", f1_score(test_y, lr_test_pred, average="macro"))

Model: LogReg , Features: {'word_vector', 'pos_tag', 'glove_embedding'}
Vectorizer: count
Vector PCA: True
Vector Remove Stop Words / Numbers: True
Accuracy:  0.7963460505104782
F1 Score:  0.7673413870701756


## MLP

In [53]:
print(f"Input Size: {len(train_feat.to_numpy()[0])}")

Input Size: 212


In [58]:
f1_scores = []

for i in range(5):
    score = []
    model_mlp = MLP([len(train_feat.to_numpy()[0])])

    print(f"Iteration {i}")
    for j in range(8):
        model_mlp.train(train_feat, train_y, epochs=5, print=0)
        mlp_test_pred = model_mlp.predict(test_feat)
        f1 = f1_score(test_y, mlp_test_pred, average="macro")
        print(f"Epoch: {j * 5}")
        print("F1 Score: ", f1)
        score.append(f1)


    print(f"Model: MLP", f", Features: {feature_list}")
    if "word_vector" in feature_list:
        print(f"Vectorizer: {vect}") 
        print(f"Vector PCA: {vect_pca}")
        print(f"Vector Remove Stop Words / Numbers: {vect_filter}")
    print("Accuracy: ", accuracy_score(test_y, mlp_test_pred))
    print("F1 Score: ", f1)

    f1_scores.append(score)




Iteration 0
Epoch: 0
F1 Score:  0.7299658256529816
Epoch: 5
F1 Score:  0.7458682612140918
Epoch: 10
F1 Score:  0.7424956842117177
Epoch: 15
F1 Score:  0.7587300762374732
Epoch: 20
F1 Score:  0.759203498040657
Epoch: 25
F1 Score:  0.7531400885816361
Epoch: 30
F1 Score:  0.7541852587050327
Epoch: 35
F1 Score:  0.7625892847957448
Model: MLP , Features: {'word_vector', 'pos_tag', 'glove_embedding'}
Vectorizer: count
Vector PCA: True
Vector Remove Stop Words / Numbers: True
Accuracy:  0.7931219774314885
F1 Score:  0.7625892847957448
Iteration 1
Epoch: 0
F1 Score:  0.7332101988115305
Epoch: 5
F1 Score:  0.7483907372918209
Epoch: 10
F1 Score:  0.7512590695689286
Epoch: 15
F1 Score:  0.7610716846370528
Epoch: 20
F1 Score:  0.7501299904692367
Epoch: 25
F1 Score:  0.7463748150503826
Epoch: 30
F1 Score:  0.754796623986335
Epoch: 35
F1 Score:  0.7408953535199588
Model: MLP , Features: {'word_vector', 'pos_tag', 'glove_embedding'}
Vectorizer: count
Vector PCA: True
Vector Remove Stop Words / Number

In [59]:
for i in range(8):
    s = 0
    for j in range(5):
        s += f1_scores[j][i]
    print(s / 5)

0.7222706088170062
0.7444361342010792
0.7488699590161256
0.7601210644653495
0.7591744161878856
0.7552338838017023
0.7544230577435257
0.7540864278400383
