## Train Models

In [4]:
from sklearn.metrics import f1_score, accuracy_score

import utils
from feature_extractor import FeatureExtractor
from log_regression import LogReg

# --- Experiment configuration -------------------------------------------------
# NOTE: `model_type` is only a label for the report printed below; the model
# trained in this cell is always `LogReg()`.
model_type = "LR"
feature_list = {"word_vector"}  # {"word_embedding", "word_vector", "pos_tag"}
vect = "count"
vect_pca = False

# Model labels whose report line should include the feature set. "LR" is the
# label actually used in this notebook; "LogReg" is kept for compatibility.
feature_based_models = {"MLP", "LogReg", "LR"}

# --- Data and features --------------------------------------------------------
train_x, train_y, test_x, test_y = utils.load_dataset_split()

feature_ext = FeatureExtractor(
    feature_list=feature_list,
    word_vectorizer=vect,
    vector_pca=vect_pca,
    vector_filter=True,
)

# The same extractor instance handles both splits; `train=True` is passed only
# for the training split (presumably this fits the vectorizer -- confirm in
# feature_extractor).
train_feat = feature_ext.extract_features(train_x, train=True)
test_feat = feature_ext.extract_features(test_x, train=False)

# --- Train and predict --------------------------------------------------------
model = LogReg()

model.train(train_feat, train_y)
test_pred = model.predict(test_feat)

# --- Report -------------------------------------------------------------------
# Build the header as a single string so no stray separator space is printed
# when the features clause is omitted.
report_header = f"Model: {model_type}"
if model_type in feature_based_models:
    report_header += f", Features: {feature_list}"
print(report_header)

if "word_vector" in feature_list:
    print(f"Vectorizer: {vect}")
    print(f"Vector PCA: {vect_pca}")
print("Accuracy: ", accuracy_score(test_y, test_pred))
print("F1 Score: ", f1_score(test_y, test_pred, average="macro"))

Model: LR 
Vectorizer: count
Vector PCA: False
Accuracy:  0.7743148844707146
F1 Score:  0.7490496883284102


In [7]:
# Classify one hand-picked sentence with the trained model.
input_sentence = "Our findings agree with recent work [69,70] where continual organic enrichment from farming processes resulted in increased macrofaunal abundances despite expectations of negative impacts from this contamination."

# Pass `train=False` explicitly, exactly as for the test split, so inference
# never depends on the default value of `train` in `extract_features`.
# `.values` yields the raw feature array; row 0 is the only sentence.
input_feat = feature_ext.extract_features([input_sentence], train=False).values
print(input_feat[0])
print(model.predict(input_feat))


[0 0 0 ... 0 0 0]
['background']




In [14]:
# List, for one class of the classifier, the learned weight of every vocabulary
# word that occurs in `input_sentence` (uses `input_feat` from the cell above).
vocab = feature_ext.vectorizer.vocabulary_

# Words ordered by the value stored in `vocabulary_`, which this cell treats as
# the feature column position so that words, counts and weights line up.
word_list = [w for w, _ in sorted(vocab.items(), key=lambda x: x[1])]

# Row of `coef_` to inspect.
# NOTE(review): the previous cell predicted 'background', which is the first
# class in the per-class listing further down; index 2 is a different class.
# Confirm this is the class you intend to explain.
class_idx = 2
class_name = model.model.classes_[class_idx]
weights = model.model.coef_[class_idx]

# zip() silently truncates to the shortest input, which would misalign words
# and weights without any error; fail loudly instead.
assert len(word_list) == len(weights) == len(input_feat[0]), (
    "vocabulary, coefficient row and feature vector must have equal length: "
    f"{len(word_list)}, {len(weights)}, {len(input_feat[0])}"
)

# Keep only words that actually occur in the sentence (non-zero feature value).
scores = [
    (word, weight)
    for word, count, weight in zip(word_list, input_feat[0], weights)
    if count > 0
]

# Highest weight first.
scores = sorted(scores, key=lambda x: x[1], reverse=True)

print(f"Weights for class: {class_name}")
for word, weight in scores:
    print(f"{word}: {weight}")

findings: 1.4090013765407698
agree: 0.5147994869489517
resulted: 0.5007935899514159
despite: 0.4248115525849851
recent: 0.41758512182448454
work: 0.3453511232274969
processes: 0.1503482350107756
increased: 0.03920864965984831
abundances: 0.020158143374003113
organic: 0.004595936270514809
enrichment: -0.04803863140994574
negative: -0.055026143580853566
impacts: -0.05993656065020865
farming: -0.08091825069898732
expectations: -0.09757888725230988
contamination: -0.1591118584135347


## Get Most Important Words

In [2]:
# For every class, rank the whole vocabulary by that class's coefficient and
# print the 20 highest-weighted words.
vocab = feature_ext.vectorizer.vocabulary_

# Vocabulary words in ascending order of their stored value, so position k in
# `word_list` pairs with position k of each coefficient row.
word_list = sorted(vocab, key=vocab.get)

for class_name, weights in zip(model.model.classes_, model.model.coef_):
    # Full ranking, largest weight first; only the head is printed below.
    sorted_words = sorted(
        zip(word_list, weights),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(class_name)
    for token, coef in sorted_words[:20]:
        print(f"{token}: {coef}")
    print("")

background
animals: 1.041577023525401
term: 0.7800491045017149
dysfunction: 0.7767181388714247
still: 0.6772008798694836
list: 0.6715095311036127
lengths: 0.6672990139549911
much: 0.6527520836759497
typically: 0.6375042832646156
investigation: 0.6142547485657774
hand: 0.6086005707499181
up: 0.6083068046220632
expectation: 0.6072804065433052
epitopes: 0.599898560051005
need: 0.5964572909724665
onset: 0.5873570569982723
measurements: 0.5822848009582191
nine: 0.5819514463366982
appear: 0.5810911132941853
possibly: 0.5808327292767645
inferred: 0.578413380986326

method
used: 1.7131360357006968
method: 1.6881860201332763
using: 1.4443434828159694
methods: 1.142924393650429
we: 1.1399945979375496
procedure: 1.1362047318744883
technique: 1.1258847077679708
described: 1.1137084855999073
evaluated: 1.0210772958918055
applied: 1.0037976763757348
tnf: 0.9673417277832363
performed: 0.9618754556167153
utilized: 0.9422340125398933
measured: 0.8976678779311809
selected: 0.8903081032076429
were: 0.876

### Subtract Mean

In [3]:
import numpy as np
# Centre each word's coefficient across classes (subtract the per-word mean
# over all classes), then print the 20 highest-weighted words per class.
# Relies on `word_list` built in an earlier cell.
# NOTE(review): in the recorded output these rankings match the uncentred ones
# up to floating-point noise, which suggests the per-word mean is already ~0.
coef_matrix = model.model.coef_
mean_weights = np.mean(coef_matrix, axis=0)

norm_weights = coef_matrix - mean_weights

for class_name, weights in zip(model.model.classes_, norm_weights):
    # Full ranking, largest centred weight first.
    sorted_words = sorted(
        zip(word_list, weights),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(class_name)
    for token, coef in sorted_words[:20]:
        print(f"{token}: {coef}")
    print("")

background
animals: 1.0415770235253972
term: 0.7800491045017127
dysfunction: 0.7767181388714246
still: 0.6772008798694839
list: 0.6715095311036119
lengths: 0.6672990139549906
much: 0.6527520836759496
typically: 0.6375042832646161
investigation: 0.6142547485657788
hand: 0.6086005707499184
up: 0.6083068046220618
expectation: 0.6072804065433044
epitopes: 0.5998985600510064
need: 0.5964572909724656
onset: 0.5873570569982727
measurements: 0.5822848009582181
nine: 0.5819514463366988
appear: 0.5810911132941854
possibly: 0.580832729276764
inferred: 0.5784133809863267

method
used: 1.713136035700697
method: 1.688186020133271
using: 1.444343482815968
methods: 1.1429243936504314
we: 1.139994597937548
procedure: 1.1362047318744897
technique: 1.1258847077679721
described: 1.1137084855999029
evaluated: 1.0210772958918057
applied: 1.003797676375734
tnf: 0.9673417277832363
performed: 0.961875455616713
utilized: 0.9422340125398933
measured: 0.8976678779311803
selected: 0.8903081032076421
were: 0.876081