# Create the model on data without instances with no epithet

In [9]:
## import necessary libraries
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

from helper_functions import anova, chi_square, tfidf_vector
from helper_functions import model_features
from numpy import append
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [10]:
## read in data
instances = pd.read_csv('data/ships_extended.csv', index_col=0)
instances = instances[instances['epithet_en']!='none']
X_train, X_test, Y_train, Y_test = train_test_split(instances.drop(['epithet_gr', 'epithet_en', 'clause'], axis=1), instances['epithet_en'], test_size=0.2, random_state=20016)

## Model features excluding trigrams

In [11]:
## split again to evaulate best model
x_f_train, x_f_test, y_train, y_test = train_test_split(X_train.drop('trigrams', axis=1), Y_train, test_size=0.2, random_state=20016)

In [12]:
## featurise
cat = ['ship', 'number', 'case', 'scansion', 'book', 'position']
num = ['num_lines', 'difference', 'line', 'ratio']
x_chi_train, x_chi_test = chi_square(x_f_train[cat], y_train, x_f_test[cat], 3)
x_anova_train, x_anova_test = anova(x_f_train[num], y_train, x_f_test[num])

x_both_train = []
x_both_test = []
for i in range(len(x_chi_train)):
    x_both_train.append(np.append(x_chi_train[i],x_anova_train[i]))
for i in range(len(x_chi_test)):
    x_both_test.append(np.append(x_chi_test[i], x_anova_test[i]))

In [13]:
## model
print('Raw')
model_features(x_f_train, x_f_test, y_train, y_test)

print('Chi square')
model_features(x_chi_train, x_chi_test, y_train, y_test)

print('ANOVA')
model_features(x_anova_train, x_anova_test, y_train, y_test)

print('Both')
model_features(x_both_train, x_both_test, y_train, y_test)

Raw
Baseline 0-R: 0.4
One-R: 0.4
Decision tree: 0.4
MLR: 0.4
Chi square
Baseline 0-R: 0.4
One-R: 0.4
Decision tree: 0.27
MLR: 0.4
ANOVA
Baseline 0-R: 0.4
One-R: 0.4
Decision tree: 0.41000000000000003
MLR: 0.45
Both
Baseline 0-R: 0.4
One-R: 0.4
Decision tree: 0.26000000000000006
MLR: 0.45


## Modelling Trigrams

In [14]:
## split for trigrams
x_t_train, x_t_test, y_train, y_test = train_test_split(X_train['trigrams'], Y_train, test_size=0.2, random_state=20016)

In [15]:
## featurise
x_tfidf_train, x_tfidf_test = tfidf_vector(x_t_train, x_t_test)
x_t_c_train, x_t_c_test = chi_square(x_tfidf_train, y_train, x_tfidf_test, 60)

In [16]:
## model
print('TFIDF')
model_features(x_tfidf_train, x_tfidf_test, y_train, y_test)

print('TFIDF and Chi Square')
model_features(x_t_c_train, x_t_c_test, y_train, y_test)

TFIDF
Baseline 0-R: 0.4
One-R: 0.30000000000000004
Decision tree: 0.26000000000000006
MLR: 0.4
TFIDF and Chi Square
Baseline 0-R: 0.4
One-R: 0.4
Decision tree: 0.4
MLR: 0.4


## Stacking models

In [17]:
label = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

predictions_train = []
predictions_test = []

In [18]:
## decision tree anova 
tree = DecisionTreeClassifier(max_depth=5)
tree = tree.fit(x_anova_train, y_train)

preds = tree.predict(x_anova_train)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_train.append(preds)

preds = tree.predict(x_anova_test)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_test.append(preds)

In [19]:
## mlr anova 
lgr = LogisticRegression(C=1, penalty='l1', solver='saga')
lgr = lgr.fit(x_anova_train, y_train)

preds = lgr.predict(x_anova_train)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_train.append(preds)

preds = lgr.predict(x_anova_test)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_test.append(preds)

In [20]:
## mlr both
lgr = LogisticRegression(C=1, penalty='l1', solver='saga')
lgr = lgr.fit(x_both_train, y_train)

preds = lgr.predict(x_both_train)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_train.append(preds)

preds = lgr.predict(x_both_test)
mapper = label.fit_transform(preds)
mapper = mapper.reshape(len(mapper),1)
arr = onehot_encoder.fit_transform(mapper)
preds = []
for i in range(len(arr)):
    for j in range(len(arr[i])):
        if arr[i][j]==1:
            preds.append(j)
predictions_test.append(preds)


In [21]:
## stacker
preds_train = pd.DataFrame(predictions_train).transpose()
preds_train.columns = ['features_oner', 'features_tree', 'tfidf_oner']

preds_test = pd.DataFrame(predictions_test).transpose()
preds_test.columns = ['features_oner', 'features_tree', 'tfidf_oner']

stacker = LogisticRegression()
stacker = stacker.fit(preds_train, y_train)
labels = stacker.predict(preds_test)
f1_score(y_test, labels, average='micro')

0.391304347826087