# Testing the classifier without instances with no epithet

In [22]:
## import necessary libraries
import pandas as pd

from helper_functions import anova
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [23]:
## read in data
instances = pd.read_csv('data/ships_extended.csv', index_col=0)
instances = instances[instances['epithet_en']!='none']
X_train, X_test, Y_train, Y_test = train_test_split(instances.drop(['epithet_gr', 'epithet_en', 'clause'], axis=1), instances['epithet_en'], test_size=0.2, random_state=20016)

## Train model

In [24]:
## featurise
num = ['num_lines', 'difference', 'line', 'ratio']
X_train, X_test = anova(X_train[num], Y_train, X_test[num])

In [25]:
## model
lgr = LogisticRegression(C=1, penalty='l1', solver='saga')
lgr = lgr.fit(X_train, Y_train)



## Predict epithet

In [26]:
## make feature dataframe
test = Y_test
test = test.to_frame()
difference = [x[0] for x in X_test]
ratio = [x[1] for x in X_test]
test['difference'] = difference
test['ratio'] = ratio

In [27]:
## add predictions to dataframe
test['prediction'] = lgr.predict(X_test)
test.sort_index()

Unnamed: 0,epithet_en,difference,ratio,prediction
20,swift,1.0,0.666667,swift
42,hollowed,1.0,0.5,black
53,black,1.0,0.5,black
58,swift,1.0,0.833333,swift
60,swift,2.0,0.666667,black
61,hollowed,-1.0,0.666667,swift
75,well-balanced,1.0,0.5,black
88,with oars,1.0,0.5,black
97,curved,3.0,0.333333,black
98,black,2.0,0.5,black


## Analyse model

In [28]:
## overall score
lgr.score(X_test, Y_test)

0.20689655172413793

In [29]:
## create accuracy dicy
epithets = {}
for epithet in test['epithet_en'].unique():
    epithets[epithet] = {'tp':0, 'fp':0, 'fn':0, 'tn':0}

for index, row in test.iterrows():
    if row['epithet_en'] == row['prediction']:
        for epithet in epithets.keys():
            if epithet==row['epithet_en']:
                epithets[epithet]['tp']+=1
            else:
                epithets[epithet]['tn']+=1
    else:
        for epithet in epithets.keys():
            if epithet==row['epithet_en']:
                epithets[epithet]['fn']+=1
            elif epithet==row['prediction']:
                epithets[epithet]['fp']+=1
            else:
                epithets[epithet]['tn']+=1

In [30]:
## accuracy
accuracy = []
fp = []
fn = []
for epithet in epithets.keys():
    acc = (epithets[epithet]['tp']+epithets[epithet]['tn'])/len(Y_test)
    accuracy.append([epithet, acc, epithets[epithet]['fp'], epithets[epithet]['fn']])

acc = pd.DataFrame(accuracy)
acc.columns = ['epithet_en', 'accuracy', 'false_positives', 'false_negatives']
acc.sort_values(by='accuracy', ascending=False)

Unnamed: 0,epithet_en,accuracy,false_positives,false_negatives
1,sea-swift,0.965517,0,1
6,red-cheeked,0.965517,0,1
7,with oars,0.965517,0,1
8,hollow,0.965517,1,0
9,well-made,0.931034,0,2
10,well-benched,0.931034,0,2
2,well-balanced,0.896552,0,3
5,curved,0.896552,0,3
0,hollowed,0.793103,0,6
4,swift,0.724138,5,3


In [31]:
## average precision and recall
num = 0
prec_denom = 0
recall_denom = 0

for epithet in epithets.keys():
    num += epithets[epithet]['tp']

    prec_denom += epithets[epithet]['tp']
    prec_denom += epithets[epithet]['fp']

    recall_denom += epithets[epithet]['tp']
    recall_denom += epithets[epithet]['fn']

print('Precision:', num/prec_denom)
print('Recall:', num/recall_denom)

Precision: 0.21428571428571427
Recall: 0.20689655172413793
