# Testing the classifier

In [53]:
## import necessary libraries
import pandas as pd

from helper_functions import anova
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [54]:
## read in data
# The first CSV column is used as the row index.
instances = pd.read_csv('data/ships_extended.csv', index_col=0)

# Predictors are everything except the target (`epithet_en`) and the
# `epithet_gr` / `clause` columns; the target is the English epithet.
features = instances.drop(columns=['epithet_gr', 'epithet_en', 'clause'])
labels = instances['epithet_en']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=20016,
)

## Train model

In [55]:
## featurise
# Candidate numeric predictors handed to the project helper `anova`.
numeric_columns = ['num_lines', 'difference', 'line', 'ratio']

# `anova` receives the training predictors, the training labels and the test
# predictors, and returns the transformed train/test feature sets.
# NOTE(review): presumably an ANOVA-based feature selection -- confirm in
# helper_functions. X_train / X_test are rebound to its output here, so re-run
# the split cell before re-running this one.
X_train, X_test = anova(
    X_train[numeric_columns],
    Y_train,
    X_test[numeric_columns],
)

In [56]:
## model
# Depth-limited decision tree. scikit-learn randomly permutes the candidate
# features at every split, so equally good splits can be resolved differently
# from run to run; pinning random_state makes the fitted tree, and every
# metric computed from it, reproducible on a fresh kernel.
# `fit` returns the estimator itself, so construction and training are chained.
tree = DecisionTreeClassifier(max_depth=5, random_state=20016).fit(X_train, Y_train)

## Predict epithet

In [57]:
## make feature dataframe
# Each row of X_test is read positionally: element 0 is labelled 'difference'
# and element 1 is labelled 'ratio'.
# NOTE(review): assumes the featurise step kept exactly these two features in
# this order -- confirm against helper_functions.anova.
difference = [features[0] for features in X_test]
ratio = [features[1] for features in X_test]

# Start from the true labels (keeps the original row index) and attach the
# two feature columns next to them.
test = Y_test.to_frame().assign(difference=difference, ratio=ratio)

In [58]:
## add predictions to dataframe
# Predict an epithet for every held-out row and store it beside the true label.
test = test.assign(prediction=tree.predict(X_test))

# Display the rows ordered by their original index.
test.sort_index()

Unnamed: 0,epithet_en,difference,ratio,prediction
8,none,0.0,0.5,none
9,hollow,-2.0,0.666667,swift
12,none,0.0,0.666667,none
14,swift,1.0,0.666667,swift
21,swift,1.0,0.833333,swift
32,swift,-2.0,0.666667,swift
36,none,0.0,0.5,none
47,none,0.0,0.5,none
48,hollow,-2.0,0.333333,hollow
54,black,1.0,0.666667,swift


## Analyse model

In [59]:
## overall score
# Mean accuracy on the held-out split: the share of test rows whose predicted
# epithet equals the true label in Y_test.
tree.score(X_test, Y_test)

0.6785714285714286

In [61]:
## create accuracy dict
# One-vs-rest confusion counts for every epithet:
#   tp - row has this epithet and it was predicted
#   fn - row has this epithet but something else was predicted
#   fp - this epithet was predicted for a row that has another epithet
#   tn - this epithet is neither the true label nor the prediction
#
# Keys cover every label that occurs as a true value OR as a prediction.
# Building them from the true labels alone would silently drop the false
# positive whenever the model predicts an epithet that never occurs in the
# test labels. True labels come first, in order of appearance, followed by
# any prediction-only labels.
true_labels = list(test['epithet_en'].unique())
predicted_only = [label for label in test['prediction'].unique() if label not in true_labels]

epithets = {
    label: {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    for label in true_labels + predicted_only
}

# Every test row contributes exactly one count to every epithet.
for actual, predicted in zip(test['epithet_en'], test['prediction']):
    for epithet, counts in epithets.items():
        if epithet == actual == predicted:
            counts['tp'] += 1
        elif epithet == actual:
            # true label missed (a correct prediction was handled above)
            counts['fn'] += 1
        elif epithet == predicted:
            # predicted for a row that carries a different label
            counts['fp'] += 1
        else:
            counts['tn'] += 1

In [70]:
## accuracy
# Per-epithet one-vs-rest accuracy: (tp + tn) divided by the number of test rows.
n_test_rows = len(Y_test)
accuracy = [
    [epithet, (counts['tp'] + counts['tn']) / n_test_rows]
    for epithet, counts in epithets.items()
]

# Tabulate and show the epithets from most to least accurate.
acc = pd.DataFrame(accuracy)
acc.columns = ['epithet_en', 'accuracy']
acc.sort_values(by='accuracy', ascending=False)

Unnamed: 0,epithet_en,accuracy
0,none,1.0
3,well-made,1.0
7,well-benched,0.982143
10,with oars,0.982143
5,dark-prowed,0.964286
6,well-balanced,0.964286
8,hollowed,0.964286
4,hollow,0.946429
9,curved,0.946429
2,swift,0.821429


In [65]:
## micro-averaged precision and recall
# Counts are pooled over all epithets before dividing (micro-averaging):
#   precision = sum(tp) / sum(tp + fp)
#   recall    = sum(tp) / sum(tp + fn)
# Dedicated names are used for the totals so that no earlier notebook
# variable is rebound to a different kind of value.
total_tp = sum(counts['tp'] for counts in epithets.values())
total_fp = sum(counts['fp'] for counts in epithets.values())
total_fn = sum(counts['fn'] for counts in epithets.values())

print('Precision:', total_tp/(total_tp + total_fp))
print('Recall:', total_tp/(total_tp + total_fn))

Precision: 0.6785714285714286
Recall: 0.6785714285714286
