## Resources

In [142]:
# Directory layout, expressed as relative paths.
PROJECT_ROOT_PATH = '../../../..'
TEMP_PATH = PROJECT_ROOT_PATH + '/tmp'
DATA_PATH = TEMP_PATH + '/detect/data'

# Words and labels TSV files for the train and test splits.
WORDS_TRAIN_PATH = DATA_PATH + '/words.train.tsv'
WORDS_TEST_PATH = DATA_PATH + '/words.test.tsv'
LABELS_TRAIN_PATH = DATA_PATH + '/labels.train.tsv'
LABELS_TEST_PATH = DATA_PATH + '/labels.test.tsv'

import glob

# Every entry of DATA_PATH whose name contains "Feature", sorted by path.
FEATURES_TEST_PATH = glob.glob(DATA_PATH + '/*Feature*')
FEATURES_TEST_PATH.sort()
FEATURES_TEST_PATH

['../../../../tmp/detect/data/ApproximateContextCoherenceFeature$bigram.tsv',
 '../../../../tmp/detect/data/ApproximateContextCoherenceFeature$fivegram.tsv',
 '../../../../tmp/detect/data/ApproximateContextCoherenceFeature$fourgram.tsv',
 '../../../../tmp/detect/data/ApproximateContextCoherenceFeature$trigram.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$123.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$124.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$125.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$126.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$127.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$32.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$33.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$34.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$35.tsv',
 '../../../../tmp/detect/data/CharacterExistenceFeature$36.tsv',
 '../../../../tmp/detect/dat

## Data

In [138]:
import csv
import pandas as pd

def read_words(path):
    """Load a tab-separated words file into a DataFrame.

    The file is parsed without a header row and with quote handling
    disabled (``csv.QUOTE_NONE``), so quote characters are kept as
    ordinary text. Columns are named ``w-4`` .. ``w-1``, ``w``,
    ``w+1`` .. ``w+3`` and ``pos``.

    Args:
        path: Location of the TSV file.

    Returns:
        pandas.DataFrame with the nine columns listed above.
    """
    col_names = ["w-4", "w-3", "w-2", "w-1", "w", "w+1", "w+2", "w+3", "pos"]
    # The context manager closes the handle as soon as parsing finishes
    # instead of leaving it open until the garbage collector reclaims it.
    with open(path, 'r') as words_file:
        return pd.read_table(words_file,
                             header=None,
                             quoting=csv.QUOTE_NONE,
                             names=col_names,
                            )

def read_vals(path, name=None):
    """Load a headerless tab-separated file, labelling its column ``name``.

    Args:
        path: Location of the file to read.
        name: Label for the resulting column (defaults to ``None``).

    Returns:
        pandas.DataFrame parsed with ``names=[name]``.
    """
    # presumably one value per line -- verify against the data files
    # The context manager closes the handle as soon as parsing finishes
    # instead of leaving it open until the garbage collector reclaims it.
    with open(path, 'r') as vals_file:
        return pd.read_table(vals_file,
                             header=None,
                             names=[name],
                            )

def read_labels(path):
    """Read the file at ``path`` via ``read_vals``, naming its column ``label``."""
    labels = read_vals(path, name='label')
    return labels

def read_feat_vals(path):
    """Read one feature file, naming its column after the file.

    The column name is the last dot-separated component of the file's
    base name once the extension is removed, e.g.
    ``some/dir/pkg.CharacterExistenceFeature$32.tsv`` gives
    ``CharacterExistenceFeature$32``.

    Args:
        path: Location of the feature TSV file.

    Returns:
        Whatever ``read_vals`` returns for ``path`` with that column name.
    """
    import os  # local import: nothing earlier in the notebook imports os

    # Work on the base name only. Searching the whole path for '.' would
    # match the dots of relative segments such as '../..' and leak
    # directory components into the column name.
    stem = os.path.splitext(os.path.basename(path))[0]
    name = stem.rsplit('.', 1)[-1]
    return read_vals(path, name)

def read_feats(paths):
    """Read every feature file in ``paths`` and join them column-wise.

    Args:
        paths: Iterable of feature file locations, read in order.

    Returns:
        pandas.DataFrame made by concatenating the per-file frames
        along ``axis=1``.
    """
    columns = []
    for feat_path in paths:
        columns.append(read_feat_vals(feat_path))
    return pd.concat(columns, axis=1)

# Words tables for both splits.
WORDS_TRAIN = read_words(WORDS_TRAIN_PATH)
WORDS_TEST = read_words(WORDS_TEST_PATH)

# Label tables for both splits.
LABELS_TRAIN = read_labels(LABELS_TRAIN_PATH)
LABELS_TEST = read_labels(LABELS_TEST_PATH)

# All feature columns are loaded into one frame and then cut at the number
# of training rows.
# NOTE(review): assumes each feature file lists the training rows first,
# followed by the test rows -- confirm against the feature extractor.
feats = read_feats(FEATURES_TEST_PATH)
FEATS_TRAIN = feats.iloc[:len(WORDS_TRAIN)]
FEATS_TEST = feats.iloc[len(WORDS_TRAIN):]
feats = None  # drop this reference to the combined frame

## Training

In [155]:
import os
import shutil
from sklearn import tree
from sklearn.externals import joblib
import sys

# Directory that receives the pickled estimators.
MODEL_PATH = TEMP_PATH + '/detect/model'
# Estimators to fit, one pickle per entry.
MODELS = [tree.DecisionTreeClassifier()]
# When true, the training code removes MODEL_PATH before fitting.
FORCE_RETRAIN = True

def get_name(clf):
    """Return the class name of ``clf`` as a string."""
    cls = type(clf)
    return cls.__name__

def get_pkl_path(clf):
    """Return the pickle location for ``clf``: ``MODEL_PATH/<class name>.pkl``."""
    file_name = '%s.pkl' % type(clf).__name__
    return MODEL_PATH + '/' + file_name

# Start from an empty model directory when a retrain is forced. The
# directory check keeps a first run (no MODEL_PATH yet) from failing,
# because shutil.rmtree raises when its target does not exist.
if FORCE_RETRAIN and os.path.isdir(MODEL_PATH):
    shutil.rmtree(MODEL_PATH)
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
for clf in MODELS:
    # Write the name without a newline so the status lands on the same line.
    sys.stdout.write(get_name(clf) + '... ')
    pkl_path = get_pkl_path(clf)
    if not os.path.exists(pkl_path):
        clf.fit(FEATS_TRAIN, LABELS_TRAIN)
        joblib.dump(clf, pkl_path, compress=9)
        print('TRAINED (%s)' % os.path.basename(pkl_path))
    else:
        # A pickle for this estimator already exists; leave it untouched.
        print('SKIP')

DecisionTreeClassifier... TRAINED (DecisionTreeClassifier.pkl)


In [156]:
from sklearn import metrics
from sklearn.externals import joblib

for clf in MODELS:
    # Score the estimator loaded from its pickle, not the in-memory object.
    pkl_path = get_pkl_path(clf)
    clf = joblib.load(pkl_path)
    preds = clf.predict(FEATS_TEST)
    acc = metrics.accuracy_score(y_true=LABELS_TEST, y_pred=preds)
    print('%-30s %.6f' % (get_name(clf), acc))

DecisionTreeClassifier         0.980433
