In [71]:
PROJECT_ROOT_PATH = '../../../..'
TEMP_PATH = PROJECT_ROOT_PATH + '/tmp'
DATA_PATH  = TEMP_PATH + '/suggest/data'
MODEL_PATH = TEMP_PATH + '/suggest/model'
LOG_PATH   = TEMP_PATH + '/suggest/log'

SUGGESTS_TRAIN_MAPPED_PATH           = DATA_PATH + '/suggests.train.mapped.top10.txt'
SUGGESTS_TRAIN_MAPPED_IDENTICAL_PATH = DATA_PATH + '/suggests.train.mapped.identical.top10.txt'
SUGGESTS_TEST_PATH                   = DATA_PATH + '/suggests.test.top10.txt'

TRAIN_TIME_LOG_PATH = LOG_PATH + '/time.model.train.log'

In [72]:
for path in [DATA_PATH, MODEL_PATH, LOG_PATH]:
    if not os.path.exists(path):
        os.makedirs(path)

## Data preparation

In [88]:
import data

TRAIN_DATASET = data.Dataset.read(SUGGESTS_TRAIN_MAPPED_IDENTICAL_PATH)
TEST_DATASET = data.Dataset.read(SUGGESTS_TEST_PATH)

In [3]:
for err in train_dataset.errors:
    corr = ''
    for idx, label in enumerate(err.labels):
        if label == 1:
            corr = err.candidates[idx].name
    print('%-20s %-20s %s' % (err.name, corr, 'true' if err.name is corr else ''))

ORIOLID^                                  
famil}^              family               
whicli               which                
IdcridcE                                  
Corvince                                  
iu                   in                   
Corvidcr             Corvidae             
ga/bula              galbula              
iu                   in                   
uest                                      
Maj^                 May                  
Feildeu              Feilden              
j^ellowish           yellowish            
nsually              usually              
fii:e                fine                 
occasionall}'        occasionally         
//                                        
IMacphersou          Macpherson           
eas}'                easy                 
largel}'             largely              
b}^                  by                   
an}'                 any                  
tr3'ing              trying               
advantageou

## Suggested Candidates Evaluation

In [None]:
For err in TEST_DATSET.:

## Training

In [75]:
# Time logging utils

import os

def remove_log_file(path=TRAIN_TIME_LOG_PATH):
    os.remove(path)

def read_train_time(path=TRAIN_TIME_LOG_PATH):
    try:
        print(open(TRAIN_TIME_LOG_PATH, 'r').read())
        return dict(tuple(l.strip().split('\t')) for l in open(TRAIN_TIME_LOG_PATH, 'r'))
    except FileNotFoundError:
        return dict()

def update_train_time(name, time, path=TRAIN_TIME_LOG_PATH, read_func=read_train_time):
    timetable = read_func(path)
    timetable[name] = time
    with open(TRAIN_TIME_LOG_PATH, 'w') as log:
        log.write('\n'.join('%s\t%s' % (k, v) for k, v in timetable.items()))
    return timetable

In [4]:
import model

MODELS = [ # (name, model, balanced, customized grid)
    ('RF', model.RandomForestModel, False, None)
]

In [86]:
import time
import os

def get_pkl_path(model):
    """ Get the pathname to the serialized model file. """
    return '%s/%s.pkl' % (MODEL_PATH, model[0])

def train(models):
    for model in models:
        name, md, balanced, search_grid = model
        
        # Collect parameters
        kwargs = {'weighted': balanced, 'pkl_path': get_pkl_path(model)}
        if search_grid: kwargs['param_grid'] = search_grid
            
        # Train model
        if not os.path.exists(get_pkl_path(model)):
            t_str = time.time()
            md(train_dataset, **kwargs)
            t_end = time.time()
            update_train_time(name, '%.2f' % (t_end - t_str))
    print('\n'.join('%s'))
    
def test(models):
    for model in models:
        name, md, balanced, search_grid = model
        lm = md(TRAIN_DATASET, pkl_path=get_pkl_path(model))
        lm.predict(TEST_DATASET)
        print('1: {}\n3: {}\n5: {}\n10: {}\nA: {}\n'
            .format(TEST_DATASET.precision_at(1),
            TEST_DATASET.precision_at(1),
            TEST_DATASET.precision_at(3),
            TEST_DATASET.precision_at(5),
            TEST_DATASET.precision_at(10),
            TEST_DATASET.precision_at()))

In [76]:
train(MODELS)

In [87]:
test(MODELS)

1: 0.11088709677419355
3: 0.11088709677419355
5: 0.22983870967741934
10: 0.28830645161290325
A: 0.3407258064516129



In [69]:
import time

t1 = time.time()
time.sleep(1)
t2 = time.time()
print(t1)
print(t2)
print(t2 - t1)

1499870413.2445834
1499870414.24525
1.000666618347168


In [29]:
import collections

d = collections.defaultdict()
d['a'] = 'b'
d['c'] = 'd'
print(d)

defaultdict(None, {'a': 'b', 'c': 'd'})


In [33]:
d = {}
d['a'] = 'b'
d['c'] = 'd'
print(d)

'\n'.join('%s\t%s' % (k, v) for k, v in d.items())

{'a': 'b', 'c': 'd'}


'a\tb\nc\td'

In [22]:
tuple('1 2 3'.split())

('1', '2', '3')