In [1]:
PROJECT_ROOT_PATH = '../../../..'
TEMP_PATH = PROJECT_ROOT_PATH + '/tmp'
DATA_PATH  = TEMP_PATH + '/suggest/data'
MODEL_PATH = TEMP_PATH + '/suggest/model'
LOG_PATH   = TEMP_PATH + '/suggest/log'

WORDS_TRAIN_MAPPED_PATH           = DATA_PATH + '/words.train.mapped.tsv'
WORDS_TRAIN_MAPPED_IDENTICAL_PATH = DATA_PATH + '/words.train.mapped.identical.tsv'

SUGGESTS_TRAIN_MAPPED_PATH           = DATA_PATH + '/suggests.train.mapped.top10.txt'
SUGGESTS_TRAIN_MAPPED_IDENTICAL_PATH = DATA_PATH + '/suggests.train.mapped.identical.top10.txt'
SUGGESTS_TEST_PATH                   = DATA_PATH + '/suggests.test.top10.txt'

TRAIN_TIME_LOG_PATH = LOG_PATH + '/time.model.train.log'

In [3]:
import os

# Ensure folders exists in the later experiments. Otherwise may cause abnormal cases.
for path in [DATA_PATH, MODEL_PATH, LOG_PATH]:
    if not os.path.exists(path):
        os.makedirs(path)

## Data preparation

In [4]:
import data

TRAIN_DATASET = data.Dataset.read(SUGGESTS_TRAIN_MAPPED_IDENTICAL_PATH)
TEST_DATASET = data.Dataset.read(SUGGESTS_TEST_PATH)

In [12]:
for err in TRAIN_DATASET.errors:
    corr = ''
    for idx, label in enumerate(err.labels):
        if label == 1:
            corr = err.candidates[idx].name
    print('%-20s %-20s %s' % (err.name, corr, 'true' if err.name is corr else ''))

ORIOLID^                                  
famil}^              family               
whicli               which                
IdcridcE                                  
Corvince                                  
iu                   in                   
Corvidcr             Corvidae             
ga/bula              galbula              
iu                   in                   
uest                                      
Maj^                 May                  
Feildeu              Feilden              
j^ellowish           yellowish            
nsually              usually              
fii:e                fine                 
occasionall}'        occasionally         
//                                        
IMacphersou          Macpherson           
eas}'                easy                 
largel}'             largely              
b}^                  by                   
an}'                 any                  
tr3'ing              trying               
advantageou

\veek                week                 
nidificatiou         nidification         
Jul}^                July                 
steadil}^            steadily             
da}'                 day                  
diflScult            difficult            
hi                                        
ivlicoxs                                  
abetter                                   
particularl}'        particularly         
sj'stematicall}'                          
captivit}'           captivity            
;                    :                    
3'ear                year                 
b}'                  by                   
3'ou                 you                  
tlie                 the                  
thereb\'             thereby              
the}'                they                 
aud                  and                  
erythrinm            erythrinus           
FRINGILLID.'E                             
FRINGILLIN.F                              
horlulanus 

lylannikius                               
aviar}'              aviary               
thev                 they                 
the}'                they                 
the}'                they                 
iu                   in                   
SiVcrora                                  
Liolhrix                                  
aud                  and                  
Zosferops            Zosterops            
mauy                 many                 
l)ehaviour           behaviour            
aud                  and                  
aud                  and                  
bathiug              bathing              
the}/                they                 
aud                  and                  
E/iiixrizd           Emberiza             
iia                  cia                  
WO                                        
i()02                                     
Eiiibciiza           Emberiza             
aoidcs                                    
tlie       

In [13]:
for err in TEST_DATASET.errors:
    corr = ''
    for idx, label in enumerate(err.labels):
        if label == 1:
            corr = err.candidates[idx].name
    print('%-20s %-20s %s' % (err.name, corr, 'true' if err.name is corr else ''))

possibility          possibility          
niaj                 may                  
'                                         
bv                   by                   
Mr.                  Mr                   
llarting             Harting              
for                  for                  
my                   my                   
Oologj               Oology               
'                    '                    true
1S64                 1864                 
a                    a                    true
tame                 tame                 
Ravens               Ravens               
which                which                
Winterbottom         Winterbottom         
Cheltenham           Cheltenham           
built                built                
The                  The                  
fern                 fern                 
dead                 dead                 
On                   On                   
two                  two                  
tli

birds                birds                
birds                birds                
-                    -                    true
Sceboli??!                                
.                    .                    true
An                   An                   
England              England              
whilst               whilst               
Throughout                                
interbreeding        interbreeding        
occasionally         occasionally         
Carrion              Carrion              
countr}^             country              
and                  and                  
Wales                Wales                
The                  The                  
Hooded               Hooded               
thighs               thighs               
black                black                
remainder            remainder            
plumage              plumage              
ashy                 ashy                 
becoming             becoming             
qui

Jerdon               Jerdon               
observes             observes             
sparingly            sparingly            
j\Iala3-ana                               
and                  and                  
Familx-              Family               
ALAUDID.E                                 
.                                         
The                  The                  
Alauda               Alauda               
iDVCiisis            arvensis             
,                    ,                    true
LiNN                                      
.                    .                    true
FOUND                                     
during               during               
nesting              nesting              
70°                  70.5                 
,                    ,                    true
breeds               breeds               
sparingly            sparingly            
Japan                Japan                
Amoor                                     

Eugland              England              
and                  and                  
(                    (                    true
This                 This                 
Kent                 Kent                 
1902                                      
in                   in                   
December             December             
January              January              
1908                                      
Sussex               Sussex               
Faunlx-ALAUDID.E                          
.                    .                    true
The                  The                  
Lark                 Lark                 
Mcla)iOLoyypha                            
ycltontemis                               
,                                         
FoRST                                     
.                    .                    true
A                    A                    true
FLOCK                                     
visited              visited          

two                  two                  
1906                                      
Skerry               Skerry               
Vore                 Vore                 
Light                Light                
-                    -                    true
Messrs.              Messrs               
Witherby             Witherby             
Ticehurst            Ticehurst            
record               record               
Solway               Solway               
1899                                      
Aberdeen             Aberdeen             
]\Iarch              March                
20th                                      
1900                                      
Moray                Moray                
1903                 1903                 
Flannan              Flannan              
-                    -                    true
female               female               
June                 June                 
1905                                      
7th

## Suggested Candidates Evaluation

We analysis the __detected testing__ errors whether the top 10 suggested candidates include the ground truth correction.
There are 67.74% of the corrections are detected.

In [5]:
def detect_rate(dataset):
    corr = 0
    for err in dataset.errors:
        if sum(err.labels) > 0:
            corr += 1
    return corr / len(dataset.errors)
print('train detect rate:', detect_rate(TRAIN_DATASET))
print(' test detect rate:', detect_rate(TEST_DATASET))

train detect rate: 0.7880350696235173
 test detect rate: 0.884625356494685


In [112]:
for err in TEST_DATASET.errors:
    corr = [] 
    for cand, label in zip(err.candidates, err.labels):
        if label is 1:
            corr.append(cand)
    print('%-20s %s' % (err.name, '\t'.join([c.name for c in corr])))

possibility          possibility
niaj                 may
'                    
bv                   by
Mr.                  Mr.	Mr
llarting             Harting
for                  for
my                   my
Oologj               Oology
'                    '
1S64                 1864
a                    a
tame                 tame
Ravens               Ravens
which                which
Winterbottom         Winterbottom
Cheltenham           Cheltenham
built                built
The                  The
fern                 fern
dead                 dead
On                   On
two                  two
tlie                 the
nest                 nest
l)ut                 but
the                  the
l)y                  by
visitors             visitors
hatclied             hatched
Lilford              
last                 last
wliieli              which
,                    ,
liowever             however
considerable         considerable
reared               reared
1894             

Cashmere             Cashmere
N.                   
W.                   
India                India
In                   In
Rook                 Rook
prett}^              pretty
generally            generally
localities           localities
in                   in
Scotland             Scotland
though               though
rarer                rarer
Orkneys              Orkneys
The                  The
Rook                 Rook
l^are                bare
grey                 grey
warty                warty
patch                patch
extending            extending
Bill                 Bill
iris                 iris
Tlie                 The
female               female
Tlie                 The
young                young
glossy               glossy
Carrion              Carrion
bristly              bristly
feathers             feathers
it                   it
.slender             slender
bill                 bill
tlie                 the
deep                 deep
.slate               slate
- 

mottled              mottled
smok}'               smoky
grey                 grey
zone                 zone
sometimes            sometimes
streaks              streaks
aberrant             aberrant
egg                  egg
whicli               which
I                    I
lent                 lent
illustration         illustration
.                    .
XI                   XI
fig.                 fig
Bunting              Bunting
it                   it
sieuua               sienna
and                  and
macular              macular
along                along
Although             Although
Sky                  Sky
-                    -
pairs                pairs
nidification         nidification
does                 does
nests                nests
]\Iay                May
;                    ;
two                  two
being                being
eggs                 eggs
Jul                  July
'                    '
Both                 Both
descending           descending
threadin

buffish              buffish
deeper               deeper
sides                sides
spotted              spotted
breast               breast
spotted              spotted
flanks               flanks
slightly             slightly
streaked             streaked
;                    ;
bill                 bill
mandible             mandible
paler                paler
feet                 feet
fleshy               fleshy
horn                 horn
brown                brown
iris                 iris
hazel                hazel
.                    .
The                  The
crest                crest
wing                 wing
The                  The
rufescent            rufescent
blackish             blackish
sub                  sub
-                    -
tips                 tips
tlie                 the
After                After
moult                moult
Lark                 Lark
becomes              becomes
centres              centres
less                 less
Col.                 Col
I

spiders              spiders
it                   it
devours              devours
small                small
mollusca             mollusca
Crustacea            Crustacea
cast                 cast
Being                Being
both                 both
tame                 tame
beautiful            beautiful
Lark                 Lark
caged                caged
and                  and
frequentl}^          frequently
bird                 bird
Herr                 Herr
Gatke                
observes             observes
The                  The
nevertheless         nevertheless
agreeably            agreeably
Lark                 Lark
its                  its
peevish              peevish
captivit}'           captivity
,                    ,
and                  and
impetuously          impetuously
fluttering           fluttering
this                 this
prettily             prettily
marked               marked
old                  old
cage                 cage
My                   My
flies  

## Training

In [6]:
# Time logging utils

import os

def remove_log_file(path=TRAIN_TIME_LOG_PATH):
    os.remove(path)

def read_train_time(path=TRAIN_TIME_LOG_PATH):
    try:
        print(open(TRAIN_TIME_LOG_PATH, 'r').read())
        return dict(tuple(l.strip().split('\t')) for l in open(TRAIN_TIME_LOG_PATH, 'r'))
    except FileNotFoundError:
        return dict()

def update_train_time(name, time, path=TRAIN_TIME_LOG_PATH, read_func=read_train_time):
    timetable = read_func(path)
    timetable[name] = time
    with open(TRAIN_TIME_LOG_PATH, 'w') as log:
        log.write('\n'.join('%s\t%s' % (k, v) for k, v in timetable.items()))
    return timetable

In [7]:
import model

MODELS = [ # (name, model, balanced, customized grid)
    ('RF', model.RandomForestModel, False, None)
]

In [10]:
import time
import os

def get_pkl_path(model):
    """ Get the pathname to the serialized model file. """
    return '%s/%s.pkl' % (MODEL_PATH, model[0])

def train(models):
    for model in models:
        name, md, balanced, search_grid = model
        
        # Collect parameters
        kwargs = {'weighted': balanced, 'pkl_path': get_pkl_path(model)}
        if search_grid: kwargs['param_grid'] = search_grid
            
        # Train model
        if not os.path.exists(get_pkl_path(model)):
            t_str = time.time()
            md(train_dataset, **kwargs)
            t_end = time.time()
            update_train_time(name, '%.2f' % (t_end - t_str))
    print('\n'.join('%s'))
    
def test(models):
    for model in models:
        name, md, balanced, search_grid = model
        lm = md(TRAIN_DATASET, pkl_path=get_pkl_path(model))
        lm.predict(TEST_DATASET)
        print('1: {}\n3: {}\n5: {}\n10: {}\nA: {}\n'
            .format(TEST_DATASET.precision_at(1),
            TEST_DATASET.precision_at(1),
            TEST_DATASET.precision_at(3),
            TEST_DATASET.precision_at(5),
            TEST_DATASET.precision_at(10),
            TEST_DATASET.precision_at()))
        
def report(model):
    name, md, balanced, search_grid = model
    lm = md(TRAIN_DATASET, pkl_path=get_pkl_path(model))
    lm.predict(TEST_DATASET)
    for e in TEST_DATASET.errors:
        print('%15s %8d %2d %s' % (e.name, e.position, e.rank, ''))
        for c in e.candidates:
            print('\t%15s %.4f' % (c.name, c.confidence))
        print()

In [118]:
train(MODELS)

%
s


In [119]:
test(MODELS)

1: 0.14233860513352348
3: 0.14233860513352348
5: 0.311122634171636
10: 0.38656987295825773
A: 0.49935182784547577



In [11]:
report(MODELS[0])

    possibility   407374 46 
	    development 0.3234
	    possibilist 0.4011
	            the 0.2699
	      possibili 0.4163
	        country 0.0000
	        history 0.2964
	          <UNK> 0.3239
	           case 0.3103
	           time 0.2615
	           last 0.0270
	             to 0.0029
	              > 0.0029
	    application 0.3401
	          issue 0.3401
	              " 0.0029
	       Republic 0.3821
	           </S> 0.0000
	       question 0.3401
	       possibly 0.0000
	           same 0.0000
	             of 0.0029
	    possibility 0.2836
	  impossibility 0.4103
	             -- 0.0029
	        results 0.3851
	           part 0.1436
	         number 0.3244
	           this 0.0000
	          power 0.3074
	     possibilia 0.4081
	           list 0.4490
	           name 0.0000
	    possibilism 0.3970
	  Impossibility 0.4117
	  unpossibility 0.4117
	     posibilito 0.4696
	   possibilitar 0.4153
	         United 0.0000
	     possibilis 0.4081
	           your 0.3079
	         p

OverflowError: cannot convert float infinity to integer

In [69]:
import time

t1 = time.time()
time.sleep(1)
t2 = time.time()
print(t1)
print(t2)
print(t2 - t1)

1499870413.2445834
1499870414.24525
1.000666618347168


In [29]:
import collections

d = collections.defaultdict()
d['a'] = 'b'
d['c'] = 'd'
print(d)

defaultdict(None, {'a': 'b', 'c': 'd'})


In [33]:
d = {}
d['a'] = 'b'
d['c'] = 'd'
print(d)

'\n'.join('%s\t%s' % (k, v) for k, v in d.items())

{'a': 'b', 'c': 'd'}


'a\tb\nc\td'

In [22]:
tuple('1 2 3'.split())

('1', '2', '3')