# CFT2018 contest - Classification & Fixing

In [None]:
import pandas as pd
import numpy as np

Загрузим данные.

In [None]:
# Directory that holds the contest CSV files.
data_dir = 'data'
train = pd.read_csv(data_dir + '/train.csv')
test = pd.read_csv(data_dir + '/test.csv')
# One shape per line, train first, then the class balance of the target.
print(train.shape, test.shape, sep='\n')
print(train.target.value_counts())
train.head()

In [None]:
test.head()

In [None]:
import re

def layout(fio):
    """Classify the keyboard layout used to type a full name.

    The name is upper-cased and everything except Cyrillic letters, Latin
    letters and spaces is dropped. Returns 'RU' when only Cyrillic letters
    and spaces remain (this includes an empty remainder), 'EN' when only Latin
    letters and spaces remain, and 'RU-EN' when both alphabets are present.
    """
    letters = re.sub('[^А-ЯA-Z Ё]', '', fio.upper())
    # Whatever survives removing one alphabet belongs to the other one.
    if not re.sub('[А-Я Ё]', '', letters):
        return 'RU'
    if not re.sub('[A-Z ]', '', letters):
        return 'EN'
    return 'RU-EN'
    
# Tag every training row with the layout of its full name.
train['fullname_lay'] = train.fullname.apply(layout)
print(train.shape)
print(train['fullname_lay'].value_counts())
train.head()

##### Все ФИО, которые написаны в двух раскладках - полностью некорректны

In [None]:
train[train.fullname_lay=='RU-EN'].target.value_counts()

In [None]:
# Split each layout into a training part and a validation part by id:
# rows with an id above the hard-coded cut-off go to validation.
is_ru = train.fullname_lay == 'RU'
train_ru = train[(train.id <= 1592433) & is_ru].reset_index(drop=True)
valid_ru = train[(train.id > 1592433) & is_ru].reset_index(drop=True)
print(train_ru.shape, valid_ru.shape)

is_en = train.fullname_lay == 'EN'
train_en = train[(train.id <= 1596647) & is_en].reset_index(drop=True)
valid_en = train[(train.id > 1596647) & is_en].reset_index(drop=True)
print(train_en.shape, valid_en.shape)

## Построение классификатора 012 на основе предсказаний nn и страны

In [None]:
print(valid_ru.target.value_counts())
print(valid_en.target.value_counts())

#### RU

In [None]:
import argparse
import datetime

import os
import re

import numpy as np
import pandas as pd

import symspellpy

import torch
import torch.optim as optim
from torchlite.torch.learner import Learner
from torchlite.torch.learner.cores import ClassifierCore
from torchlite.torch.metrics import Metric
from torchlite.torch.train_callbacks import TensorboardVisualizerCallback, ModelSaverCallback, ReduceLROnPlateau
torch.set_num_threads(1)

from nn_correct.loader import FIOLoader
from nn_correct.model import CorrectorModel
from nn_correct.vectorizer import Vectorizer, ru_idx, en_idx

from collections import defaultdict, Counter

from multiprocessing import Pool


class restore_fio_by_nn_ln:
    """Callable that runs a trained ``CorrectorModel`` on one full name.

    An instance is built for one alphabet (``ln_idx``) and one saved model.
    Calling it with a name returns ``(fixed_name, features)``: the name with
    the predicted per-character diff applied, and a 1-D numpy vector holding
    the max-pooled followed by the mean-pooled model outputs.
    """

    def __init__(self, model_path, ln_idx):
        """Build the vectorizer and load the model weights on CPU.

        Args:
            model_path: path of the saved model file.
            ln_idx: alphabet passed to ``Vectorizer``.
        """
        vect = Vectorizer(ln_idx)
        vect_diff = Vectorizer(Vectorizer.make_diff_alphabet(ln_idx))
        # Inverted diff alphabet: predicted index -> diff token.
        ref_diff_alphabet = {value: key for key, value in vect_diff.alphabet.items()}

        model = CorrectorModel(
            embedding_size=vect.length,
            conv_sizes=[600, 300, 300, 300],
            out_size=vect_diff.length,
            dropout=0.1,
            window=5,
            lstm_layers=2,
            lstm_size=300
        )

        ModelSaverCallback.restore_model_from_file(model, model_path, load_with_cpu=True)

        # eval() disables training-only behaviour such as dropout.
        self.model = model.eval()
        self.vect = vect
        self.ref_diff_alphabet = ref_diff_alphabet

    @staticmethod
    def _apply_diff(fullname, target):
        """Apply one diff token per character of ``'#' + fullname + '#'``.

        Token meaning, as handled below: ``''`` keeps the character, ``'--'``
        drops it, a two-character token starting with ``'+'`` inserts its
        second character before the source character, and any other token
        replaces the character. The ``'#'`` sentinels are stripped at the end.
        """
        res = []
        for src, tg in zip('#' + fullname + '#', target):
            if tg == '':
                res.append(src)
            elif tg == '--':
                continue
            elif len(tg) == 2 and tg[0] == '+':
                res.append(tg[1])
                res.append(src)
            else:
                res.append(tg)
        return ''.join(res).strip('#')

    def __call__(self, fio):
        """Return ``(fixed_name, features)`` for a single full name.

        The model output is indexed as (batch, diff class, position): argmax
        is taken over dim 1 and pooling over dim 2, for the only batch item.
        """
        s_batch, batch_lengths = self.vect.vect_batch(['#' + fio + '#'])
        # Pure inference: no autograd graph is needed, so do not build one.
        with torch.no_grad():
            prediction = self.model(torch.from_numpy(s_batch), batch_lengths)

        diff_idxs = torch.argmax(prediction, dim=1)[0].cpu().numpy()

        prediction_maxpool = prediction.max(dim=2)[0][0].cpu().detach().numpy()
        prediction_meanpool = prediction.mean(dim=2)[0].cpu().detach().numpy()
        prediction_mmp = np.hstack((prediction_maxpool, prediction_meanpool))

        diff = [self.ref_diff_alphabet[diff_idx] for diff_idx in diff_idxs]

        return self._apply_diff(fio, diff), prediction_mmp

In [None]:
%%time

# Load the RU corrector and run it over the RU validation names in 5 worker processes.
rfbn_ru = restore_fio_by_nn_ln('path to ru model', ru_idx)
os.environ['OMP_NUM_THREADS'] = '1'

with Pool(5) as pool:
    nn_fix_and_prob = pool.map(rfbn_ru, list(valid_ru['fullname']))

# Each result is (fixed name, pooled model outputs): store them separately.
valid_ru['fullname_fix_nn'] = [fixed for fixed, _ in nn_fix_and_prob]
valid_ru_prob = np.array([pooled for _, pooled in nn_fix_and_prob])


import pickle
from sklearn.linear_model import LogisticRegression
from collections import Counter


# The 40 most frequent countries over both training parts, most frequent first.
countries = [*train_ru.country, *train_en.country]
country_counts = Counter(countries)
# Stable sort by descending count, so ties keep first-seen order.
dict_countries = sorted(country_counts.items(), key=lambda item: -item[1])
countries_pop = [country for country, _ in dict_countries[:40]]

def ohe_country(countries,countries_pop):
    """One-hot encode ``countries`` against the list ``countries_pop``.

    Returns a 0/1 array with one row per entry of ``countries`` and one column
    per entry of ``countries_pop``; countries not in the list get an all-zero row.
    """
    per_popular = []
    for one in countries_pop:
        per_popular.append([1 if country == one else 0 for country in countries])
    # Rows were built per popular country, so transpose to (samples, countries).
    return np.array(per_popular).T


# 0/1/2 classifier for RU names: pooled nn outputs + one-hot country.
cl_ru = LogisticRegression(penalty='l2',C=1.25,random_state=42,n_jobs=20)
cl_ru.fit(np.hstack((valid_ru_prob,ohe_country(valid_ru.country,countries_pop))),valid_ru.target)
# Context manager so the file is flushed and closed right after the dump.
with open(data_dir+'/predict_012_ru.sav', 'wb') as model_file:
    pickle.dump(cl_ru, model_file)

Классификатор 012 на вероятностях из nn со страной

In [None]:
from sklearn.model_selection import cross_val_score

# Use a separate estimator for cross-validation. cross_val_score only fits
# clones, so rebinding `cl_ru` here would leave it unfitted for the later
# `cl_ru.predict(...)` on the test set.
cl_ru_cv = LogisticRegression(penalty='l2',C=1.25,random_state=42,n_jobs=20)
cross_val_score(cl_ru_cv,np.hstack((valid_ru_prob,ohe_country(valid_ru.country,countries_pop))),valid_ru.target,cv=3,scoring='f1_macro').mean()

#### EN

In [None]:
%%time

# Load the EN corrector and run it over the EN validation names in 5 worker processes.
rfbn_en = restore_fio_by_nn_ln('path to en model',en_idx)
os.environ['OMP_NUM_THREADS'] = '1'

with Pool(5) as pool:
    nn_fix_and_prob = pool.map(rfbn_en, list(valid_en.fullname))

# Each result is (fixed name, pooled model outputs).
valid_en['fullname_fix_nn'] = [row[0] for row in nn_fix_and_prob]
valid_en_prob = np.array(tuple(row[1] for row in nn_fix_and_prob))


# 0/1/2 classifier for EN names: pooled nn outputs + one-hot country.
cl_en = LogisticRegression(penalty='l2',C=1.25,random_state=42,n_jobs=20)
cl_en.fit(np.hstack((valid_en_prob,ohe_country(valid_en.country,countries_pop))),valid_en.target)
# Context manager so the file is flushed and closed right after the dump.
with open(data_dir+'/predict_012_en.sav', 'wb') as model_file:
    pickle.dump(cl_en, model_file)

## Построение классификатора nn-dict

In [None]:
valid_ru.head()

In [None]:
train.head()

In [None]:
from collections import Counter
from collections import defaultdict


# List of misspelled name parts: for fixable rows (target == 1) with the same
# number of words as the truth, the words that differ; for target == 2 rows,
# every word.
name_err = []
for row_target, row_name, row_true in zip(train.target, train.fullname, train.fullname_true):
    if row_target == 1:
        words = row_name.split(' ')
        words_true = row_true.split(' ')
        if len(words) == len(words_true):
            name_err.extend(w for w, w_true in zip(words, words_true) if w != w_true)
    elif row_target == 2:
        name_err.extend(row_name.split(' '))


# Frequency dictionaries of train and test.
# Train: words of correct names (target == 0) and of the true names of fixable rows.
correct_fullnames = train.loc[train.target==0,'fullname']
true_fullnames = train.loc[train.target==1,'fullname_true']
dicts_train = [name for person in correct_fullnames for name in person.split(' ')]
dicts_train += [name for person in true_fullnames for name in person.split(' ')]
name_freq_train = Counter(dicts_train)

# Test: only words seen more than once are kept.
dicts_test = [name for person in test.fullname for name in person.split(' ')]
name_freq_test = {name: freq for name, freq in Counter(dicts_test).items() if freq > 1}


# Merge the dictionaries: train counts first, then test counts minus known misspellings.
name_freq = defaultdict(int)

for name, freq in name_freq_train.items():
    name_freq[name] += freq

for name in name_err:
    name_freq_test.pop(name, None)

for name, freq in name_freq_test.items():
    name_freq[name] += freq

In [None]:
import symspellpy
symspell = symspellpy.SymSpell()

# Dump the merged frequencies as "<name> <count>" lines and load them into SymSpell.
# NOTE(review): the file is written with the platform default encoding — confirm it
# matches what load_dictionary reads for Cyrillic names.
with open(data_dir+'/dictionary.txt', 'w') as f:
    f.writelines('{} {}\n'.format(name, freq) for name, freq in name_freq.items())

symspell.load_dictionary(data_dir+'/dictionary.txt', term_index=0, count_index=1)

In [None]:
def correct(s):
    """Spell-correct every space-separated word of ``s`` with SymSpell.

    Each word is replaced by the upper-cased term of the first CLOSEST
    suggestion; a word with no suggestions is kept unchanged.
    """
    def correct_word(w):
        suggestions = symspell.lookup(w, symspellpy.Verbosity.CLOSEST)
        if not len(suggestions):
            return w
        return suggestions[0].term.upper()

    return ' '.join(map(correct_word, s.split(' ')))

#### RU

In [None]:
# Dictionary-based fix, computed only for fixable rows (target == 1).
valid_ru['fullname_fix_dict'] = None

fixable_ru = valid_ru.target == 1
valid_ru.loc[fixable_ru, 'fullname_fix_dict'] = valid_ru.loc[fixable_ru, 'fullname'].apply(correct)
valid_ru.head()

In [None]:
# Share of fixable RU rows (target == 1) restored exactly by the nn, by the
# dictionary, and by both.
fixable_ru = valid_ru.target == 1
nn_hit = valid_ru.fullname_true == valid_ru.fullname_fix_nn
dict_hit = valid_ru.fullname_true == valid_ru.fullname_fix_dict

acc_nn = (nn_hit & fixable_ru).sum() / fixable_ru.sum()
acc_dict = (dict_hit & fixable_ru).sum() / fixable_ru.sum()
acc_nn_and_dict = (nn_hit & dict_hit & fixable_ru).sum() / fixable_ru.sum()

print('nn', acc_nn)
print('dict', acc_dict)
print('nn and dict', acc_nn_and_dict)
# Inclusion-exclusion: rows fixed by at least one of the two methods.
print('nn or dict', acc_nn + acc_dict - acc_nn_and_dict)

In [None]:
def freq_feat(fullname, fullname_fix_nn, fullname_fix_dict):
    """Frequency features comparing a name with its nn and dictionary fixes.

    All three arguments are whitespace-split into words, and word frequencies
    come from the global ``name_freq`` mapping (0 for unknown words).

    When the three names have the same number of words they are compared
    position by position: a word that differs from a fix goes into the
    "changed" sets (original word and replacement), a word that equals a fix
    goes into the "match" set. Otherwise every word of every name goes both
    into its own "changed" set and into the "match" set.

    Returns:
        Six floats: mean frequency of changed original words, of nn
        replacements and of dictionary replacements, then max, min and mean
        frequency of the match set. An empty set contributes 0.0.
    """
    fullname = fullname.split()
    fullname_fix_nn = fullname_fix_nn.split()
    fullname_fix_dict = fullname_fix_dict.split()

    diff_fullname = set()
    diff_fullname_fix_nn = set()
    diff_fullname_fix_dict = set()
    match = set()

    if len(fullname) == len(fullname_fix_nn) == len(fullname_fix_dict):
        for word, word_nn, word_dict in zip(fullname, fullname_fix_nn, fullname_fix_dict):
            if word != word_nn:
                diff_fullname.add(word)
                diff_fullname_fix_nn.add(word_nn)
            else:
                match.add(word)

            if word != word_dict:
                diff_fullname.add(word)
                diff_fullname_fix_dict.add(word_dict)
            else:
                match.add(word)
    else:
        # No positional alignment is possible: treat every word as both changed and matched.
        diff_fullname.update(fullname)
        diff_fullname_fix_nn.update(fullname_fix_nn)
        diff_fullname_fix_dict.update(fullname_fix_dict)
        match.update(fullname, fullname_fix_nn, fullname_fix_dict)

    def _freqs(names):
        # .get() rather than indexing: name_freq is a defaultdict, and indexing
        # would insert a zero entry for every unseen name.
        return [name_freq.get(name, 0) for name in names]

    def _agg(func, values):
        # Empty sets fall back to 0.0.
        return float(func(values)) if len(values) > 0 else 0.0

    freq_match = _freqs(match)

    return (
        _agg(np.mean, _freqs(diff_fullname)),
        _agg(np.mean, _freqs(diff_fullname_fix_nn)),
        _agg(np.mean, _freqs(diff_fullname_fix_dict)),
        _agg(np.max, freq_match),
        _agg(np.min, freq_match),
        _agg(np.mean, freq_match),
    )

In [None]:
from multiprocessing import Pool

In [None]:
%%time

# Fixable RU validation rows only, re-indexed from 0.
valid_ru_1 = valid_ru[valid_ru.target==1].reset_index(drop=True)

# Feed (original, nn fix, dictionary fix) triples straight to freq_feat;
# starmap unpacks each triple, so no per-frame wrapper function is needed.
with Pool(5) as pool:
    freq_feat_ru = pool.starmap(
        freq_feat,
        zip(valid_ru_1.fullname, valid_ru_1.fullname_fix_nn, valid_ru_1.fullname_fix_dict),
    )

freq_feat_ru = np.array(freq_feat_ru)

In [None]:
# Label which method restored the true name:
# 0 = both, 1 = nn only, 2 = dictionary only, 3 = neither.
nn_ok = valid_ru_1.fullname_true == valid_ru_1.fullname_fix_nn
dict_ok = valid_ru_1.fullname_true == valid_ru_1.fullname_fix_dict
nn_bad = valid_ru_1.fullname_true != valid_ru_1.fullname_fix_nn
dict_bad = valid_ru_1.fullname_true != valid_ru_1.fullname_fix_dict

valid_ru_1['matching'] = None
for label, rows in enumerate((nn_ok & dict_ok, nn_ok & dict_bad, nn_bad & dict_ok, nn_bad & dict_bad)):
    valid_ru_1.loc[rows, 'matching'] = label

valid_ru_1.head()

In [None]:
valid_ru_1.matching.value_counts()

In [None]:
from xgboost import XGBClassifier

# Classifier predicting the `matching` label from the frequency features.
cl_nndict_ru = XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=800, n_jobs=20,random_state=42)
cl_nndict_ru.fit(freq_feat_ru, valid_ru_1.matching)
# Context manager so the file is flushed and closed right after the dump.
with open(data_dir+'/predict_nn_dict_ru.sav', 'wb') as model_file:
    pickle.dump(cl_nndict_ru, model_file)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=4, min_child_weight=1, missing=None, n_estimators=800,
        n_jobs=20, nthread=None, objective='binary:logistic',
        random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=None, silent=True, subsample=1), 0.9409484074334511)

#### EN

In [None]:
valid_en.head()

In [None]:
# Dictionary-based fix, computed only for fixable rows (target == 1).
valid_en['fullname_fix_dict'] = None
fixable_en = valid_en.target == 1
valid_en.loc[fixable_en, 'fullname_fix_dict'] = valid_en.loc[fixable_en, 'fullname'].apply(correct)


# Share of fixable EN rows restored exactly by the nn, by the dictionary, and by both.
nn_hit = valid_en.fullname_true == valid_en.fullname_fix_nn
dict_hit = valid_en.fullname_true == valid_en.fullname_fix_dict
acc_nn = (nn_hit & fixable_en).sum() / fixable_en.sum()
acc_dict = (dict_hit & fixable_en).sum() / fixable_en.sum()
acc_nn_and_dict = (nn_hit & dict_hit & fixable_en).sum() / fixable_en.sum()
print('nn', acc_nn)
print('dict', acc_dict)
print('nn and dict', acc_nn_and_dict)
print('nn or dict', acc_nn + acc_dict - acc_nn_and_dict)


# Frequency features for the fixable EN rows; starmap unpacks each
# (original, nn fix, dictionary fix) triple, so no wrapper function is needed.
valid_en_1 = valid_en[valid_en.target==1].reset_index(drop=True)
with Pool(5) as pool:
    freq_feat_en = pool.starmap(
        freq_feat,
        zip(valid_en_1.fullname, valid_en_1.fullname_fix_nn, valid_en_1.fullname_fix_dict),
    )
freq_feat_en = np.array(freq_feat_en)


# Label which method restored the true name:
# 0 = both, 1 = nn only, 2 = dictionary only, 3 = neither.
nn_ok = valid_en_1.fullname_true == valid_en_1.fullname_fix_nn
dict_ok = valid_en_1.fullname_true == valid_en_1.fullname_fix_dict
nn_bad = valid_en_1.fullname_true != valid_en_1.fullname_fix_nn
dict_bad = valid_en_1.fullname_true != valid_en_1.fullname_fix_dict
valid_en_1['matching'] = None
valid_en_1.loc[nn_ok & dict_ok, 'matching'] = 0
valid_en_1.loc[nn_ok & dict_bad, 'matching'] = 1
valid_en_1.loc[nn_bad & dict_ok, 'matching'] = 2
valid_en_1.loc[nn_bad & dict_bad, 'matching'] = 3
print(valid_en_1.matching.value_counts())


# Cross-validate, then fit on all fixable EN rows and save the model.
cl_nndict_en = XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=800, n_jobs=20,random_state=42)
print(cross_val_score(cl_nndict_en,freq_feat_en, valid_en_1.matching,scoring='accuracy',cv=3).mean())
cl_nndict_en.fit(freq_feat_en, valid_en_1.matching)
# Context manager so the file is flushed and closed right after the dump.
with open(data_dir+'/predict_nn_dict_en.sav', 'wb') as model_file:
    pickle.dump(cl_nndict_en, model_file)

## Применяем классификаторы 012 для теста

In [None]:
print(test.shape)
test.head()

In [None]:
# Tag every test row with the layout of its full name.
test['fullname_lay'] = test.fullname.apply(layout)
print(test['fullname_lay'].value_counts())
print(test.shape)
test.head()

Там, где ФИО написаны в двух раскладках, они считаются полностью некорректными (target = 2).

In [None]:
%%time
# Mixed-layout names are labelled 2 directly; RU and EN names go through
# their own corrector network and 0/1/2 classifier.
test['target'] = None
test.loc[test['fullname_lay']=='RU-EN','target'] = 2
test['fullname_fix_nn'] = None

ru_rows = test['fullname_lay']=='RU'
en_rows = test['fullname_lay']=='EN'


# --- RU ---
os.environ['OMP_NUM_THREADS'] = '1'
with Pool(5) as pool:
    nn_fix_and_prob = pool.map(rfbn_ru, list(test.loc[ru_rows,'fullname']))

test.loc[ru_rows,'fullname_fix_nn'] = [fixed for fixed, _ in nn_fix_and_prob]
test_ru_prob = np.array([pooled for _, pooled in nn_fix_and_prob])

ru_country_ohe = ohe_country(list(test.loc[ru_rows,'country']),countries_pop)
test.loc[ru_rows,'target'] = cl_ru.predict(np.hstack((test_ru_prob,ru_country_ohe)))


# --- EN ---
os.environ['OMP_NUM_THREADS'] = '1'
with Pool(5) as pool:
    nn_fix_and_prob = pool.map(rfbn_en, list(test.loc[en_rows,'fullname']))

test.loc[en_rows,'fullname_fix_nn'] = [fixed for fixed, _ in nn_fix_and_prob]
test_en_prob = np.array([pooled for _, pooled in nn_fix_and_prob])

en_country_ohe = ohe_country(list(test.loc[en_rows,'country']),countries_pop)
test.loc[en_rows,'target'] = cl_en.predict(np.hstack((test_en_prob,en_country_ohe)))


print(test.shape)
print(test.target.value_counts())

In [None]:
test.head()

In [None]:
# Consistency check between the nn fix and the predicted class:
# rows predicted fixable although the nn changed nothing, and rows predicted
# correct although the nn changed something.
nn_unchanged = test.fullname == test.fullname_fix_nn
nn_changed = test.fullname != test.fullname_fix_nn
print((nn_unchanged & (test.target == 1)).sum())
print((nn_changed & (test.target == 0)).sum())

## Применяем классификаторы nn-dict для теста

In [None]:
test.head()

In [None]:
test_bck = test.copy()

In [None]:
# Dictionary-based fix for every test row predicted as fixable (target == 1).
fixable = test.target == 1
test['fullname_fix_dict'] = None
with Pool(5) as pool:
    test_fix_dict = pool.map(correct,list(test.loc[fixable,'fullname']))
test.loc[fixable,'fullname_fix_dict'] = test_fix_dict
print(1)


# Frequency features for fixable RU rows; starmap unpacks each
# (original, nn fix, dictionary fix) triple, so no wrapper function is needed.
ru_fixable = fixable & (test.fullname_lay=='RU')
test_ru_1 = test[ru_fixable].reset_index(drop=True)
with Pool(5) as pool:
    freq_feat_test_ru = pool.starmap(
        freq_feat,
        zip(test_ru_1.fullname, test_ru_1.fullname_fix_nn, test_ru_1.fullname_fix_dict),
    )
freq_feat_test_ru = np.array(freq_feat_test_ru)
print(2)


# Predicted `matching` label decides later which fix is used.
test['matching'] = None
test.loc[ru_fixable,'matching'] = cl_nndict_ru.predict(freq_feat_test_ru)
print(test.loc[ru_fixable,'matching'].value_counts())
print(3)


# Same steps for fixable EN rows.
en_fixable = fixable & (test.fullname_lay=='EN')
test_en_1 = test[en_fixable].reset_index(drop=True)
with Pool(5) as pool:
    freq_feat_test_en = pool.starmap(
        freq_feat,
        zip(test_en_1.fullname, test_en_1.fullname_fix_nn, test_en_1.fullname_fix_dict),
    )
freq_feat_test_en = np.array(freq_feat_test_en)
print(4)


test.loc[en_fixable,'matching'] = cl_nndict_en.predict(freq_feat_test_en)
print(test.loc[en_fixable,'matching'].value_counts())
print(5)

In [None]:
test.head()

In [None]:
test[test.target==1]['matching'].value_counts()

In [None]:
# Final answer for fixable rows: the dictionary fix when the nn-dict classifier
# predicted label 2, the nn fix otherwise. Other rows keep fullname_true = None.
test['fullname_true'] = None
fixable = test.target == 1
take_nn = fixable & (test.matching != 2)
take_dict = fixable & (test.matching == 2)
test.loc[take_nn, 'fullname_true'] = test.loc[take_nn, 'fullname_fix_nn']
test.loc[take_dict, 'fullname_true'] = test.loc[take_dict, 'fullname_fix_dict']

submission = test[['id','target','fullname_true']]
submission.to_csv(data_dir+'/sub_nn_wcountry_wdict_target_fn.csv', index=None)