# CFT2018 contest - Preparing Data

In [None]:
import pandas as pd
import numpy as np

#### Загрузим данные

In [None]:
data_dir = 'data'

In [None]:
train = pd.read_csv(data_dir+'/train.csv')
test = pd.read_csv(data_dir+'/test.csv')

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train['target'].value_counts()

#### RU & EN

In [None]:
# Keep only the rows whose target is 0 or 1 and renumber them from zero.
train_01 = train[train.target.isin([0, 1])].reset_index(drop=True)
print(train_01.shape)
train_01.head()

#### Проверим, что нет косяков с раскладкой

In [None]:
import re

def layout(fio):
    """Tell which alphabet the letters of a name come from.

    The name is upper-cased and every character that is not a Cyrillic
    letter (А-Я, Ё), a Latin letter (A-Z) or a space is dropped before the
    check, so digits and punctuation never influence the answer.

    Returns:
        'RU' when nothing but Cyrillic letters/spaces is left (this includes
        the case where nothing is left at all), 'EN' when only Latin
        letters/spaces are left, and 'RU-EN' when both alphabets are present.
    """
    letters_only = re.sub('[^А-ЯA-Z Ё]', '', fio.upper())
    if not re.sub('[А-Я Ё]', '', letters_only):
        return 'RU'
    if not re.sub('[A-Z ]', '', letters_only):
        return 'EN'
    return 'RU-EN'

In [None]:
# NOTE: this is an alias, not a copy - the two columns added below also
# appear on train_01.
train_01_lay = train_01
train_01_lay['fullname_lay'] = train_01_lay['fullname'].apply(layout)
# Missing corrected names stay NaN; na_action='ignore' keeps layout() from
# being called on them.  Building the column in one step avoids first creating
# a float NaN column and then writing strings into it, which recent pandas
# versions flag as an incompatible-dtype assignment.
train_01_lay['fullname_true_lay'] = train_01_lay['fullname_true'].map(layout, na_action='ignore')
print(train_01_lay.shape)
train_01_lay.head()

In [None]:
print(train_01_lay.fullname_lay.value_counts())
print(train_01_lay.fullname_true_lay.value_counts())

In [None]:
train_01_lay[np.logical_and(train_01_lay.fullname_lay=='RU',train_01_lay.fullname_true_lay=='RU-EN')]

#### RU

In [None]:
# Rows whose entered name is Cyrillic-only and whose corrected name is either
# Cyrillic-only or absent.
ru_name_mask = train_01_lay.fullname_lay == 'RU'
ru_true_mask = (train_01_lay.fullname_true_lay == 'RU') | pd.isnull(train_01_lay.fullname_true_lay)
train_01_ru = train_01_lay[ru_name_mask & ru_true_mask]

# First 80% of the rows (in their current order) for training, the rest for validation.
ru_split_at = int(0.8*len(train_01_ru))
train_01_ru_train = train_01_ru[:ru_split_at]
train_01_ru_valid = train_01_ru[ru_split_at:]

print(train_01_ru_train.shape)
train_01_ru_train.head()

In [None]:
import Levenshtein
from collections import Counter, defaultdict

def encode(fullname, fullname_true):
    """Turn the edits from ``fullname`` to ``fullname_true`` into per-character tags.

    Both strings are wrapped in '#' before the edit operations are computed.
    The result has one tag per character of the wrapped ``fullname``:

    * ''        - no edit recorded at this position
    * '--'      - delete this character
    * '<c>'     - replace this character with ``<c>``
    * '+<c>'    - insert ``<c>`` in front of this character

    Only one tag is kept per position: when several operations address the
    same source index, the one processed last overwrites the earlier ones.

    NOTE(review): editops items are unpacked below as (op, src, dst), so the
    sort key orders by operation name and then by negated source index (all
    reversed) - not by source/destination position.  Confirm this is intended.
    """
    source = '#' + fullname + '#'
    wanted = '#' + fullname_true + '#'
    tags = [''] * len(source)
    ops = sorted(
        Levenshtein.editops(source, wanted),
        key=lambda item: (item[0], -item[1]),
        reverse=True,
    )
    for kind, src_pos, dst_pos in ops:
        if kind == 'delete':
            tags[src_pos] = '--'
        elif kind == 'replace':
            tags[src_pos] = wanted[dst_pos]
        elif kind == 'insert':
            tags[src_pos] = '+' + wanted[dst_pos]
    return tags

def restore(fullname, target):
    """Apply per-character tags to ``fullname`` and return the edited string.

    ``fullname`` is wrapped in '#' and walked in lockstep with ``target``
    (the shorter of the two decides where the walk stops).  For each pair:

    * ''                      - keep the character
    * '--'                    - drop the character
    * two chars starting '+'  - emit the second tag char, then the character
    * anything else           - emit the tag instead of the character

    Leading and trailing '#' are stripped from the result.
    """
    padded = '#' + fullname + '#'
    pieces = []
    for char, tag in zip(padded, target):
        if tag == '--':
            continue
        if tag == '':
            pieces.append(char)
        elif len(tag) == 2 and tag[0] == '+':
            pieces.extend((tag[1], char))
        else:
            pieces.append(tag)
    return ''.join(pieces).strip('#')

def errors(fullname, fullname_true):
    """List the character-level mistakes that turn ``fullname_true`` into ``fullname``.

    Both strings are wrapped in '#' and the edit operations are computed from
    the correct string to the observed one.  Every operation becomes one
    string of the form ``<correct char>'>'<what was typed instead>``:

    * delete  -> ``c>--``
    * replace -> ``c>x``
    * insert  -> ``c>xc``  (``x`` typed in front of ``c``)

    The list follows the order produced by sorting the operations with the
    key (operation name, -source index), reversed.
    """
    observed = '#' + fullname + '#'
    correct = '#' + fullname_true + '#'
    ops = sorted(
        Levenshtein.editops(correct, observed),
        key=lambda item: (item[0], -item[1]),
        reverse=True,
    )
    found = []
    for kind, src_pos, dst_pos in ops:
        if kind == 'delete':
            found.append(correct[src_pos] + '>--')
        elif kind == 'replace':
            found.append(correct[src_pos] + '>' + observed[dst_pos])
        elif kind == 'insert':
            found.append(correct[src_pos] + '>' + observed[dst_pos] + correct[src_pos])
    return found

In [None]:
# Rows of the RU training part that carry a correction (target == 1).
train_01_ru_error = train_01_ru_train[train_01_ru_train.target==1].reset_index(drop=True)

# dict_err[correct_char][typed_instead] -> number of times this mistake was seen.
dict_err = dict()

for fullname, fullname_true in zip(train_01_ru_error.fullname, train_01_ru_error.fullname_true):
    for err in errors(fullname, fullname_true):
        # errors() emits '<one char>' + '>' + '<replacement>'.  Slice by position
        # instead of split('>') so a literal '>' inside a name cannot break the
        # two-way unpacking.
        fr, to = err[0], err[2:]
        dict_err.setdefault(fr, defaultdict(int))[to] += 1

# Normalise the counts of every source character into relative frequencies.
dict_sum_freq = {fr:sum(dict_err[fr].values()) for fr in dict_err}
dict_err_freq = {fr:{to:dict_err[fr][to]/dict_sum_freq[fr] for to in dict_err[fr]} for fr in dict_err}

In [None]:
import random
random.seed(42)

def make_error(fullname_true, dict_err_freq=dict_err_freq):
    """Inject one random, frequency-weighted typo into a correct name.

    Args:
        fullname_true: the correct name.
        dict_err_freq: mapping ``char -> {replacement: weight}``.  The default
            is the table that exists at the moment this function is defined.

    Returns:
        The name with one character replaced by a sampled variant ('--' means
        the character is removed).  Every '#' is stripped from the result.
        If no character of the name (nor the trailing '#' sentinel) has any
        known variant, the name is returned without a typo instead of raising
        KeyError; callers may want to drop such rows.
    """
    # The trailing '#' lets mistakes recorded against the end-of-name sentinel apply.
    padded = fullname_true + '#'
    # Only positions whose character has at least one recorded variant can be corrupted.
    candidates = [pos for pos, char in enumerate(padded) if dict_err_freq.get(char)]
    if not candidates:
        return re.sub('#','',padded)
    # Same RNG call as picking any position directly when every position qualifies.
    err_place = candidates[random.randint(0,len(candidates)-1)]
    err_variation = dict_err_freq[padded[err_place]]
    err = random.choices(list(err_variation.keys()), list(err_variation.values()))[0]
    if err == '--':
        fullname_error = padded[:err_place] + padded[err_place+1:]
    else:
        fullname_error = padded[:err_place] + err + padded[err_place+1:]

    return re.sub('#','',fullname_error)

In [None]:
# Correct RU names: entered names of target == 0 rows followed by the
# corrected names of target == 1 rows.
ru_names_as_entered = train_01_ru_train.loc[train_01_ru_train.target==0,'fullname'].tolist()
ru_names_corrected = train_01_ru_train.loc[train_01_ru_train.target==1,'fullname_true'].tolist()
fullname_true_ru = ru_names_as_entered + ru_names_corrected

# One synthetic row per correct name: a generated typo, target 1, no country.
train_01_ru_augment = pd.DataFrame({'fullname_true':fullname_true_ru})
train_01_ru_augment['fullname'] = train_01_ru_augment['fullname_true'].apply(make_error)
train_01_ru_augment['country'] = np.nan
train_01_ru_augment['target'] = 1
train_01_ru_augment = train_01_ru_augment[['fullname','country','target','fullname_true']]
print(train_01_ru_augment.shape)
train_01_ru_augment.head()

In [None]:
# Stack the real RU training rows and the synthetic ones.  pd.concat is used
# because DataFrame.append is no longer available in pandas 2.x.
train_01_ru_full = pd.concat(
    [train_01_ru_train[['fullname','country','target','fullname_true']], train_01_ru_augment]
)
# Shuffle with a fixed seed: random.seed() does not affect DataFrame.sample,
# so without random_state the row order would differ between runs.
train_01_ru_full = train_01_ru_full.sample(frac=1, random_state=42).reset_index(drop=True)
# Fresh 1-based ids for the combined set.
train_01_ru_full['id'] = train_01_ru_full.index + 1
train_01_ru_full = train_01_ru_full[['id','fullname','country','target','fullname_true']]
print(train_01_ru_full.shape)
print(train_01_ru_full.target.value_counts())
train_01_ru_full.head()

In [None]:
# Write the augmented RU training set and the RU validation set without the
# row index (index=False is the documented boolean form).
train_01_ru_full.to_csv(data_dir+'/train_ru_aug.csv', index=False)
train_01_ru_valid[['id','fullname','country','target','fullname_true']].to_csv(data_dir+'/valid_ru.csv', index=False)

#### EN

In [None]:
# Rows whose entered name is Latin-only and whose corrected name is either
# Latin-only or absent.
en_name_mask = train_01_lay.fullname_lay == 'EN'
en_true_mask = (train_01_lay.fullname_true_lay == 'EN') | pd.isnull(train_01_lay.fullname_true_lay)
train_01_en = train_01_lay[en_name_mask & en_true_mask].reset_index(drop=True)
# Drop the helper layout columns.
train_01_en = train_01_en[['id','fullname','country','target','fullname_true']]

print(train_01_en.shape)
train_01_en.head()

In [None]:
train_01_en.target.value_counts()

In [None]:
# First 80% of the rows (in their current order) for training, the rest for validation.
en_split_at = int(0.8*len(train_01_en))
train_01_en_train = train_01_en[:en_split_at].reset_index(drop=True)
train_01_en_valid = train_01_en[en_split_at:].reset_index(drop=True)
print(train_01_en_train.shape,train_01_en_valid.shape)

In [None]:
import Levenshtein
from collections import Counter, defaultdict

def encode(fullname, fullname_true):
    """Turn the edits from ``fullname`` to ``fullname_true`` into per-character tags.

    Both strings are wrapped in '#' before the edit operations are computed.
    The result has one tag per character of the wrapped ``fullname``:

    * ''        - no edit recorded at this position
    * '--'      - delete this character
    * '<c>'     - replace this character with ``<c>``
    * '+<c>'    - insert ``<c>`` in front of this character

    Only one tag is kept per position: when several operations address the
    same source index, the one processed last overwrites the earlier ones.

    NOTE(review): editops items are unpacked below as (op, src, dst), so the
    sort key orders by operation name and then by negated source index (all
    reversed) - not by source/destination position.  Confirm this is intended.
    """
    source = '#' + fullname + '#'
    wanted = '#' + fullname_true + '#'
    tags = [''] * len(source)
    ops = sorted(
        Levenshtein.editops(source, wanted),
        key=lambda item: (item[0], -item[1]),
        reverse=True,
    )
    for kind, src_pos, dst_pos in ops:
        if kind == 'delete':
            tags[src_pos] = '--'
        elif kind == 'replace':
            tags[src_pos] = wanted[dst_pos]
        elif kind == 'insert':
            tags[src_pos] = '+' + wanted[dst_pos]
    return tags

def restore(fullname, target):
    """Apply per-character tags to ``fullname`` and return the edited string.

    ``fullname`` is wrapped in '#' and walked in lockstep with ``target``
    (the shorter of the two decides where the walk stops).  For each pair:

    * ''                      - keep the character
    * '--'                    - drop the character
    * two chars starting '+'  - emit the second tag char, then the character
    * anything else           - emit the tag instead of the character

    Leading and trailing '#' are stripped from the result.
    """
    padded = '#' + fullname + '#'
    pieces = []
    for char, tag in zip(padded, target):
        if tag == '--':
            continue
        if tag == '':
            pieces.append(char)
        elif len(tag) == 2 and tag[0] == '+':
            pieces.extend((tag[1], char))
        else:
            pieces.append(tag)
    return ''.join(pieces).strip('#')

def errors(fullname, fullname_true):
    """List the character-level mistakes that turn ``fullname_true`` into ``fullname``.

    Both strings are wrapped in '#' and the edit operations are computed from
    the correct string to the observed one.  Every operation becomes one
    string of the form ``<correct char>'>'<what was typed instead>``:

    * delete  -> ``c>--``
    * replace -> ``c>x``
    * insert  -> ``c>xc``  (``x`` typed in front of ``c``)

    The list follows the order produced by sorting the operations with the
    key (operation name, -source index), reversed.
    """
    observed = '#' + fullname + '#'
    correct = '#' + fullname_true + '#'
    ops = sorted(
        Levenshtein.editops(correct, observed),
        key=lambda item: (item[0], -item[1]),
        reverse=True,
    )
    found = []
    for kind, src_pos, dst_pos in ops:
        if kind == 'delete':
            found.append(correct[src_pos] + '>--')
        elif kind == 'replace':
            found.append(correct[src_pos] + '>' + observed[dst_pos])
        elif kind == 'insert':
            found.append(correct[src_pos] + '>' + observed[dst_pos] + correct[src_pos])
    return found

In [None]:
# Rows of the EN training part that carry a correction (target == 1).
train_01_en_error = train_01_en_train[train_01_en_train.target==1].reset_index(drop=True)

# dict_err[correct_char][typed_instead] -> number of times this mistake was seen.
dict_err = dict()

for fullname, fullname_true in zip(train_01_en_error.fullname, train_01_en_error.fullname_true):
    for err in errors(fullname, fullname_true):
        # errors() emits '<one char>' + '>' + '<replacement>'.  Slice by position
        # instead of split('>') so a literal '>' inside a name cannot break the
        # two-way unpacking.
        fr, to = err[0], err[2:]
        dict_err.setdefault(fr, defaultdict(int))[to] += 1

# Normalise the counts of every source character into relative frequencies.
dict_sum_freq = {fr:sum(dict_err[fr].values()) for fr in dict_err}
dict_err_freq = {fr:{to:dict_err[fr][to]/dict_sum_freq[fr] for to in dict_err[fr]} for fr in dict_err}

import random
random.seed(42)

def make_error(fullname_true, dict_err_freq=dict_err_freq):
    """Inject one random, frequency-weighted typo into a correct name.

    Args:
        fullname_true: the correct name.
        dict_err_freq: mapping ``char -> {replacement: weight}``.  The default
            is the table that exists at the moment this function is defined.

    Returns:
        The name with one character replaced by a sampled variant ('--' means
        the character is removed).  Every '#' is stripped from the result.
        If no character of the name (nor the trailing '#' sentinel) has any
        known variant, the name is returned without a typo instead of raising
        KeyError; callers may want to drop such rows.
    """
    # The trailing '#' lets mistakes recorded against the end-of-name sentinel apply.
    padded = fullname_true + '#'
    # Only positions whose character has at least one recorded variant can be corrupted.
    candidates = [pos for pos, char in enumerate(padded) if dict_err_freq.get(char)]
    if not candidates:
        return re.sub('#','',padded)
    # Same RNG call as picking any position directly when every position qualifies.
    err_place = candidates[random.randint(0,len(candidates)-1)]
    err_variation = dict_err_freq[padded[err_place]]
    err = random.choices(list(err_variation.keys()), list(err_variation.values()))[0]
    if err == '--':
        fullname_error = padded[:err_place] + padded[err_place+1:]
    else:
        fullname_error = padded[:err_place] + err + padded[err_place+1:]

    return re.sub('#','',fullname_error)

# Correct EN names: entered names of target == 0 rows followed by the
# corrected names of target == 1 rows.
en_names_as_entered = train_01_en_train.loc[train_01_en_train.target==0,'fullname'].tolist()
en_names_corrected = train_01_en_train.loc[train_01_en_train.target==1,'fullname_true'].tolist()
fullname_true_en = en_names_as_entered + en_names_corrected

# One synthetic row per correct name: a generated typo, target 1, no country,
# and a 1-based id taken from the row position.
train_01_en_augment = pd.DataFrame({'fullname_true':fullname_true_en})
train_01_en_augment['fullname'] = train_01_en_augment['fullname_true'].apply(make_error)
train_01_en_augment['country'] = np.nan
train_01_en_augment['target'] = 1
train_01_en_augment['id'] = train_01_en_augment.index + 1
train_01_en_augment = train_01_en_augment[['id','fullname','country','target','fullname_true']]
print(train_01_en_augment.shape)
train_01_en_augment.head()

In [None]:
import transliterate

def translit(ru):
    """Transliterate a Russian string with ``transliterate`` (reversed 'ru' pack),
    upper-case the result and remove every apostrophe from it."""
    latin = transliterate.translit(ru, 'ru', reversed=True)
    return re.sub("'", '', latin.upper())


# Transliterated copy of all RU rows: both the entered name and, where
# present, the corrected name are converted.
train_01_en_translit = train_01_ru.copy()
train_01_en_translit['fullname'] = train_01_en_translit['fullname'].apply(translit)
has_true_name = ~pd.isnull(train_01_en_translit['fullname_true'])
train_01_en_translit.loc[has_true_name,'fullname_true'] = train_01_en_translit.loc[has_true_name,'fullname_true'].apply(translit)
print(train_01_en_translit.shape)
train_01_en_translit.head(20)

In [None]:
# Stack the real EN training rows, the synthetic typos and the transliterated
# RU rows.  pd.concat is used because DataFrame.append is no longer available
# in pandas 2.x.
train_01_en_full = pd.concat([
    train_01_en_train,
    train_01_en_augment,
    train_01_en_translit[['id','fullname','country','target','fullname_true']],
])
# Shuffle with a fixed seed: random.seed() does not affect DataFrame.sample,
# so without random_state the row order would differ between runs.
train_01_en_full = train_01_en_full.sample(frac=1, random_state=42).reset_index(drop=True)
# Fresh 1-based ids for the combined set.
train_01_en_full['id'] = train_01_en_full.index + 1
print(train_01_en_full.shape)
print(train_01_en_full.target.value_counts())
train_01_en_full.head()

In [None]:
# Write the augmented EN training set and the EN validation set without the row index.
train_01_en_full.to_csv(data_dir+'/train_en_aug.csv', index=False)
train_01_en_valid.to_csv(data_dir+'/valid_en.csv', index=False)