In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm import tqdm
tqdm.pandas()
import re
import numpy as np
from unidecode import unidecode
from itertools import permutations

from pyphonetics import RefinedSoundex, FuzzySoundex
from phonetics import dmetaphone, nysiis
from textdistance import Levenshtein, DamerauLevenshtein as DL, JaroWinkler as JW, SmithWaterman as SW, NeedlemanWunsch as NW
from textdistance import Jaccard, Sorensen, Overlap, Bag
from textdistance import Editex
from textdistance import LCSSeq, LCSStr
from textdistance import BZ2NCD, ZLIBNCD

from sklearn.metrics import average_precision_score

In [2]:
# Load inputs.
# titles.txt: whitespace-separated tokens that preprocessing removes from names.
with open('titles.txt') as f:
    titles = f.read().split()

# Matching / mismatching name pairs for the train and validation splits.
# The mismatch files are read with their first column as the index.
df_train_match = pd.read_csv(
    'data/train_names_matches.csv', index_col=None)
df_train_mismatch = pd.read_csv(
    'data/train_names_mismatches.csv', index_col=[0])
df_val_match = pd.read_csv(
    'data/val_names_matches.csv', index_col=None)
df_val_mismatch = pd.read_csv(
    'data/val_names_mismatches.csv', index_col=[0])

### Preprocessing

In [3]:
def preprocess_names(names):
    """Normalise raw names so that they can be compared character by character.

    Each value is converted to text, transliterated to ASCII with ``unidecode``,
    lower-cased, whitespace-collapsed, stripped of every character other than
    ``a-z`` and space, and finally has every token found in the module-level
    ``titles`` collection removed.

    Transliteration deliberately runs *before* lower-casing: ``unidecode`` can
    emit upper-case ASCII for some inputs, and lower-casing first would let the
    ``[^a-z ]`` filter silently delete those letters.

    Args:
        names: Iterable of raw name values (for example one DataFrame row).
            Every value is passed through ``str`` first.

    Returns:
        list[str]: The cleaned names, in input order. A name may come back as
        the empty string if nothing survives the cleaning.
    """
    cleaned_names = []
    for name in names:
        # Transliterate first, then lower-case (see docstring).
        name = unidecode(str(name)).lower()
        # Turn tabs/newlines/repeated blanks into single spaces *before* the
        # character filter, otherwise they would be deleted and glue tokens
        # together.
        name = ' '.join(name.split())
        name = re.sub('[^a-z ]', '', name)
        # Re-splitting also removes any blanks left behind by the filter.
        name = ' '.join(part for part in name.split() if part not in titles)
        cleaned_names.append(name)
    return cleaned_names

# Clean the two name columns of every raw frame in place.
for names_frame in (df_train_match, df_train_mismatch, df_val_match, df_val_mismatch):
    names_frame[['name1', 'name2']] = names_frame[['name1', 'name2']].progress_apply(
        preprocess_names, axis=1, result_type='expand'
    )

100%|█████████████████████████████████████████████████████████████████████████| 59120/59120 [00:01<00:00, 41986.38it/s]
100%|███████████████████████████████████████████████████████████████████████| 104876/104876 [00:02<00:00, 42814.89it/s]
100%|█████████████████████████████████████████████████████████████████████████| 14150/14150 [00:00<00:00, 38991.71it/s]
100%|█████████████████████████████████████████████████████████████████████████| 19416/19416 [00:00<00:00, 44018.76it/s]


In [11]:
def pad_name(name, qval):
    """Wrap ``name`` in ``qval - 1`` leading '<' and trailing '>' markers.

    Args:
        name: The string to pad.
        qval: Integer; ``qval - 1`` markers are added on each side
            (none when ``qval`` is 1 or smaller).

    Returns:
        str: The padded name.
    """
    fill = qval - 1
    return '<' * fill + name + '>' * fill

def get_similarity(names, sim_func, **kwargs):
    """Apply ``sim_func`` to a collection of names, padding them when ``qval`` is given.

    Args:
        names: Iterable of names; its items are unpacked as the positional
            arguments of ``sim_func``.
        sim_func: Callable computing the similarity.
        **kwargs: Forwarded unchanged to ``sim_func``. If a ``qval`` entry is
            present, every name is first padded with ``pad_name(name, qval)``.

    Returns:
        Whatever ``sim_func`` returns.
    """
    if 'qval' not in kwargs:
        return sim_func(*names, **kwargs)
    width = kwargs['qval']
    padded = [pad_name(raw, width) for raw in names]
    return sim_func(*padded, **kwargs)

# phonetic
def soundex(name1, name2):
    """Phonetic similarity derived from ``RefinedSoundex``.

    The value of ``RefinedSoundex().distance(..., metric='levenshtein')`` is
    divided by the length of the longer name and subtracted from 1.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: The similarity; ``1.0`` when both names are empty (previously
        this case could not be computed because the normaliser was zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    # NOTE(review): the distance is requested from the phonetic encoder but
    # normalised by the raw name length — confirm the result stays within
    # the intended range.
    distance = RefinedSoundex().distance(name1, name2, metric='levenshtein')
    return 1 - distance / longest

def double_metaphone(name1, name2):
    """Jaccard overlap of the non-empty codes ``dmetaphone`` returns for each name.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``|codes1 & codes2| / |codes1 | codes2|``. When neither name
        yields any non-empty code the ratio is undefined; in that case the
        result is ``1.0`` if the names are equal and ``0.0`` otherwise.
    """
    codes1 = {code for code in dmetaphone(name1) if code != ''}
    codes2 = {code for code in dmetaphone(name2) if code != ''}
    all_codes = codes1 | codes2
    if not all_codes:
        # No phonetic evidence at all: avoid 0/0 and fall back to equality.
        return float(name1 == name2)
    return len(codes1 & codes2) / len(all_codes)

# token based
def jaccard(name1, name2, qval):
    """Return the result of a ``Jaccard`` measure constructed with ``qval`` for the two names.

    Args:
        name1: First name.
        name2: Second name.
        qval: Value handed to the ``Jaccard`` constructor.
    """
    return Jaccard(qval)(name1, name2)

def sorenson(name1, name2, qval):
    """Return the result of a ``Sorensen`` measure constructed with ``qval`` for the two names.

    Args:
        name1: First name.
        name2: Second name.
        qval: Value handed to the ``Sorensen`` constructor.
    """
    return Sorensen(qval)(name1, name2)

def overlap(name1, name2, qval):
    """Return the result of an ``Overlap`` measure constructed with ``qval`` for the two names.

    Args:
        name1: First name.
        name2: Second name.
        qval: Value handed to the ``Overlap`` constructor.
    """
    return Overlap(qval)(name1, name2)

def bag(name1, name2):
    """Similarity derived from the ``Bag`` measure, normalised by the longer name.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``1 - Bag()(name1, name2) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    # A distinct local name keeps this function's own name unshadowed.
    bag_measure = Bag()
    return 1 - bag_measure(name1, name2) / longest

# edit distance
def levenshtein(name1, name2):
    """Similarity derived from the ``Levenshtein`` measure, normalised by the longer name.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``1 - Levenshtein()(name1, name2) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    return 1 - Levenshtein()(name1, name2) / longest

def dlevenshtein(name1, name2):
    """Similarity derived from the ``DamerauLevenshtein`` measure (imported as ``DL``).

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``1 - DL()(name1, name2) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    return 1 - DL()(name1, name2) / longest

# alignment scores
def smith_waterman(name1, name2):
    """``SmithWaterman`` (imported as ``SW``) score divided by the longer name's length.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``SW()(name1, name2) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero and
        the two inputs are identical).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Identical (empty) inputs; also avoids dividing by zero.
        return 1.0
    return SW()(name1, name2) / longest

def needleman_wunsch(name1, name2):
    """``NeedlemanWunsch`` (imported as ``NW``) score divided by the longer name's length.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``NW()(name1, name2) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero and
        the two inputs are identical).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Identical (empty) inputs; also avoids dividing by zero.
        return 1.0
    return NW()(name1, name2) / longest

def jaro_winkler(name1, name2):
    """Return the result of a default ``JaroWinkler`` measure (imported as ``JW``) for the two names.

    Args:
        name1: First name.
        name2: Second name.
    """
    return JW()(name1, name2)

# combined
def editex(name1, name2):
    """Similarity derived from the ``Editex`` measure.

    The measure's value is divided by twice the longer name's length and
    subtracted from 1.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``1 - Editex()(name1, name2) / (2 * max(len(name1), len(name2)))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    return 1 - Editex()(name1, name2) / (2 * longest)

# sequence based
def lcsseq(name1, name2):
    """Length of the ``LCSSeq`` result relative to the longer name.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``len(LCSSeq()(name1, name2)) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    return len(LCSSeq()(name1, name2)) / longest

def lcsstr(name1, name2):
    """Length of the ``LCSStr`` result relative to the longer name.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        float: ``len(LCSStr()(name1, name2)) / max(len(name1), len(name2))``;
        ``1.0`` when both names are empty (the normaliser would be zero).
    """
    longest = max(len(name1), len(name2))
    if longest == 0:
        # Two empty names are identical; also avoids dividing by zero.
        return 1.0
    return len(LCSStr()(name1, name2)) / longest

# compression based
def bz2ncd(name1, name2):
    """Return one minus the value of a default ``BZ2NCD`` measure for the two names.

    Args:
        name1: First name.
        name2: Second name.
    """
    return 1 - BZ2NCD()(name1, name2)

def zlibncd(name1, name2):
    """Return one minus the value of a default ``ZLIBNCD`` measure for the two names.

    Args:
        name1: First name.
        name2: Second name.
    """
    return 1 - ZLIBNCD()(name1, name2)

In [5]:
def sample_data(df_match, df_mismatch, count):
    """Build a labelled sample with ``count // 2`` matches and ``count // 2`` mismatches.

    Rows whose two names are equal, or where either name is the empty string,
    are dropped before sampling. The ``mismatch`` column of ``df_mismatch`` is
    renamed to ``variations``. Sampling uses ``random_state=42``.

    Args:
        df_match: Frame of matching pairs with ``name1`` / ``name2`` columns.
        df_mismatch: Frame of non-matching pairs with ``name1`` / ``name2`` columns.
        count: Total number of rows wanted; half is drawn from each frame.

    Returns:
        pandas.DataFrame: Sampled matches (``label`` 1) followed by sampled
        mismatches (``label`` 0), with a fresh 0..n-1 index.
    """
    def _usable(frame):
        # Keep only pairs of two distinct, non-empty names.
        distinct = frame['name1'] != frame['name2']
        first_present = frame['name1'] != ''
        second_present = frame['name2'] != ''
        return frame[distinct & first_present & second_present].reset_index(drop=True)

    per_class = count // 2
    positives = _usable(df_match).sample(n=per_class, random_state=42)
    positives = positives.assign(label=1)

    renamed = df_mismatch.rename(columns={'mismatch': 'variations'})
    negatives = _usable(renamed).sample(n=per_class, random_state=42)
    negatives = negatives.assign(label=0)

    return pd.concat([positives, negatives], axis=0).reset_index(drop=True)

# sample_data draws count // 2 rows per class: 10000 each for train, 3000 each for validation.
df_train = sample_data(df_match=df_train_match, df_mismatch=df_train_mismatch, count=20000)
df_val = sample_data(df_match=df_val_match, df_mismatch=df_val_mismatch, count=6000)

In [6]:
def get_permutations(name):
    """Return every distinct ordering of the whitespace-separated tokens of ``name``.

    The result is deterministic: orderings appear in the order
    ``itertools.permutations`` produces them, so the original token order is
    always first, and duplicates (caused by repeated tokens) are dropped while
    keeping the first occurrence. Callers that truncate the list therefore
    always keep the same orderings from run to run.

    Note that the number of orderings grows factorially with the token count.

    Args:
        name: The name to permute.

    Returns:
        list[str]: Space-joined orderings; ``['']`` for a name without tokens.
    """
    tokens = name.split()
    # dict.fromkeys de-duplicates while preserving insertion order, unlike a
    # set, whose iteration order for strings can vary between interpreter runs.
    distinct_orderings = dict.fromkeys(permutations(tokens))
    return [' '.join(ordering) for ordering in distinct_orderings]

def get_alignments(names, alignment_budget=10):
    """Pick a limited number of token orderings for each of two names.

    The budget is shared between the two names in proportion to how many
    orderings each one has (rounded up), and that many orderings are taken
    from the front of each ``get_permutations`` result.

    Args:
        names: Sequence (list, tuple or pandas Series) whose first two items
            are the names to align.
        alignment_budget: Approximate total number of orderings to keep across
            both names. Because each share is rounded up, the combined count
            can exceed this value slightly.

    Returns:
        tuple[list[str], list[str]]: The kept orderings for the first and the
        second name.
    """
    # Read the pair by iteration rather than names[0] / names[1]: integer keys
    # on a label-indexed Series rely on deprecated positional fallback.
    name1, name2 = list(names)[:2]
    name1_permutations = get_permutations(name1)
    name2_permutations = get_permutations(name2)
    total = len(name1_permutations) + len(name2_permutations)
    n_alignments_name1 = int(np.ceil(alignment_budget * len(name1_permutations) / total))
    n_alignments_name2 = int(np.ceil(alignment_budget * len(name2_permutations) / total))
    return name1_permutations[:n_alignments_name1], name2_permutations[:n_alignments_name2]

def get_best_alignment(names):
    """Return the pair of token orderings with the highest ``needleman_wunsch`` score.

    Every ordering kept by ``get_alignments`` for the first name is paired
    with every ordering kept for the second name; the first pair reaching the
    highest score is returned.

    Args:
        names: The pair of names, passed straight to ``get_alignments``.

    Returns:
        tuple: ``(ordering_of_name1, ordering_of_name2)``.
    """
    candidates1, candidates2 = get_alignments(names)
    grid1, grid2 = np.meshgrid(candidates1, candidates2)
    candidate_pairs = list(zip(grid1.flatten(), grid2.flatten()))
    # max() keeps the first of several equally scored pairs, which is also
    # what a stable descending sort followed by [0] yields.
    return max(candidate_pairs, key=lambda pair: needleman_wunsch(pair[0], pair[1]))

# Replace each name pair with its best-scoring token ordering, in place.
for pairs_frame in (df_train, df_val):
    pairs_frame[['name1', 'name2']] = pairs_frame[['name1', 'name2']].progress_apply(
        get_best_alignment, axis=1, result_type='expand'
    )

100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:36<00:00, 541.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 614.90it/s]


### Feature Generation

In [7]:
def generate_features(df):
    """Add one column per similarity measure, computed row-wise from ``name1`` / ``name2``.

    Columns are added to ``df`` itself, in the order listed below, and the
    same frame is returned.

    Args:
        df: Frame with ``name1`` and ``name2`` columns.

    Returns:
        pandas.DataFrame: ``df`` with the similarity columns added.
    """
    # (output column, similarity function, extra keyword arguments)
    feature_specs = [
        ('soundex', soundex, {}),
        ('double_metaphone', double_metaphone, {}),
        ('jaccard_2', jaccard, {'qval': 2}),
        ('jaccard_3', jaccard, {'qval': 3}),
        ('sorenson_2', sorenson, {'qval': 2}),
        ('sorenson_3', sorenson, {'qval': 3}),
        ('overlap_2', overlap, {'qval': 2}),
        ('overlap_3', overlap, {'qval': 3}),
        ('bag', bag, {}),
        ('levenshtein', levenshtein, {}),
        ('dlevenshtein', dlevenshtein, {}),
        ('jaro_winkler', jaro_winkler, {}),
        ('smith_waterman', smith_waterman, {}),
        ('editex', editex, {}),
        ('lcsseq', lcsseq, {}),
        ('lcsstr', lcsstr, {}),
        ('bz2ncd', bz2ncd, {}),
        ('zlibncd', zlibncd, {}),
    ]
    name_columns = ['name1', 'name2']
    for column, measure, extra in feature_specs:
        df[column] = df[name_columns].progress_apply(
            get_similarity, args=(measure,), axis=1, **extra
        )
    return df

# Compute the similarity columns for the training frame, then the validation frame.
df_train, df_val = generate_features(df_train), generate_features(df_val)

100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:01<00:00, 15565.25it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 26438.17it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 23312.67it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 23499.54it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 27678.15it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 26130.11it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 29273.27it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 26025.07it/s]
100%|███████████████████████████████████

In [14]:
# Persist the feature frames (index included).
df_train.to_csv(path_or_buf='data/df_train.csv')
df_val.to_csv(path_or_buf='data/df_val.csv')

### Feature Analysis
1. Univariate Analysis

In [None]:
# Names of the similarity columns analysed below.
similarity_measures = [
    'soundex', 'double_metaphone',
    'jaccard_2', 'jaccard_3',
    'sorenson_2', 'sorenson_3',
    'overlap_2', 'overlap_3',
    'bag',
    'levenshtein', 'dlevenshtein',
    'jaro_winkler', 'smith_waterman',
    'editex',
    'lcsseq', 'lcsstr',
    'bz2ncd', 'zlibncd',
]

# Values of the `variations` column that are analysed one at a time.
variations = [
    'ocr_kte', 'pen', 'wm', 'ocr', 'wj', 'ab_kte', 'alt_tse',
    'alt', 'kte', 'se', 'tse_kte', 'wm_kte', 'ab', 'se_kte',
    'pen_kte', 'wj_kte', 'he', 'alt_kte', 'tse', 'ocr_tse', 'he_kte',
]

In [47]:
# Average precision of every similarity measure at separating the matches of
# one variation type (positives) from all sampled mismatches (negatives).
df_mismatch = df_train[df_train['label']==0].reset_index(drop=True)
is_match = df_train['label']==1
scores_by_variation = {}
for var in variations:
    # The evaluation frame depends only on the variation, so it is built once
    # per variation instead of once per (variation, measure) pair.
    # Restricting the positives to label == 1 keeps mismatch rows from being
    # added twice should their `variations` value ever equal `var`.
    data = pd.concat([df_train[is_match & (df_train['variations']==var)], df_mismatch], ignore_index=True)
    scores_by_variation[var] = [
        average_precision_score(data['label'], data[sim]) for sim in similarity_measures
    ]
# Build the table in one step: rows are similarity measures, columns are variations.
df_results = pd.DataFrame(scores_by_variation, index=similarity_measures)

In [52]:
# Persist the measure-by-variation average-precision table.
df_results.to_csv(path_or_buf='output/univariate_analysis_train.csv')

2. Multivariate Analysis

In [63]:
# Pairwise correlation between the similarity columns of the training sample.
df_corr = df_train.loc[:, similarity_measures].corr()
df_corr.to_csv(path_or_buf='output/correlation_train.csv')