In [1]:
import pandas as pd
pd.set_option('display.max_rows', 20)
from unidecode import unidecode
from sklearn.model_selection import train_test_split
from name_variation import NameVariationGenerator
import re
from tqdm import tqdm
tqdm.pandas()
import Levenshtein
from itertools import combinations, chain
import json

In [2]:
def read_json(path: str):
    with open(path, 'r') as f:
        data = json.load(f)
    return data

In [3]:
def preprocessing_name(dataset, *name_cols):
    for col in name_cols:
        dataset[col] = dataset[col].apply(lambda x: re.sub('[0-9]', '', unidecode(x).lower().strip()))
        dataset = dataset[dataset[col].apply(lambda x: len(x)>=2)].reset_index(drop=True)
    return dataset

### Names by country (Wikidata - List of most popular first and last names by country)

In [75]:
df_f = pd.read_csv('data/common-forenames-by-country.csv')
df_f = df_f[['Romanized Name', 'Country']].astype(str).rename(columns={'Romanized Name': 'FirstName', 'Country': 'country'}).drop_duplicates(ignore_index=True)
df_s = pd.read_csv('data/common-surnames-by-country.csv')
df_s = df_s[['Romanized Name', 'Country']].astype(str).rename(columns={'Romanized Name': 'LastName', 'Country': 'country'}).drop_duplicates(ignore_index=True)

In [76]:
# preprocessing
df_f = preprocessing_name(df_f, 'FirstName', 'country')
df_s = preprocessing_name(df_s, 'LastName', 'country')

In [77]:
# countries with both forenames and surnames
df_f_bc = df_f.merge(df_s[['country']].drop_duplicates(), on='country', how='inner')
df_s_bc = df_s.merge(df_f[['country']].drop_duplicates(), on='country', how='inner')

In [78]:
# splitting train and test givennames and sirnames
train_f, val_f = train_test_split(df_f_bc, test_size=0.3, random_state=42, shuffle=True, stratify=df_f_bc['country'])
train_s, val_s = train_test_split(df_s_bc, test_size=0.3, random_state=42, shuffle=True, stratify=df_s_bc['country'])

In [79]:
# remove accented characters and lowercase full names by joining givennames and sirnames by country
train_names_by_country = train_f.merge(train_s, on='country')
train_names_by_country['name'] = train_names_by_country['FirstName'] + ' ' + train_names_by_country['LastName']
val_names_by_country = val_f.merge(val_s, on='country')
val_names_by_country['name'] = val_names_by_country['FirstName'] + ' ' + val_names_by_country['LastName']

In [9]:
name_variation_generator = NameVariationGenerator()
train_names = pd.DataFrame({})
val_names = pd.DataFrame({})

In [10]:
for variations in ('se', 'ab', 'wj', 'wm', 'tse', 'kte', \
                   'se_kte', 'ab_kte', 'wj_kte', 'wm_kte', 'tse_kte'):
    print(train_names.shape)
    train_names = pd.concat([train_names, name_variation_generator.\
                             generate_name_variations(train_names_by_country, variations, 0.1)
                            ])

(0, 0)


100%|██████████████████████████████████████████████████████████████████████████| 2621/2621 [00:00<00:00, 152204.45it/s]


Done, Variations = se
(2621, 3)


100%|███████████████████████████████████████████████████████████████████████████| 2621/2621 [00:00<00:00, 54646.94it/s]


Done, Variations = ab
(5242, 3)


100%|██████████████████████████████████████████████████████████████████████████| 2621/2621 [00:00<00:00, 120976.67it/s]


Done, Variations = wj
(7863, 3)


100%|███████████████████████████████████████████████████████████████████████████| 2621/2621 [00:00<00:00, 54516.05it/s]

Done, Variations = wm
(10484, 3)



100%|██████████████████████████████████████████████████████████████████████████| 2621/2621 [00:00<00:00, 132871.67it/s]


Done, Variations = tse
(13105, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:10<00:00, 241.03it/s]


Done, Variations = kte
(15726, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:10<00:00, 242.02it/s]


Done, Variations = se_kte
(18347, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:10<00:00, 238.95it/s]


Done, Variations = ab_kte
(20968, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:11<00:00, 234.85it/s]


Done, Variations = wj_kte
(23589, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:10<00:00, 241.16it/s]


Done, Variations = wm_kte
(26210, 3)


100%|█████████████████████████████████████████████████████████████████████████████| 2621/2621 [00:10<00:00, 238.33it/s]

Done, Variations = tse_kte





In [11]:
for variations in ('se', 'ab', 'wj', 'wm', 'tse', 'kte', \
                   'se_kte', 'ab_kte', 'wj_kte', 'wm_kte', 'tse_kte'):
    val_names = pd.concat([val_names, name_variation_generator.\
                           generate_name_variations(val_names_by_country, variations, 0.1)
                          ])

100%|█████████████████████████████████████████████████████████████████████████████| 485/485 [00:00<00:00, 96974.66it/s]


Done, Variations = se


100%|█████████████████████████████████████████████████████████████████████████████| 485/485 [00:00<00:00, 49996.00it/s]


Done, Variations = ab


100%|█████████████████████████████████████████████████████████████████████████████| 485/485 [00:00<00:00, 80570.24it/s]


Done, Variations = wj


100%|█████████████████████████████████████████████████████████████████████████████| 485/485 [00:00<00:00, 26863.84it/s]


Done, Variations = wm


100%|█████████████████████████████████████████████████████████████████████████████| 485/485 [00:00<00:00, 86244.01it/s]


Done, Variations = tse


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 235.81it/s]


Done, Variations = kte


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 216.45it/s]


Done, Variations = se_kte


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 234.74it/s]


Done, Variations = ab_kte


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 226.96it/s]


Done, Variations = wj_kte


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 229.79it/s]


Done, Variations = wm_kte


100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:02<00:00, 232.64it/s]

Done, Variations = tse_kte





### OCR dataset (Kaggle Handwritten names dataset + pyteserract 4.0 model)

In [12]:
# read dataset
df_ocr = pd.read_csv('data/ocr_names.csv', index_col=0).dropna().drop_duplicates(ignore_index=True)

# preprocessing
df_ocr = preprocessing_name(df_ocr, 'ocr_name', 'true_name')
df_ocr = df_ocr[df_ocr['true_name']!=df_ocr['ocr_name']].reset_index(drop=True)

# splitting train and test ocrnames
train_names_ocr, val_names_ocr = train_test_split(df_ocr, test_size=0.2, random_state=42, shuffle=True)

In [13]:
train_names_ocr['variations'] = 'ocr'
train_names = pd.concat([train_names, train_names_ocr.rename(
                             columns={'true_name': 'name1', 'ocr_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])
val_names_ocr['variations'] = 'ocr'
val_names = pd.concat([val_names, val_names_ocr.rename(
                             columns={'true_name': 'name1', 'ocr_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])

In [14]:
train_names_ocr['orig_error_length'] = train_names_ocr[['true_name', 'ocr_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
for variations in ('tse', 'kte'):
    train_names = pd.concat([train_names, name_variation_generator.\
                           generate_name_variations(train_names_ocr, variations, 0.1, 'ocr_name', \
                                                    'true_name', 'ocr', 'orig_error_length')
                          ])

100%|██████████████████████████████████████████████████████████████████████████| 2395/2395 [00:00<00:00, 117081.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 78413.54it/s]


Done, Variations = ocr_tse


100%|███████████████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 419.40it/s]

Done, Variations = ocr_kte





In [15]:
val_names_ocr['orig_error_length'] = val_names_ocr[['true_name', 'ocr_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
for variations in ('tse', 'kte'):
    val_names = pd.concat([val_names, name_variation_generator.\
                           generate_name_variations(train_names_ocr, variations, 0.1, 'ocr_name', \
                                                    'true_name', 'ocr', 'orig_error_length')
                          ])

100%|█████████████████████████████████████████████████████████████████████████████| 599/599 [00:00<00:00, 92255.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 59854.23it/s]


Done, Variations = ocr_tse


100%|███████████████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 414.94it/s]

Done, Variations = ocr_kte





### Alternative names dataset - Wikidata (Language/Spellings/Regional/Phonetic variations)

In [16]:
# read dataset
alt_names_data = [read_json('data/results.json')]
alt_names_data.extend(list(read_json('data/results_lang.json').values()))

# preprocessing
def preprocessing_alt_names(alt_names_data):
    alt_names_pairs_data = []
    for alt_names_json in alt_names_data:
        alt_names_pairs = [(name_dict['nameLabelString']['value'], name_dict['nameAltLabelString']['value']) \
                       for name_dict in alt_names_json['results']['bindings']]
        alt_names_pairs_data.extend(alt_names_pairs)    
    df_alt = pd.DataFrame(alt_names_pairs_data, columns=['true_name', 'alt_name'])    
    df_alt = preprocessing_name(df_alt, 'alt_name', 'true_name')
    df_alt = df_alt[df_alt['true_name']!=df_alt['alt_name']].reset_index(drop=True)
    return df_alt
df_alt = preprocessing_alt_names(alt_names_data)

# splitting train and test transliterated names
train_names_alt, val_names_alt = train_test_split(df_alt, test_size=0.2, random_state=42, shuffle=True)

In [17]:
train_names_alt['variations'] = 'alt'
train_names = pd.concat([train_names, train_names_alt.rename(
                             columns={'true_name': 'name1', 'alt_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])
val_names_alt['variations'] = 'alt'
val_names = pd.concat([val_names, val_names_alt.rename(
                             columns={'true_name': 'name1', 'alt_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])

In [18]:
train_names_alt['orig_error_length'] = train_names_alt[['true_name', 'alt_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
for variations in ('tse', 'kte'):
    train_names = pd.concat([train_names, name_variation_generator.\
                           generate_name_variations(train_names_alt, variations, 0.1, 'alt_name', \
                                                    'true_name', 'alt', 'orig_error_length')
                          ])

100%|████████████████████████████████████████████████████████████████████████| 10375/10375 [00:00<00:00, 145802.44it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1037/1037 [00:00<00:00, 118208.81it/s]


Done, Variations = alt_tse


100%|█████████████████████████████████████████████████████████████████████████████| 1037/1037 [00:02<00:00, 490.36it/s]

Done, Variations = alt_kte





In [19]:
val_names_alt['orig_error_length'] = val_names_alt[['true_name', 'alt_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
for variations in ('tse', 'kte'):
    val_names = pd.concat([val_names, name_variation_generator.\
                           generate_name_variations(val_names_alt, variations, 0.1, 'alt_name', \
                                                    'true_name', 'alt', 'orig_error_length')
                          ])

100%|██████████████████████████████████████████████████████████████████████████| 2594/2594 [00:00<00:00, 158077.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 129679.45it/s]


Done, Variations = alt_tse


100%|███████████████████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 498.64it/s]

Done, Variations = alt_kte





### Nick/Pen names dataset (https://github.com/carltonnorthern/nicknames/blob/master/names.csv)

In [20]:
# read dataset
with open('data/nicknames.txt', 'r') as f:
    names = f.readlines()

def preprocess_nicknames(names):
    names_list = [nicknames.strip().split(',') for nicknames in names]
    pair_names = chain.from_iterable([list(combinations(nicknames,2)) for nicknames in names_list])
    df_pen_1 = pd.DataFrame(pair_names, columns=['true_name', 'pen_name'])
    df_pen_2 = pd.DataFrame(pair_names, columns=['pen_name', 'true_name'])
    df_pen = pd.concat([df_pen_1, df_pen_2], ignore_index=True)
    df_pen = preprocessing_name(df_pen, 'pen_name', 'true_name')
    return df_pen

#preprocessing
df_pen = preprocess_nicknames(names)
df_pen = df_pen[df_pen['true_name']!=df_pen['pen_name']].reset_index(drop=True)

# splitting train and test transliterated names
train_names_pen, val_names_pen = train_test_split(df_pen, test_size=0.3, random_state=42, shuffle=True)

In [21]:
train_names_pen['variations'] = 'pen'
train_names = pd.concat([train_names, train_names_pen.rename(
                             columns={'true_name': 'name1', 'pen_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])
val_names_pen['variations'] = 'pen'
val_names = pd.concat([val_names, val_names_pen.rename(
                             columns={'true_name': 'name1', 'pen_name': 'name2'}
                         )[['name1', 'name2', 'variations']]])

In [22]:
train_names_pen['orig_error_length'] = train_names_pen[['true_name', 'pen_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
train_names = pd.concat([train_names, name_variation_generator.\
                       generate_name_variations(train_names_pen, 'kte', 0.1, 'pen_name', 'true_name', 'pen', 'orig_error_length')
                      ])

100%|██████████████████████████████████████████████████████████████████████████| 4519/4519 [00:00<00:00, 141761.35it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 451/451 [00:00<00:00, 522.31it/s]

Done, Variations = pen_kte





In [23]:
val_names_pen['orig_error_length'] = val_names_pen[['true_name', 'pen_name']].progress_apply(lambda vec: Levenshtein.distance(vec[0], vec[1]), axis=1)
val_names = pd.concat([val_names, name_variation_generator.\
                       generate_name_variations(val_names_pen, 'kte', 0.1, 'pen_name', 'true_name', 'pen', 'orig_error_length')
                      ])

100%|██████████████████████████████████████████████████████████████████████████| 1937/1937 [00:00<00:00, 143426.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 193/193 [00:00<00:00, 620.52it/s]

Done, Variations = pen_kte





### Compound names dataset (Wikidata)

In [24]:
# read dataset
compound_names_json = read_json('data/results_compound_names.json')

# preprocessing
def preprocessing_compound_names(compound_names_json):
    compound_names_data = [(names_dict['nameLabel']['value'],) for names_dict in compound_names_json['results']['bindings']]
    df_cn = pd.DataFrame(compound_names_data, columns=['compound_name'])
    df_cn = preprocessing_name(df_cn, 'compound_name')
    df_cn = df_cn.dropna().drop_duplicates(ignore_index=True)
    return df_cn
df_cn = preprocessing_compound_names(compound_names_json)

# splitting train and test transliterated names
train_names_cn, val_names_cn = train_test_split(df_cn, test_size=0.2, random_state=42, shuffle=True)

In [25]:
for variations in ('he', 'se', 'ab', 'wj', 'wm', 'tse', 'kte'\
                   , 'se_kte', 'ab_kte', 'wj_kte', 'wm_kte', 'tse_kte', 'he_kte'):
    train_names = pd.concat([train_names, name_variation_generator.\
                           generate_name_variations(train_names_cn, variations, 0.1, 'compound_name')
                          ])

100%|████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 112808.47it/s]


Done, Variations = he


100%|█████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 97203.90it/s]


Done, Variations = se


100%|█████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 33848.82it/s]


Done, Variations = ab


100%|█████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 84459.39it/s]


Done, Variations = wj


100%|████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 144391.61it/s]


Done, Variations = wm


100%|████████████████████████████████████████████████████████████████████████████| 769/769 [00:00<00:00, 161878.03it/s]


Done, Variations = tse


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 212.80it/s]


Done, Variations = kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 202.63it/s]


Done, Variations = se_kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 198.81it/s]


Done, Variations = ab_kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 200.52it/s]


Done, Variations = wj_kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 226.95it/s]


Done, Variations = wm_kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 230.88it/s]


Done, Variations = tse_kte


100%|███████████████████████████████████████████████████████████████████████████████| 769/769 [00:03<00:00, 228.34it/s]

Done, Variations = he_kte





In [26]:
for variations in ('he', 'se', 'ab', 'wj', 'wm', 'tse', 'kte', \
                   'se_kte', 'ab_kte', 'wj_kte', 'wm_kte', 'tse_kte', 'he_kte'):
    val_names = pd.concat([val_names, name_variation_generator.\
                           generate_name_variations(val_names_cn, variations, 0.1, 'compound_name')
                          ])

100%|████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 101782.91it/s]


Done, Variations = he


100%|████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 192197.22it/s]


Done, Variations = se


100%|█████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 27079.13it/s]


Done, Variations = ab


100%|█████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 95744.43it/s]


Done, Variations = wj


100%|█████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 18751.60it/s]


Done, Variations = wm


100%|█████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 63979.21it/s]


Done, Variations = tse


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 227.54it/s]


Done, Variations = kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 219.57it/s]


Done, Variations = se_kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 216.02it/s]


Done, Variations = ab_kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 216.80it/s]


Done, Variations = wj_kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 199.21it/s]


Done, Variations = wm_kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 207.12it/s]


Done, Variations = tse_kte


100%|███████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 214.69it/s]

Done, Variations = he_kte





In [51]:
train_names.to_csv('data/train_names_matches.csv', index=False)
val_names.to_csv('data/val_names_matches.csv', index=False)

#errors
se
ab
wj
wm
kte
tse

#datasets
train_names_by_country
val_names_by_country
ocr->single names - train, val
transliteration(phonetic variations)->single names - train, val
nicknames - train, val
compound_names

#level 1 - can be handled by control mechanism (on by_country and cn)
se
ab
wj
wm
tse
nicknames_dataset
he(only_cn)

#level 2 - single error source, error length controllable based on distribution (on by_country and cn)
kte

#level 3 - single error source, error length non-controllable, predefined in dataset 
ocr->sn
transliteration_dataset(phonetic variations)->sn

#level 4 - 2 levels of variations, error length controllable based on distribution (on by_country and cn)
se kte
ab kte
wj kte
wm kte
tse kte
he(only_cn) kte

#level 5 - 2 levels of variations, error length non controllable
ocr->sn tse
ocr->sn kte
nicknames_dataset kte
transliteration_dataset(phonetic variations)->sn kte
transliteration_dataset(phonetic variations)->sn tse

### Generate name mismatches

In [80]:
train_names_by_country = train_names_by_country[['name', 'country', 'FirstName', 'LastName']].rename(columns={'name': 'name1'})
val_names_by_country = val_names_by_country[['name', 'country', 'FirstName', 'LastName']].rename(columns={'name': 'name1'})

In [98]:
# level 1 - random permutation
train_names_random = train_names_by_country.copy()
train_names_random['name2'] = train_names_random['name1'].sample(frac=1, random_state=42).values
train_names_random['mismatch'] = 'random'
val_names_random = val_names_by_country.copy()
val_names_random['name2'] = val_names_random['name1'].sample(frac=1, random_state=42).values
val_names_random['mismatch'] = 'random'

In [99]:
# level 2 - random permutation within country
train_names_random_in_country = train_names_by_country.copy()
train_names_random_in_country['name2'] = train_names_random_in_country.groupby('country')['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
train_names_random_in_country['mismatch'] = 'random_same_country'
val_names_random_in_country = val_names_by_country.copy()
val_names_random_in_country['name2'] = val_names_random_in_country.groupby('country')['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
val_names_random_in_country['mismatch'] = 'random_same_country'

In [100]:
# level 3 (I) - random permutation within country and same Firstname
train_names_random_in_country_same_firstname = train_names_by_country.copy()
train_names_random_in_country_same_firstname['name2'] = train_names_random_in_country_same_firstname.groupby(['country', 'FirstName'])['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
train_names_random_in_country_same_firstname['mismatch'] = 'random_same_country_same_firstname'
val_names_random_in_country_same_firstname = val_names_by_country.copy()
val_names_random_in_country_same_firstname['name2'] = val_names_random_in_country_same_firstname.groupby(['country', 'FirstName'])['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
val_names_random_in_country_same_firstname['mismatch'] = 'random_same_country_same_firstname'

In [101]:
# level 3 (II) - random permutation within country and same Lastname
train_names_random_in_country_same_lastname = train_names_by_country.copy()
train_names_random_in_country_same_lastname['name2'] = train_names_random_in_country_same_lastname.groupby(['country', 'LastName'])['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
train_names_random_in_country_same_lastname['mismatch'] = 'random_same_country_same_lastname'
val_names_random_in_country_same_lastname = val_names_by_country.copy()
val_names_random_in_country_same_lastname['name2'] = val_names_random_in_country_same_lastname.groupby(['country', 'LastName'])['name1'].transform(lambda x: x.sample(frac=1, random_state=42).values)
val_names_random_in_country_same_lastname['mismatch'] = 'random_same_country_same_lastname'

In [105]:
train_names_mismatch = pd.concat([train_names_random, train_names_random_in_country, train_names_random_in_country_same_firstname, train_names_random_in_country_same_lastname]).reset_index(drop=True)[['name1', 'name2', 'mismatch']]
val_names_mismatch = pd.concat([val_names_random, val_names_random_in_country, val_names_random_in_country_same_firstname, val_names_random_in_country_same_lastname]).reset_index(drop=True)[['name1', 'name2', 'mismatch']]

In [112]:
train_names_mismatch.to_csv('train_names_mismatches.csv')
val_names_mismatch.to_csv('val_names_mismatches.csv')