# Name Screening
## Detect 50 bad actors in our customer base using public data sources
**Task**: Find as many bad actors as possible by using NLP techniques to match customer names against the watchlist and any other relevant information provided.

In [1]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein
# !pip install rapidfuzz

import pandas as pd

In [2]:
# Read the customer (KYC) file and the watchlist file from CSV.
df_kyc = pd.read_csv("UofT_nodes.csv")
df_bad = pd.read_csv("targets.simple.csv", low_memory=False)

# Keep only the name and birth-date columns used for screening.
df_kyc_cleaned = df_kyc.loc[:, ['NAME', 'BIRTH_DT']]
df_bad_cleaned = df_bad.loc[:, ['name', 'birth_date']]

In [3]:
# Drop every row that is missing the name or the birth date (in both datasets),
# then renumber the surviving rows from 0.
df_kyc_cleaned = df_kyc_cleaned.dropna(subset=['NAME', 'BIRTH_DT']).reset_index(drop=True)
df_bad_cleaned = df_bad_cleaned.dropna(subset=['name', 'birth_date']).reset_index(drop=True)

In [4]:
# Parse both birth-date columns into datetime values.
df_kyc_cleaned['BIRTH_DT_CLEANED'] = pd.to_datetime(df_kyc_cleaned['BIRTH_DT'])

# Watchlist entries may hold several ';'-separated values: keep only the text
# before the first ';'. Anything that still fails to parse becomes NaT.
df_bad_cleaned['birth_date_cleaned'] = pd.to_datetime(
    df_bad_cleaned['birth_date'].str.split(';', n=1).str[0],
    errors='coerce',
)

In [5]:
# (rows, columns) of the watchlist frame after null filtering and date parsing
df_bad_cleaned.shape

(164629, 3)

In [6]:
# (rows, columns) of the customer frame after null filtering and date parsing
df_kyc_cleaned.shape

(999340, 3)

In [8]:
# Preview the cleaned watchlist frame (the notebook renders a truncated head/tail view)
df_bad_cleaned

Unnamed: 0,name,birth_date,birth_date_cleaned
0,Kolyvanov Egor,1980-11-15,1980-11-15
1,Shipov Sergei Yurievich,1966-04-17,1966-04-17
2,Egorov Ivan Mikhailovich,1961-01-21,1961-01-21
3,Goreslavsky Alexey Sergeyevich,1977-07-13,1977-07-13
4,Samoilova Natalya Vladimirovna,1987-06-24,1987-06-24
...,...,...,...
164624,YUAN WEIDONG,1967-10-08,1967-10-08
164625,YURIY SERGEYEVICH ANDRIENKO,1988-05-30,1988-05-30
164626,YURY YEVGENYEVICH SAVIN,1975-01-12,1975-01-12
164627,ZHANG HAORAN,1985-06-15,1985-06-15


## Method 1: FuzzyWuzzy

In [None]:
from fuzzywuzzy import fuzz

# Minimum token_sort_ratio score (0-100) for a customer/watchlist pair to be reported.
FUZZY_THRESHOLD = 80
# Positional offset of the first customer row to scan; rows before it are skipped.
# NOTE(review): presumably a resume point from an interrupted run - set to 0 for a full scan.
KYC_START_POS = 1653

# Extract the watchlist (index, name) pairs once. Iterating a plain list avoids
# rebuilding a Series per row with DataFrame.iterrows() for every single customer.
bad_names = list(zip(df_bad_cleaned.index, df_bad_cleaned['name']))

# Matches are accumulated in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each call.
# If the cell is interrupted, the partial results are still available in match_records.
match_records = []

# NOTE: this is still a brute-force comparison of every scanned customer against
# every watchlist name, so a full run is very slow on frames of this size.
# ind_cust / row_cust are kept as loop variables because later cells display them.
for ind_cust, row_cust in df_kyc_cleaned.iloc[KYC_START_POS:, :].iterrows():
    if ind_cust % 100000 == 0:
        print(f"At customer index {ind_cust}")

    cust_name = row_cust['NAME']
    for ind_bad, bad_name in bad_names:
        if fuzz.token_sort_ratio(bad_name, cust_name) >= FUZZY_THRESHOLD:
            match_records.append({'index_kyc': ind_cust,
                                  'name_kyc': cust_name,
                                  'index_bad': ind_bad,
                                  'name_bad': bad_name})
            print(f"Customer Name: {cust_name}, Bad Actor Name: {bad_name}")

# One row per (customer, watchlist entry) pair that met the threshold.
df_bad_cust = pd.DataFrame(match_records,
                           columns=['index_kyc', 'name_kyc', 'index_bad', 'name_bad'])

In [None]:
# Show the customer row last bound by the Method 1 loop (only defined once that cell has run)
row_cust

In [None]:
# Show the customer index last reached by the Method 1 loop (only defined once that cell has run)
ind_cust

## Method 2: RapidFuzz

In [None]:
import pandas as pd, numpy as np
import rapidfuzz
from rapidfuzz import process, utils
import time

# Run every watchlist name through RapidFuzz's default preprocessing once, up front.
processed_bad_actors = list(map(utils.default_process, df_bad_cleaned['name']))
# Map position in the watchlist -> preprocessed name; extractOne reports this key back.
choices_dict = dict(enumerate(processed_bad_actors))
# Score threshold read by find_match.
threshold = 90

def find_match(x):
    """Return the best watchlist match for one customer name.

    Parameters
    ----------
    x : str
        Raw customer name. It is preprocessed with ``utils.default_process``
        so it is compared on the same footing as the already-preprocessed
        entries of ``choices_dict``.

    Returns
    -------
    tuple or float
        ``(matched_name, score, key)`` as returned by ``process.extractOne``
        for a mapping of choices, where ``key`` is the key in ``choices_dict``;
        ``np.nan`` when ``x`` is not a string or no choice scores at least
        ``threshold``.
    """
    # Missing / non-text names cannot be matched.
    if not isinstance(x, str):
        return np.nan

    # Preprocess the query explicitly and pass processor=None so the result does
    # not depend on the installed RapidFuzz version's default processor, and so
    # the (already preprocessed) choices are not processed again on every call.
    query = utils.default_process(x)
    match = process.extractOne(query, choices_dict,
                               scorer=rapidfuzz.fuzz.token_sort_ratio,
                               processor=None,
                               score_cutoff=threshold)

    # score_cutoff already discards anything scoring below `threshold`
    # (inclusive bound, consistent with the >= test used in Method 1).
    return np.nan if match is None else match

In [None]:
# Work on a copy so the columns added for Method 2 do not alter df_kyc_cleaned
df_kyc_rf = df_kyc_cleaned.copy()

In [None]:
start = time.time()

# find_match returns NaN (no match) or a (matched name, score, key) tuple per customer.
match_results = df_kyc_rf['NAME'].apply(find_match)

# Expand the results into three real columns. Assigning to
# df['match found', 'score', 'index_position'] would instead create ONE column
# whose label is that tuple.
match_columns = ['match found', 'score', 'index_position']
match_frame = pd.DataFrame(
    [m if isinstance(m, tuple) else (np.nan, np.nan, np.nan) for m in match_results],
    index=df_kyc_rf.index,
    columns=match_columns,
)
for column in match_columns:
    df_kyc_rf[column] = match_frame[column]

end = time.time()  # was time.tim(), which raises AttributeError
print(f"RapidFuzz matching took {end - start:.1f} s")

In [None]:
# Preview the customer frame together with the RapidFuzz match results
df_kyc_rf

## Method 3: Subsets
- Group by Birth Dates, Search within Birth Date Groups

In [12]:
# Number of distinct raw birth-date values among customers
df_kyc_cleaned['BIRTH_DT'].nunique(dropna=False)

37384

In [13]:
# Number of distinct raw birth-date values on the watchlist
df_bad_cleaned['birth_date'].nunique(dropna=False)

29838