# Name Screening
## Detect 50 Bad actors in our customer base using public data sources
**Task**: Find as many bad actors as possible by using NLP techniques to match customer names against the watchlist, together with any other relevant information provided.

In [22]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein

!pip install rapidfuzz

import pandas as pd



In [2]:
# Read the customer (KYC) list and the watchlist of bad actors from disk.
df_kyc = pd.read_csv("UofT_nodes.csv")
df_bad = pd.read_csv("targets.simple.csv", low_memory=False)

# Name screening only needs the name and birth-date column of each dataset.
kyc_columns = ['NAME', 'BIRTH_DT']
bad_columns = ['name', 'birth_date']
df_kyc_cleaned = df_kyc[kyc_columns]
df_bad_cleaned = df_bad[bad_columns]

In [3]:
# Drop the rows whose name is missing from both datasets, then renumber the
# index from 0 so it matches the row position again.
df_kyc_cleaned = df_kyc_cleaned.dropna(subset=['NAME']).reset_index(drop=True)
df_bad_cleaned = df_bad_cleaned.dropna(subset=['name']).reset_index(drop=True)

In [4]:
# Parse both birth-date columns into pandas datetimes.
df_kyc_cleaned['BIRTH_DT_CLEANED'] = pd.to_datetime(df_kyc_cleaned['BIRTH_DT'])

# For the watchlist only the text before the first ';' is parsed; values that
# cannot be parsed become NaT instead of raising.
first_birth_date = df_bad_cleaned['birth_date'].str.split(';', n=1).str[0]
df_bad_cleaned['birth_date_cleaned'] = pd.to_datetime(first_birth_date, errors='coerce')

In [5]:
# Sanity check: (rows, columns) of the cleaned watchlist frame.
df_bad_cleaned.shape

(262426, 3)

In [6]:
# Sanity check: (rows, columns) of the cleaned customer frame.
df_kyc_cleaned.shape

(999340, 3)

## Method 1: FuzzyWuzzy

In [7]:
from fuzzywuzzy import fuzz

# Minimum token_sort_ratio score for a customer/watchlist pair to be recorded.
NAME_MATCH_THRESHOLD = 80
# NOTE(review): the scan skips the first 1653 customer rows — this looks like a
# resume point from an earlier, interrupted run; confirm before a full re-run.
KYC_START_POSITION = 1653

bad_cust_columns = ['index_kyc', 'name_kyc', 'index_bad', 'name_bad']

# Matches are collected in a plain list and converted to a DataFrame once.
# Growing a DataFrame with DataFrame.append copies the whole frame on every
# hit, and the method was removed in pandas 2.0.
matched_records = []

try:
    for ind_cust, row_cust in df_kyc_cleaned.iloc[KYC_START_POSITION:, :].iterrows():
        if ind_cust % 100000 == 0:
            print(f"At customer index {ind_cust}")

        name_cust = row_cust['NAME']

        # Iterate over the name column directly: this yields the same
        # (index, name) pairs as iterrows() without building a Series for
        # every watchlist row on every customer.
        for ind_bad, name_bad in df_bad_cleaned['name'].items():
            if fuzz.token_sort_ratio(name_bad, name_cust) >= NAME_MATCH_THRESHOLD:
                matched_records.append({'index_kyc': ind_cust,
                                        'name_kyc': name_cust,
                                        'index_bad': ind_bad,
                                        'name_bad': name_bad})
                print(f"Customer Name: {name_cust}, Bad Actor Name: {name_bad}")
finally:
    # Built in `finally` so the matches found so far are still available in
    # df_bad_cust when the scan is interrupted part-way through.
    df_bad_cust = pd.DataFrame(matched_records, columns=bad_cust_columns)

In [20]:
# Show the customer row currently held by the Method 1 loop variable,
# i.e. the row the scan visited most recently.
row_cust

NAME                Ronald Anthony eestlr
BIRTH_DT                       1961-06-29
BIRTH_DT_CLEANED      1961-06-29 00:00:00
Name: 1909, dtype: object

In [21]:
# Index of the customer row the Method 1 scan visited most recently.
ind_cust

1909

## Method 2: RapidFuzz

In [61]:
import pandas as pd, numpy as np
import rapidfuzz
from rapidfuzz import process, utils
import time

# Normalise the watchlist names once up front so the work is not repeated for
# every customer lookup.
processed_bad_actors = [utils.default_process(name) for name in df_bad_cleaned['name']]
# Keyed by list position so a hit can be traced back to its watchlist entry
# (extractOne returns the key of the matched choice for mapping inputs).
choices_dict = dict(enumerate(processed_bad_actors))
# Minimum token_sort_ratio score (inclusive) for a match to be reported.
threshold = 90

def find_match(x):
    """Return the best watchlist match for the customer name ``x``.

    Parameters
    ----------
    x : str
        Raw customer name. It is normalised with ``utils.default_process``,
        the same routine applied to the watchlist names, so both sides of the
        comparison are in the same form.

    Returns
    -------
    tuple or float
        ``(matched name, score, key in choices_dict)`` for the best-scoring
        choice whose score is at least ``threshold``; ``np.nan`` when no
        choice reaches the threshold or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return np.nan

    query = utils.default_process(x)
    # processor=None: both the query and the choices are already normalised.
    # Being explicit also gives the same result on RapidFuzz releases that
    # preprocess by default and on those that do not.
    match = process.extractOne(query,
                               choices_dict,
                               scorer=rapidfuzz.fuzz.token_sort_ratio,
                               processor=None,
                               score_cutoff=threshold)
    # score_cutoff already discards everything below the threshold, so a
    # second score comparison is not needed.
    return np.nan if match is None else match

In [62]:
# Work on a copy so that columns added for the RapidFuzz results do not
# modify df_kyc_cleaned.
df_kyc_rf = df_kyc_cleaned.copy()

In [63]:
start = time.time()

# find_match returns either a (name, score, key) tuple or NaN per customer.
raw_matches = df_kyc_rf['NAME'].apply(find_match)

# Spread each result over three real columns. Indexing with
# df['a', 'b', 'c'] would instead create ONE column keyed by a tuple.
rf_match_columns = ['match found', 'score', 'index_position']
rf_match_frame = pd.DataFrame(
    [m if isinstance(m, tuple) else (np.nan, np.nan, np.nan) for m in raw_matches],
    columns=rf_match_columns,
    index=df_kyc_rf.index,
)
for column in rf_match_columns:
    df_kyc_rf[column] = rf_match_frame[column]

end = time.time()

KeyboardInterrupt: 

In [64]:
# Display the working frame used for the RapidFuzz matching.
df_kyc_rf

Unnamed: 0,NAME,BIRTH_DT,BIRTH_DT_CLEANED
0,"Young, Marie Mildren Coleman",1981-09-01,1981-09-01
1,Mark Stupar Lecy,1994-02-21,1994-02-21
2,Dean Glasper Wendel Reeves,1962-11-16,1962-11-16
3,"Hulsey, Linda Rauth",1998-06-20,1998-06-20
4,Carolyn Washington Roberts,1942-01-24,1942-01-24
...,...,...,...
999335,Bret* Dove Gainey,1939-01-16,1939-01-16
999336,Anthony Ray Montano Gomez,1970-12-13,1970-12-13
999337,Bernadette Schofield Pace Mile,1958-09-09,1958-09-09
999338,"Wasson, Russell Terrell",1983-09-15,1983-09-15


In [None]:
# Apply exact match first

# Then apply fuzzy match