# FUZZY MATCHING
The problem this file is trying to solve is matching names that have been manually keyed into a spreadsheet to the names that are the "official" version in company HR records.  The official names are referred to as the "Alpha" names.

In [None]:
# import some modules
import os
import pandas as pd
import sqlite3
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# Connect to the on-disk SQLite database that will receive the results.
# (Swap the path for ':memory:' to work in a throwaway in-memory database.)
db_path = 'pars.db'
dbconn = sqlite3.connect(db_path)
print(dbconn)

In [None]:
# Open a cursor on the database connection and display it.
# NOTE(review): no later cell in this notebook uses `cursor`; the to_sql call
# further down is handed `dbconn` directly.
cursor = dbconn.cursor()
cursor

### Here are the IN files

In [None]:
# Folder that holds both input CSVs.  Set the PARS_DATA_DIR environment
# variable to read them from somewhere else; when it is not set, the original
# hard-coded location is used, so existing runs behave as before.
data_dir = os.environ.get(
    "PARS_DATA_DIR",
    r"C:\Users\CHUGHES\Documents\PROJECTS\Python\001-PARS\DATA",
)
a_file = os.path.join(data_dir, "HR-ECI-ALLEMP.csv")  # HR "Alpha" roster
p_file = os.path.join(data_dir, "2021-PAR-EDIT.csv")  # manually keyed PARS names


## Working with PARS file
The PARS file has just the LAST, MIDDLE, and FIRST names.  Not the ALPHA which is a Whole Name (Last, First, MI + suffixes)
Here, I am just interested in the names and then joining them into a LAST, First format.

In [None]:
# Positional column in the PARS CSV -> column name used in this notebook.
p_header = {
    1: "NUMBER",
    6: "LNAME",
    8: "FNAME",
}

In [None]:
# Echo the mapping to confirm it before it is used to rename the columns.
p_header

In [None]:
# Read only the mapped columns of the PARS file by position: the file's own
# header row is skipped (skiprows=1, header=None) and the positional labels
# are then replaced with the names from p_header.
df1 = pd.read_csv(
    p_file,
    header=None,
    skiprows=1,
    usecols=list(p_header),
).rename(columns=p_header)

In [None]:
# Peek at the first five rows (head() defaults to 5).
df1.head()

### Making sure the data is in UPPER Case

In [None]:
# Upper-case both name columns so the comparison is not affected by letter case.
for name_col in ('LNAME', 'FNAME'):
    df1[name_col] = df1[name_col].str.upper()

In [None]:
# Confirm the names are now upper case (first five rows).
df1.head()

In [None]:
# Row-count check: this number should stay the same throughout the notebook.
df1.shape[0]

Below is code that would split a Whole name field.  It is not needed for this file.

In [None]:
# df1['P_LNAME'] = df1.NAME_PACS.str.split(',', expand = True)[0] # Not Needed for this exercise
# df1['P_FNAME'] = df1.NAME_PACS.str.split(' ', expand = True)[2] # Not Needed for this exercise - splits first and last names

In [None]:
# First ten rows.
df1.head(n=10)

In [None]:
# Row-count check: should match the earlier count.
df1.shape[0]

In [None]:
# Inspect the column dtypes before the name columns are cast to str below.
df1.dtypes

### Need to change the data type to string for the lambda function to work

In [None]:
# Cast both name columns to str in one call (astype raises on failure by default).
# NOTE(review): a missing name presumably ends up as the literal text "nan"
# after this cast - confirm that is acceptable for the matching step.
df1 = df1.astype({"LNAME": str, "FNAME": str})

### Joining the LAST_NAME and FIRST_NAME columns with a "," and a SPACE
### and adding the new column to the end

In [None]:
# Build the "LAST, FIRST" text row by row, e.g. "DOE" + "JANE" -> "DOE, JANE".
# Both columns must already hold strings for the join to work.
name_columns = df1[['LNAME', 'FNAME']]
df1['NAME'] = name_columns.apply(lambda row: ', '.join(row.to_list()), axis=1)

### will fuzzymatch the "NAME" column with the Alpha Roster "A_NAME"
### column in the other file

In [None]:
# First ten rows, now including the joined NAME column.
df1.head(n=10)

In [None]:
# Row-count check: should still match the earlier counts.
df1.shape[0]

___

## Working the ALPHA file (HR DATA)
Only interested here in getting the ECI# number and the "official" spelling of the person's name.  

In [None]:
# Preview read: load the HR file with its own header row to see the layout.
# df2 is read again a few cells later with only the needed columns.
df2 = pd.read_csv(a_file)
df2.head(n=2)

In [None]:
# Positional column in the HR CSV -> column name used in this notebook.
a_header = {
    0: "LOA",
    1: "A_NAME",
    3: "ECI",
}

In [None]:
# Echo the mapping to confirm it before it is used to rename the columns.
a_header

In [None]:
# Re-read the HR file keeping only the mapped columns by position: the file's
# own header row is skipped (skiprows=1, header=None) and the positional
# labels are then replaced with the names from a_header.
df2 = pd.read_csv(
    a_file,
    header=None,
    skiprows=1,
    usecols=list(a_header),
).rename(columns=a_header)

In [None]:
# First ten rows of the HR data.
df2.head(n=10)

In [None]:
# Per column: total row count minus the number of distinct values.
# A non-zero figure means that column has repeated (or missing) entries.
row_count = len(df2)
dfunique = row_count - df2.nunique()
print(dfunique)

In [None]:
# df2.drop_duplicates(subset=['NAME_ALPHA'],inplace=True)
# df2.drop_duplicates(subset=['NAME_PACS'])
#df2.head(5)

In [None]:
# Cast the official-name column to str (astype raises on failure by default).
df2 = df2.astype({"A_NAME": str})
# df1 = df1.astype({"FNAME": str}, errors='raise') 

In [None]:
# df2['A_NAME'] = df2[['A_LNAME','A_FNAME']].apply(lambda x: ', '.join(x), axis=1)

In [None]:
# First five rows of the HR data (head() defaults to 5).
df2.head()

In [None]:
# First five rows of the PARS data, for side-by-side comparison.
df1.head()

___

### Here is where the Fuzzy Matching begins.
Below are 2 lines of test code.

In [None]:
# Spot check: best match for one sample name among the official HR names,
# keeping only results that score at least 90.
process.extractOne(
    "HUGHES, A",
    df2['A_NAME'].tolist(),
    score_cutoff=90,
)

In [None]:
# Spot check in the other direction: best match for one sample name among the
# joined PARS names, keeping only results that score at least 90.
process.extractOne(
    "HUGHES, C",
    df1['NAME'].tolist(),
    score_cutoff=90,
)

### Below is the actual Fuzzy Match
(This can take a few minutes to finish)

In [None]:
# df1['name_from_df2'] = df1['P_NAME'].apply(lambda x: process.extractOne(x, df2['A_NAME'].to_list(),score_cutoff=90))
# df2['PAR_NAME'] = df2['A_NAME'].apply(lambda x: process.extractOne(x, df1['NAME'].to_list(),score_cutoff=90))
# Build the list of official names once.  The original rebuilt this list from
# df2 inside the lambda for every row of df1, which is pure repeated work.
alpha_choices = df2['A_NAME'].to_list()

# For each keyed-in NAME, look up the best official name scoring at least 90.
# Each result is unpacked in a later cell: it is either a match whose first
# element is the matched name, or None when nothing reaches the cutoff.
df1['ALPHA_NAME'] = df1['NAME'].apply(
    lambda keyed_name: process.extractOne(keyed_name, alpha_choices, score_cutoff=90)
)
# df2.head(5)

In [None]:
# First 25 rows with the raw match results.
df1.head(n=25)

The code below takes the ALPHA_NAME list and converts it to an actual column in the dataframe.

In [None]:
# Each ALPHA_NAME entry is either a match tuple whose first element is the
# matched name, or None when no match reached the cutoff.  Keep just the name.
# Anything that is not a tuple (None, or a name already unpacked by an earlier
# run of this cell) is passed through unchanged, so re-running the cell no
# longer cuts every name down to its first character.
temp_name_list = [
    match[0] if isinstance(match, tuple) else match
    for match in df1['ALPHA_NAME'].to_list()
]
df1['ALPHA_NAME'] = temp_name_list

In [None]:
# First 15 rows, with ALPHA_NAME now holding just the matched name.
df1.head(n=15)

In [None]:
# df2.drop(columns=['PACR_NAME']) # Ooops

In [None]:
# Final row-count check before writing to the database.
df1.shape[0]

___
### Put the dataframe into the database

In [None]:
# Write the matched frame to table 'fuzz', replacing any existing table of
# that name; the DataFrame index is not stored.
df1.to_sql(
    name='fuzz',
    con=dbconn,
    if_exists='replace',
    index=False,
)

___
### I used this query in SQLite to match things up
### where `LOAAlpha` is a table with all COMPANY employees
### with the LOA and ECI #'s

```
SELECT f.NUMBER,
       l.LOA,
       l.ECI,
       f.ALPHA_NAME
  FROM fuzz f
       LEFT JOIN
       LOAAlpha L ON f.ALPHA_NAME = l.NAME;

```
and that's it