## Feature Creation

This notebook takes a data frame of paired names and produces the numeric features used by our model.

In [1]:
# Import necessary packages.
import jellyfish
import numpy as np
import pandas as pd
import re
import unicodedata
from sklearn.metrics.pairwise import euclidean_distances
from random import randint

We need two functions in particular: one to turn a name into a vector of features, and one to normalize the values in those vectors. We normalize by dividing each position by the maximum value found at that position across all of the vectors, including both the name and the match. This ultimately gives us values between 0 and 1.

In [2]:
# Define the function to break names into features.
def VectorizeName(string, nparray=True):
    '''
    Takes a string and converts it into a vector of numeric features. This assumes a name of length 3,
    namely "first middle last". If you pass only one word, it is treated as a first name. Two words are
    treated as first and last (even when both words are identical). If you pass more than three words,
    it takes the first and the last words and combines all words in the middle into one.

    Before splitting, the name is lowercased and stripped of digits, underscores, accents and any
    remaining non-ascii characters.

    For each of first, middle and last (in that order) 34 values are appended:
      - the length of the name,
      - the count of each letter a-z (26 values),
      - the alphabet positions (a=1 ... z=26) of the first three letters and of the last letter,
      - the soundex code converted to a single integer,
      - the number of vowels ('aeiouy') and the number of remaining characters.
    One final value holds the number of words, giving 103 values in total.

    Note: letter slots that the name is too short to fill (all four when the name is empty) and the
    soundex value of an empty name are filled with random integers, so the output is not deterministic
    for short or missing names.

    Parameters
    ----------
    string  -  The name to be converted.
    nparray -  Whether to return a numpy array (default) or a list.

    Returns
    -------
    Either a list or a numpy array of features.
    '''
    # Ensure the string is lowercase.
    string = string.lower()

    # Clean out any numbers and underscores. Raw string so '\d' is not an invalid escape sequence.
    string = re.sub(r'\d', '', string)
    string = re.sub('_', '', string)

    # Clean out accents, tildes, etc. by decomposing and dropping the combining marks.
    string = ''.join(c for c in unicodedata.normalize('NFD', u'{}'.format(string))
                     if unicodedata.category(c) != 'Mn')

    # Clean out non-ascii characters.
    string = string.encode('ascii', errors='ignore').decode()

    # Split into words; a name with no usable characters has zero words.
    words = re.findall(r'\w+', string)
    numwords = len(words)

    # Assign first name.
    first = words[0] if words else u''

    # Assign last name whenever there is more than one word. Deciding on the word count (rather than
    # on whether the last word differs from the first) keeps the last name of e.g. "John John".
    last = words[-1] if numwords > 1 else u''

    # Assign middle name, smashing all words in between first and last together.
    middle = u''.join(words[1:-1])

    # Define a mapping for letters to numbers (a=1 ... z=26).
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    mapping = {letter: position for position, letter in enumerate(alphabet, 1)}

    # Initialize the list of features.
    vals = []

    # Loop over the various names.
    for name in (first, middle, last):
        # Get the length of the string.
        vals.append(len(name))

        # Get the number of each letter.
        vals.extend(name.count(letter) for letter in alphabet)

        # Map the first three and last letters to numbers, drawing a random
        # placeholder for every slot the name is too short to fill.
        positions = [mapping[letter] for letter in name[:3]]
        while len(positions) < 3:
            positions.append(randint(1, 26))
        positions.append(mapping[name[-1]] if name else randint(1, 26))
        vals.extend(positions)

        # Get the soundex value.
        sound = jellyfish.soundex(u'{}'.format(name))

        # Convert the soundex value to a number: the leading letter selects a block of 7*7*7 and
        # the three trailing digits are combined as base-7 digits (plus one).
        if sound != '':
            vals.append((mapping[sound[0].lower()]-1)*7*7*7 + int(sound[1])*7*7 + int(sound[2])*7 + (int(sound[3])+1))
        else:
            vals.append(randint(1, 8576))

        # Count the number of vowels; every other character counts as a consonant.
        vowels = sum(1 for letter in name if letter in 'aeiouy')
        vals.append(vowels)
        vals.append(len(name) - vowels)

    # Get the number of words.
    vals.append(numwords)

    # Return the values.
    if nparray:
        return np.array(vals)
    return list(vals)

In [3]:
def Normalize(vec1, vec2):
    '''
    Normalize each element of the vectors based on the maximum value at that position anywhere in either input.
    Both parameters should be pandas columns (or any iterables) of equal-length lists or arrays.

    A position whose maximum is zero is left at zero rather than being divided by zero.

    Parameters
    ----------
    vec1  -  First column of values.
    vec2  -  Second column of values.

    Returns
    -------
    Two lists of numpy arrays, one array per input row.
    '''
    # Transpose so that each entry holds every row's value at one position.
    cols1 = list(zip(*vec1))
    cols2 = list(zip(*vec2))

    # Get the maximum and divide the values.
    for idx, col1 in enumerate(cols1):
        col2 = cols2[idx]

        # Largest value at this position across both inputs.
        max_val = float(np.array(col1 + col2).max())

        # A position that is zero everywhere has nothing to scale. Dividing by one keeps it at zero
        # instead of producing nan (numpy values) or raising ZeroDivisionError (plain numbers).
        if max_val == 0:
            max_val = 1.0

        cols1[idx] = [y / max_val for y in col1]
        cols2[idx] = [y / max_val for y in col2]

    # Transpose back and get the numpy arrays, one per row.
    arr1 = [np.array(row) for row in zip(*cols1)]
    arr2 = [np.array(row) for row in zip(*cols2)]

    # Return the arrays.
    return arr1, arr2

Here is one example of what the vectors look like, before the values are normalized.

In [4]:
# Show an example: the raw (not yet normalized) feature vector for a three-word name.
# The bare call is left as the cell's last expression so the notebook displays the result.
VectorizeName(u'Tanner Scott Eastmond')

array([   6,    1,    0,    0,    0,    1,    0,    0,    0,    0,    0,
          0,    0,    0,    2,    0,    0,    0,    1,    0,    1,    0,
          0,    0,    0,    0,    0,   20,    1,   14,   18, 6805,    2,
          4,    5,    0,    0,    1,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,    0,    0,    0,    1,    2,
          0,    0,    0,    0,    0,    0,   19,    3,   15,   20, 6322,
          1,    4,    8,    1,    0,    0,    1,    1,    0,    0,    0,
          0,    0,    0,    0,    1,    1,    1,    0,    0,    0,    1,
          1,    0,    0,    0,    0,    0,    0,    5,    1,   19,    4,
       1497,    3,    5,    3])

We built the true and false names separately, so we need to read them in and append them together.

In [5]:
# Read in the data (at most 100,000 rows from each file).
df = pd.read_csv(r'C:\Users\tanne\Downloads\true_names.csv', encoding='utf-8', nrows=100000)
false = pd.read_csv(r'C:\Users\tanne\Downloads\false_names2.csv', encoding='utf-8', nrows=100000)

# Keep the correct columns for false and rename them to match the capitalized names used in df.
false = false[['fsid', 'full1', 'first1', 'mid1', 'last1', 'full2', 'first2', 'mid2', 'last2', 'match']]
false.columns = ['fsid', 'Full1', 'First1', 'Mid1', 'Last1', 'Full2', 'First2', 'Mid2', 'Last2', 'Match']

# Append the data together. DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True stacks the rows and renumbers the index in the same way.
df = pd.concat([df, false], ignore_index=True)

In [6]:
# Print the first rows of the combined data (bare expression so the notebook displays it).
df.head()

Unnamed: 0,fsid,Full1,First1,Mid1,Last1,Full2,First2,Mid2,Last2,Match
0,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Catharine Englehart,Catharine,,Englehart,1
1,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Katie Englehart,Katie,,Englehart,1
2,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1
3,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1
4,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1


First we will clean out 'nan' values and replace them with an empty string, then convert the names to vectors and normalize the values.

In [7]:
# Replace missing name parts with empty strings.
name_cols = ['Full1', 'Full2', 'First1', 'First2', 'Mid1', 'Mid2', 'Last1', 'Last2']
df[name_cols] = df[name_cols].fillna('')

In [8]:
# Turn each full name into its feature vector (Full1 first, then Full2).
vector_sources = {'Full1vec': 'Full1', 'Full2vec': 'Full2'}
for target, source in vector_sources.items():
    df[target] = df[source].apply(VectorizeName)

In [9]:
# Scale both vector columns by the per-position maximum, then store them back.
normed1, normed2 = Normalize(df['Full1vec'], df['Full2vec'])
df['Full1vec'] = normed1
df['Full2vec'] = normed2

Since our features will ultimately be the squared differences between corresponding elements of the two vectors, we first initialize one column per element and keep a list of the column names so we can assign the values below.

In [10]:
# Build the feature column names, one per element of a name vector.
vector_length = len(df.Full1vec.iloc[0])
fullfeat = ['FullFeature{}'.format(position) for position in range(vector_length)]

# Initialize every feature column to zero.
for column in fullfeat:
    df[column] = 0

In [11]:
# Each feature is the squared difference between the two vectors at one position.
squared_diff = (df.Full1vec.values - df.Full2vec.values)**2
df[fullfeat] = list(squared_diff)

In [12]:
# Print the first five rows of the data frame, now including the feature columns.
df.head(5)

Unnamed: 0,fsid,Full1,First1,Mid1,Last1,Full2,First2,Mid2,Last2,Match,...,FullFeature93,FullFeature94,FullFeature95,FullFeature96,FullFeature97,FullFeature98,FullFeature99,FullFeature100,FullFeature101,FullFeature102
0,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Catharine Englehart,Catharine,,Englehart,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Katie Englehart,Katie,,Englehart,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Lastly we will save the data for use with our model.

In [13]:
# Keep the identifier, the label and every column whose name contains 'Full', then save.
full_columns = [col for col in df.columns if 'Full' in col]
keep_columns = ['fsid', 'Match'] + full_columns
df = df[keep_columns]
df.to_csv(r'C:\Users\tanne\Downloads\names_final.csv', index=False, encoding='utf-8')