## Feature Creation

This notebook takes a data frame of paired names and turns each pair into numeric features for our model.

In [1]:
# Import necessary packages.
import jellyfish
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import euclidean_distances

In [8]:
# Define the function to break names into features.

# Letters that get an individual count feature, in feature order.
_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

# Map each letter to its 1-based alphabet position; 0 is reserved for "no letter".
_LETTER_CODES = {letter: position for position, letter in enumerate(_ALPHABET, start=1)}

# Number of features produced for each name part (first, middle, last):
# 1 length + 26 letter counts + 4 letter positions + 1 soundex + 2 vowel/consonant counts.
_FEATURES_PER_PART = 34


def _letter_code(char):
    '''Return the 1-26 alphabet position of char, or 0 if it is not one of a-z.'''
    return _LETTER_CODES.get(char, 0)


def _soundex_code(name):
    '''Fold the soundex code of name into one integer, or 0 if it cannot be encoded.'''
    sound = jellyfish.soundex(name)

    # The encoding below expects one a-z letter followed by three digits.
    letter = _letter_code(sound[:1].lower())
    if letter == 0:
        return 0

    try:
        # Raises ValueError both for non-digit characters and for a code that is too short.
        d1, d2, d3 = (int(digit) for digit in sound[1:4])
    except ValueError:
        return 0

    # Treat the code as a base-7 number; the trailing +1 keeps every valid code
    # above 0, which is the value used for a missing name part.
    return (letter - 1) * 7 * 7 * 7 + d1 * 7 * 7 + d2 * 7 + (d3 + 1)


def _name_features(name):
    '''Return the list of 34 numeric features describing a single name part.'''
    # A missing name part contributes all zeros.
    if not name:
        return [0] * _FEATURES_PER_PART

    # Length of the name.
    features = [len(name)]

    # Number of occurrences of each letter a-z.
    features.extend(name.count(letter) for letter in _ALPHABET)

    # First three letters (zero-padded for short names) and the last letter.
    # Characters outside a-z (e.g. accented letters) are coded as 0.
    leading = [_letter_code(char) for char in name[:3]]
    leading += [0] * (3 - len(leading))
    features.extend(leading)
    features.append(_letter_code(name[-1]))

    # Soundex value as a number.
    features.append(_soundex_code(name))

    # Vowel count ('y' counts as a vowel) and the count of everything else.
    vowels = sum(char in 'aeiouy' for char in name)
    features.append(vowels)
    features.append(len(name) - vowels)

    return features


def VectorizeName(string, nparray=True):
    '''
    Takes a string and converts it into a numpy array of features. This assumes a name of length 3,
    namely "first middle last". If you pass only one word, it is treated as a first name. Two words are
    treated as first and last. If you pass more than three words, the first and the last words are kept
    and all words in between are combined into a single middle name.

    The string is lowercased, and digits and underscores are removed before it is split into words.
    Each of the three name parts yields 34 features (all zeros when the part is missing), and the
    number of words is appended at the end, for 103 features in total. Values are not normalized.

    Characters outside a-z (for example accented letters) map to 0 in the letter-position features
    instead of raising a KeyError. A last name that is identical to the first name is kept as the
    last name whenever more than one word was passed.

    Parameters
    ----------
    string  -  The name to be converted.
    nparray -  Whether to return a numpy array (True) or a list (False).

    Returns
    -------
    Either a list or a numpy array of features.
    '''
    # Ensure the string is lowercase.
    string = string.lower()

    # Clean out any digits and underscores (both would otherwise match \w below).
    string = re.sub(r'[\d_]', '', string)

    # Get each word; fall back to a single empty word so indexing below is safe.
    allwords = re.findall(r'\w+', string) or ['']

    # Assign first name.
    first = allwords[0]

    # Assign last name only when there is more than one word.
    last = allwords[-1] if len(allwords) > 1 else ''

    # Assign middle name, smashing all words in between first and last together.
    middle = ''.join(allwords[1:-1])

    # Collect the features of each name part in a fixed order.
    vals = []
    for name in (first, middle, last):
        vals.extend(_name_features(name))

    # Get the number of words.
    vals.append(len(allwords))

    # Return the values.
    if nparray:
        return np.array(vals)
    return vals

In [3]:
# Show an example: a three-word name fills the first, middle and last feature groups
# (34 values each), followed by the word count, for 103 values in total.
VectorizeName('Tanner Scott Eastmond')

array([   6,    1,    0,    0,    0,    1,    0,    0,    0,    0,    0,
          0,    0,    0,    2,    0,    0,    0,    1,    0,    1,    0,
          0,    0,    0,    0,    0,   20,    1,   14,   18, 6805,    2,
          4,    5,    0,    0,    1,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,    0,    0,    0,    1,    2,
          0,    0,    0,    0,    0,    0,   19,    3,   15,   20, 6322,
          1,    4,    8,    1,    0,    0,    1,    1,    0,    0,    0,
          0,    0,    0,    0,    1,    1,    1,    0,    0,    0,    1,
          1,    0,    0,    0,    0,    0,    0,    5,    1,   19,    4,
       1497,    3,    5,    3])

In [4]:
# Read in the data.
# The commented-out line below builds a three-row toy frame instead of reading the CSV.
# NOTE(review): its columns (name1/name2/match) differ from the Full1/First1/... columns
# used by the later cells, so it would need adapting before use.
#df = pd.DataFrame([['Catherine', 'Catharine', True], ['Catherine', 'Katie', True], ['Catherine', 'Kate', True]], columns=['name1', 'name2', 'match'])
# NOTE(review): absolute, machine-specific Windows path; point this at your own copy of the file.
df = pd.read_csv(r'C:\Users\tanne\downloads\api_scrape_clean.csv')

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,fsid,Full1,First1,Mid1,Last1,Full2,First2,Mid2,Last2,Match
0,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Catharine Englehart,Catharine,,Englehart,1
1,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Katie Englehart,Katie,,Englehart,1
2,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Kate Wagner,Kate,,Wagner,1
3,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1
4,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1


In [6]:
# Replace missing values in every name column with an empty string,
# because VectorizeName calls .lower() on each value it receives.
name_columns = ['Full1', 'Full2', 'First1', 'First2', 'Mid1', 'Mid2', 'Last1', 'Last2']
df[name_columns] = df[name_columns].fillna('')

In [9]:
# Vectorize the names: each name column gets a companion '<column>vec' column
# holding the feature array that VectorizeName returns for that row.
for column in ['Full1', 'Full2', 'First1', 'First2', 'Mid1', 'Mid2', 'Last1', 'Last2']:
    df[column + 'vec'] = df[column].apply(VectorizeName)

In [10]:
# Get the features.
# One list of output column names per name part, filled in below.
fullfeat, firstfeat, midfeat, lastfeat = [], [], [], []

# Pair each column-name template with the list that collects its names.
feature_groups = [
    ('FullFeature{}', fullfeat),
    ('FirstFeature{}', firstfeat),
    ('MidFeature{}', midfeat),
    ('LastFeature{}', lastfeat),
]

# VectorizeName always returns the same number of values, so the first row
# gives the number of features.
feature_count = len(df['Full1vec'].iloc[0])

for index in range(feature_count):
    for template, names in feature_groups:
        column = template.format(index)

        # Initialize the column and remember its name.
        df[column] = 0
        names.append(column)

In [11]:
# Assign the features.
# Each *vec column holds one numpy array per row (VectorizeName returns an array by default),
# so subtracting the two .values arrays gives the per-row, element-wise difference between
# the two names' feature vectors. Squaring makes every difference non-negative.
# Each resulting row vector is spread across the feature columns initialized above.
df[fullfeat]= list((df.Full1vec.values - df.Full2vec.values)**2)
df[firstfeat]= list((df.First1vec.values - df.First2vec.values)**2)
df[midfeat]= list((df.Mid1vec.values - df.Mid2vec.values)**2)
df[lastfeat]= list((df.Last1vec.values - df.Last2vec.values)**2)

In [12]:
# Preview the first ten rows, now including the squared-difference feature columns.
df.head(10)

Unnamed: 0,fsid,Full1,First1,Mid1,Last1,Full2,First2,Mid2,Last2,Match,...,MidFeature100,LastFeature100,FullFeature101,FirstFeature101,MidFeature101,LastFeature101,FullFeature102,FirstFeature102,MidFeature102,LastFeature102
0,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Catharine Englehart,Catharine,,Englehart,1,...,0,0,0,0,0,0,0,0,0,0
1,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Katie Englehart,Katie,,Englehart,1,...,0,0,0,0,0,0,0,0,0,0
2,LDBJ-136,Catherine Englehart,Catherine,,Englehart,Kate Wagner,Kate,,Wagner,1,...,0,0,4,0,0,0,0,0,0,0
3,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
4,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
5,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
6,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip Fuller,Philip,,Fuller,1,...,0,0,0,0,0,0,1,0,0,0
7,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
8,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Philip E Fuller,Philip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
9,LR4C-C64,Philip Ely Fuller,Philip,Ely,Fuller,Phillip E Fuller,Phillip,E,Fuller,1,...,0,0,0,0,0,0,0,0,0,0
