## Feature Creation

This file is intended to take a data frame of compared names and give numeric features for our model.

In [4]:
# Import necessary packages.
import jellyfish
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import euclidean_distances

In [37]:
# Define the function to break names into features.
def VectorizeName(string, nparray=True):
    '''
    Takes a string and converts it into a numpy array of features. This assumes a name of length 3,
    namely "first middle last". If you pass only one thing, it assumes a first name. Two things, first and last.
    If you pass more than three things, it will take the first and the last words then combine all words in the
    middle into one.
    
    Paramters
    ---------
    string  -  The name to be converted.
    nparray -  Whether to return a list or an numpy array.
    
    Returns
    -------
    Either a list or a numpy array of features.
    '''
    # Ensure the string is lowercase.
    string = string.lower()
    
    # Initialize the array.
    vals = []
    
    # Get each value.
    allwords = re.findall('\w+', string)
    
    # Assign first name.
    first = allwords[0]
    
    # Assign last name if not the same as first.
    if first != allwords[-1]:
        last = allwords[-1]
    else:
        last = ''
    
    # Assign middle name, smashing all words in between first and last together.
    middle = ''
    for x in allwords[1:-1]:
        middle = middle + x
        
    # Define a mapping for letters to numbers.
    mapping = {'a' : 1, 'b' : 2, 'c' : 3, 'd' : 4, 'e' : 5, 'f' : 6, 'g' : 7, 'h' : 8, 'i' : 9, 'j' : 10, 'k' : 11, 'l' : 12,
                  'm' : 13, 'n' : 14, 'o' : 15, 'p' : 16, 'q' : 17, 'r' : 18, 's' : 19, 't' : 20, 'u' : 21, 'v' : 22, 'w' : 23,
                  'x' : 24, 'y' : 25, 'z' : 26}
    
    # Loop over the various names
    for name in [first, middle, last]:
        # If nothing for any of the names, return zeros.
        if len(name) == 0:
            for x in range(34):
                vals.append(0)
            
            continue
        
        # Get the length of the string.
        vals.append(len(name))

        # Get the number of each letter.
        for x, y in enumerate('abcdefghijklmnopqrstuvwxyz'):
            vals.append(name.count(y))

        # Map the first three and last letters to numbers.
        vals.append(mapping[name[0]])
        vals.append(mapping[name[1]])
        vals.append(mapping[name[2]])
        vals.append(mapping[name[-1]])

        # Get the soundex value.
        sound = jellyfish.soundex(name)

        # Convert the soundex value to a number.
        vals.append((mapping[sound[0].lower()]-1)*7*7*7 + int(sound[1])*7*7 + int(sound[2])*7 + (int(sound[3])+1))

        # Initialize the number of vowels and consonants.
        vowels = 0
        const = 0

        # Count the number of vowels and consonants.
        for x in name:
            if x in 'aeiouy':
                vowels += 1
            else:
                const += 1

        vals.append(vowels)
        vals.append(const)
        
    # Get the number of words.
    vals.append(len(allwords))

    # Normalize the data.
    #for x in range(len(vals)):
    #    vals[x] = vals[x]/(10**(len(str(vals[x]))))
    
    # Return the values.
    if nparray == True:
        return np.array(vals)
    else:
        return list(vals)

In [40]:
# Show an example.
VectorizeName('Tanner Scott Eastmond')

array([   6,    1,    0,    0,    0,    1,    0,    0,    0,    0,    0,
          0,    0,    0,    2,    0,    0,    0,    1,    0,    1,    0,
          0,    0,    0,    0,    0,   20,    1,   14,   18, 6805,    2,
          4,    5,    0,    0,    1,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,    0,    0,    0,    1,    2,
          0,    0,    0,    0,    0,    0,   19,    3,   15,   20, 6322,
          1,    4,    8,    1,    0,    0,    1,    1,    0,    0,    0,
          0,    0,    0,    0,    1,    1,    1,    0,    0,    0,    1,
          1,    0,    0,    0,    0,    0,    0,    5,    1,   19,    4,
       1497,    3,    5,    3])

In [3]:
# Read in the data.
df = pd.DataFrame([['Catherine', 'Catharine', True], ['Catherine', 'Katie', True], ['Catherine', 'Kate', True]], columns=['name1', 'name2', 'match'])

In [4]:
# Vectorize the names.
df['name1vec'] = df.name1.apply(Vectorize)
df['name2vec'] = df.name2.apply(Vectorize)

In [5]:
# Get the features.
feat = []
for x in range(len(df.name1vec.iloc[0])):
    # Initialize the column.
    df['feature{}'.format(x)] = 0
    
    # Make a list of feature names.
    feat.append('feature{}'.format(x))

In [6]:
# Assign the features.
df[feat]= list((df.name1vec.values - df.name2vec.values)**2)

In [7]:
# Print the data frame.
df.head(10)

Unnamed: 0,name1,name2,match,name1vec,name2vec,feature0,feature1,feature2,feature3,feature4,...,feature24,feature25,feature26,feature27,feature28,feature29,feature30,feature31,feature32,feature33
0,Catherine,Catharine,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[9, 2, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Catherine,Katie,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...",16,0,0,1,0,...,0,0,0,64,0,0,0,7273809,1,9
2,Catherine,Kate,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[4, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",25,0,0,1,0,...,0,0,0,64,0,0,0,7273809,4,9
