In [1]:
# Import necessary packages.
import jellyfish
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
# Define the function to break names into features.
def Vectorize(string, nparray=True):
    '''
    Takes a string and converts it into a fixed-length vector of 34 features.

    Feature layout
    --------------
    0      -  Length of the lowercased string.
    1-26   -  Number of occurrences of each letter 'a' through 'z'.
    27-29  -  Alphabet position (a=1 ... z=26) of the first, second and third
              characters; 0 when the character is missing or is not a-z.
    30     -  Alphabet position of the last character (0 as above).
    31     -  The soundex code packed into a single integer; 0 when the code
              is not one a-z letter followed by three digits.
    32     -  Number of vowels (y is counted as a vowel).
    33     -  Number of consonants, i.e. alphabetic characters that are not
              vowels. Spaces, digits and punctuation are not counted.

    Parameters
    ----------
    string  -  The string to be converted.
    nparray -  Whether to return a numpy array (True) or a list (False).

    Returns
    -------
    Either a list or a numpy array of features.
    '''
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    vowel_letters = 'aeiouy'

    # Ensure the string is lowercase.
    string = string.lower()

    # Map each letter to its 1-based position in the alphabet.
    mapping = {letter: position for position, letter in enumerate(alphabet, start=1)}

    # Initialize the array.
    vals = [0]*34

    # Get the length of the string.
    vals[0] = len(string)

    # Get the number of each letter.
    for position, letter in enumerate(alphabet, start=1):
        vals[position] = string.count(letter)

    # Map the first three and last letters to numbers. Slicing (rather than
    # indexing) keeps short or empty strings from raising, and .get() leaves
    # the slot at 0 for characters outside a-z.
    for offset, char in enumerate(string[:3]):
        vals[27 + offset] = mapping.get(char, 0)
    vals[30] = mapping.get(string[-1:], 0)

    # Get the soundex value.
    sound = jellyfish.soundex(string)

    # Convert the soundex value to a number. Only pack codes shaped like one
    # a-z letter followed by three digits; anything else leaves the slot at 0.
    initial = mapping.get(sound[:1].lower(), 0)
    digits = sound[1:4]
    if initial and len(digits) == 3 and digits.isdecimal():
        first, second, third = (int(digit) for digit in digits)
        vals[31] = (initial - 1)*7*7*7 + first*7*7 + second*7 + (third + 1)

    # Count the number of vowels and consonants. Only alphabetic characters
    # can be consonants, so separators such as "-" or " " are ignored.
    vals[32] = sum(1 for char in string if char in vowel_letters)
    vals[33] = sum(1 for char in string if char.isalpha() and char not in vowel_letters)

    # Return the values.
    if nparray:
        return np.array(vals)
    return list(vals)

In [3]:
# Build the sample frame of name pairs, each labelled as a match.
df = pd.DataFrame({
    'name1': ['Catherine', 'Catherine', 'Catherine'],
    'name2': ['Catharine', 'Katie', 'Kate'],
    'match': [True, True, True],
})

In [4]:
# Turn each name column into a companion column of feature vectors.
for source in ('name1', 'name2'):
    df[source + 'vec'] = df[source].apply(Vectorize)

In [5]:
# Name one column per vector component (sized from the first row's vector)
# and create each of those columns filled with zeros.
vector_length = len(df['name1vec'].iloc[0])
feat = ['feature{}'.format(position) for position in range(vector_length)]
for column in feat:
    df[column] = 0

In [6]:
# Fill the feature columns with the squared element-wise difference
# between each pair of name vectors.
vector_gap = df['name1vec'].values - df['name2vec'].values
df[feat] = list(vector_gap**2)

In [7]:
# Show up to the first ten rows of the frame.
df.head(n=10)

Unnamed: 0,name1,name2,match,name1vec,name2vec,feature0,feature1,feature2,feature3,feature4,...,feature24,feature25,feature26,feature27,feature28,feature29,feature30,feature31,feature32,feature33
0,Catherine,Catharine,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[9, 2, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Catherine,Katie,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...",16,0,0,1,0,...,0,0,0,64,0,0,0,7273809,1,9
2,Catherine,Kate,True,"[9, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[4, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",25,0,0,1,0,...,0,0,0,64,0,0,0,7273809,4,9
