## Model

Great! Now that we have scraped our data, cleaned it, and prepared the features, we are ready to run a few different models to get the weights we need.

In [1]:
# Import necessary packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import unicodedata
from random import randint
import jellyfish
import sys
#sys.path.append(r'R:\JoePriceResearch\Python\Anaconda3\Lib\site-packages')

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
# Define the function to break names into features.
def VectorizeName(string, nparray=True):
    '''
    Takes a string and converts it into a numeric feature vector. This assumes a name of length 3,
    namely "first middle last". If you pass only one word, it is treated as a first name. Two words,
    first and last. If you pass more than three words, the first and the last words are kept and all
    words in between are combined into one middle name.

    Parameters
    ----------
    string  -  The name to be converted.
    nparray -  Whether to return a numpy array (True) or a list (False).

    Returns
    -------
    Either a list or a numpy array of 103 features: 34 for each of the first, middle, and last
    names (length, 26 letter counts, first three letters and last letter coded 1-26, soundex
    number, vowel count, non-vowel count), followed by the number of words.

    Notes
    -----
    Letter positions that do not exist (names shorter than three letters) and empty soundex values
    are filled with random draws from randint, so short or missing name parts are not vectorized
    deterministically.
    '''
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Ensure the string is lowercase.
    string = string.lower()

    # Clean out any numbers and underscores. The digit pattern is a raw string so the
    # backslash escape reaches the regex engine without an invalid-escape warning.
    string = re.sub(r'\d', '', string)
    string = string.replace('_', '')

    # Clean out accents, tildes, etc. by decomposing and dropping the combining marks.
    string = ''.join(c for c in unicodedata.normalize('NFD', u'{}'.format(string))
                     if unicodedata.category(c) != 'Mn')

    # Clean out non-ascii characters.
    string = string.encode('ascii', errors='ignore').decode()

    # Split into words; an empty input is treated as a single empty word.
    allwords = re.findall(r'\w+', string)
    if len(allwords) == 0:
        allwords = [u'']

    # Assign first name.
    first = allwords[0]

    # Assign last name only when there is more than one word. Checking the word count (rather
    # than comparing the text to the first name) keeps a last name that equals the first name.
    last = allwords[-1] if len(allwords) > 1 else u''

    # Assign middle name, smashing all words in between first and last together.
    middle = u''.join(allwords[1:-1])

    # Define a mapping for letters to numbers (a -> 1, ..., z -> 26).
    mapping = {letter: position for position, letter in enumerate(alphabet, start=1)}

    # Initialize the feature list.
    vals = []

    # Loop over the various names.
    for name in (first, middle, last):
        # Get the length of the string.
        vals.append(len(name))

        # Get the number of each letter.
        vals.extend(name.count(letter) for letter in alphabet)

        # Map the first three letters to numbers, padding missing positions with random letters.
        leading = [mapping[c] for c in name[:3]]
        leading.extend(randint(1, 26) for _ in range(3 - len(leading)))
        vals.extend(leading)

        # Map the last letter to a number (random when the name is empty).
        vals.append(mapping[name[-1]] if name else randint(1, 26))

        # Get the soundex value.
        sound = jellyfish.soundex(u'{}'.format(name))

        # Convert the soundex value (a letter followed by three digits) to a number.
        if sound != '':
            vals.append((mapping[sound[0].lower()]-1)*7*7*7 + int(sound[1])*7*7 + int(sound[2])*7 + (int(sound[3])+1))
        else:
            vals.append(randint(1,8576))

        # Count the vowels (y included); every other character counts as a consonant.
        vowels = sum(1 for c in name if c in 'aeiouy')
        vals.append(vowels)
        vals.append(len(name) - vowels)

    # Get the number of words.
    vals.append(len(allwords) if allwords != [u''] else 0)

    # Return the values.
    if nparray:
        return np.array(vals)
    return list(vals)

In [3]:
def Normalize(vec1, vec2):
    '''
    Scale each feature position by the largest value found at that position in either input.
    Both parameters should be pandas columns (or other iterables) of equal-length numeric vectors.

    Parameters
    ----------
    vec1  -  First column of vectors.
    vec2  -  Second column of vectors.

    Returns
    -------
    Two lists of numpy float arrays, one array per input row, in the original row order.
    '''
    # Convert every row to a float array.
    rows1 = [np.array(row, dtype=float) for row in vec1]
    rows2 = [np.array(row, dtype=float) for row in vec2]

    # Nothing to scale.
    if not rows1 and not rows2:
        return [], []

    # Get the maximum at each position across both inputs.
    max_vals = np.array(rows1 + rows2).max(axis=0)

    # A position whose maximum is zero would divide by zero; leave those values unscaled.
    max_vals[max_vals == 0] = 1.0

    # Divide the values.
    arr1 = [row / max_vals for row in rows1]
    arr2 = [row / max_vals for row in rows2]

    # Return the arrays.
    return arr1, arr2

Below we read in the data and fix the 'Match' column, since a few of its values were oddly coded.

In [4]:
# Load the prepared training data from disk.
names_path = r'C:\Users\tanne\Downloads\names_final.csv'
df = pd.read_csv(names_path)

# Recode the inconsistently coded 'Match' values.
# Any value that is not a key of this mapping becomes NaN.
match_recode = {1 : 0, '0' : 1, 'match' : 1, 0 : 1}
df['Match'] = df['Match'].map(match_recode)

  interactivity=interactivity, compiler=compiler, result=result)


Let's get the columns we need and make a train/test split so we can validate what we are doing.

In [5]:
# Separate the label from the features (every column from the seventh onward),
# then drop the full frame so it does not stay in memory.
feature_cols = df.columns[6:]
target = df['Match'].astype(int)
train = df[feature_cols]
del df

# Hold out 33 percent of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.33,
    random_state=42,
)

Okay, so we need to think about our model a little bit. We need to recover weights so that we can do the euclidean distances between the two sets of names, so linear regression is an obvious choice as the coefficients can be used easily as a type of weighting. Another is logit, whose coefficients can also be used directly. We are also going to try random forest and gradient boosting, then we will naively use the feature importances as weights and see what happens.

First we will start with linear regression and save the coefficients.

In [6]:
# Fit an ordinary least squares model on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Keep the fitted coefficients to use as weights later.
lr_coefs = lr.coef_

Next we will run logit, and in this case we can both save the coefficients and actually directly make predictions, so we will do that as well.

In [7]:
# Fit a logistic regression on the training split.
lo = LogisticRegression().fit(X_train, y_train)

# Predict on both splits.
train_predict = lo.predict(X_train)
test_predict = lo.predict(X_test)

# Report the MSE and the accuracy for each split.
reports = (
    ("Train MSE {}", mean_squared_error, y_train, train_predict),
    ("Test MSE {}", mean_squared_error, y_test, test_predict),
    ("Train Accuracy {}", accuracy_score, y_train, train_predict),
    ("Test Accuracy {}", accuracy_score, y_test, test_predict),
)
for template, metric, actual, predicted in reports:
    print(template.format(metric(actual, predicted)))

# Keep the fitted coefficients to use as weights later.
lo_coefs = lo.coef_

Train MSE 0.04956716417910448
Test MSE 0.04863636363636364
Train Accuracy 0.9504328358208955
Test Accuracy 0.9513636363636364


Wow, we actually did really well with our prediction accuracy on the test set at about 95 percent. Let's pick up with random forest now and see how we do.

In [8]:
# Random Forest.
# Seed the forest (same seed as the train/test split) so the scores printed below and the
# saved feature importances are reproducible from one run to the next.
rf = RandomForestClassifier(random_state=42)

# Fit the model.
rf.fit(X_train, y_train)

# Get the predictions for the model.
train_predict = rf.predict(X_train)
test_predict = rf.predict(X_test)

# Print out the MSE and the accuracy.
print("Train MSE {}".format(mean_squared_error(y_train, train_predict)))
print("Test MSE {}".format(mean_squared_error(y_test, test_predict)))
print("Train Accuracy {}".format(accuracy_score(y_train, train_predict)))
print("Test Accuracy {}".format(accuracy_score(y_test, test_predict)))

# Get the feature importances (used later as naive weights).
rf_coefs = rf.feature_importances_

Train MSE 0.0011492537313432835
Test MSE 0.0155
Train Accuracy 0.9988507462686567
Test Accuracy 0.9845


Even better accuracy here, about 98 percent! Lastly we will try gradient boosting.

In [9]:
# Gradient boosting (scikit-learn's GradientBoostingClassifier, not the XGBoost library;
# the "xg" names are kept because later cells refer to them).
# Seed the model (same seed as the train/test split) so the results are reproducible.
xg = GradientBoostingClassifier(random_state=42)

# Fit the model.
xg.fit(X_train, y_train)

# Get the predictions for the model.
train2_predict = xg.predict(X_train)
test2_predict = xg.predict(X_test)

# Print out the MSE and the accuracy.
print("Train MSE {}".format(mean_squared_error(y_train, train2_predict)))
print("Test MSE {}".format(mean_squared_error(y_test, test2_predict)))
print("Train Accuracy {}".format(accuracy_score(y_train, train2_predict)))
print("Test Accuracy {}".format(accuracy_score(y_test, test2_predict)))

# Get the feature importances (used later as naive weights).
xg_coefs = xg.feature_importances_

Train MSE 0.02067910447761194
Test MSE 0.020727272727272726
Train Accuracy 0.9793208955223881
Test Accuracy 0.9792727272727273


Not quite as good as the random forest, but still a solid accuracy of just under 98 percent.

Now we are going to run an independent validation of our models using the saved weights. Our motivation here is that pairwise comparisons between names are often very slow, so if we can get the right weighted euclidean distance then we can use matrix multiplication to make the pairwise comparisons, which tends to be faster.

We read in the original data to recover the vectors and prepare it using the same functions.

In [10]:
# Validation.
# Read in the first 100,000 rows of each file of name pairs.
df = pd.read_csv(r'C:\Users\tanne\Downloads\true_names.csv', encoding='utf-8', nrows=100000)
false = pd.read_csv(r'C:\Users\tanne\Downloads\false_names2.csv', encoding='utf-8', nrows=100000)

# Keep the correct columns for false and rename.
false = false[['fsid', 'full1', 'first1', 'mid1', 'last1', 'full2', 'first2', 'mid2', 'last2', 'match']]
false.columns = ['fsid', 'Full1', 'First1', 'Mid1', 'Last1', 'Full2', 'First2', 'Mid2', 'Last2', 'Match']

# Append the data together. pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([df, false], ignore_index=True)

# Replace a few values. Any value that is not a key of this mapping becomes NaN.
df['Match'] = df.Match.map({1 : 0, '0' : 1, 'match' : 1, 0 : 1})

In [11]:
# Replace missing name parts with empty strings.
name_cols = ['Full1', 'Full2', 'First1', 'First2', 'Mid1', 'Mid2', 'Last1', 'Last2']
df[name_cols] = df[name_cols].fillna('')

In [12]:
# Turn each full name into its feature vector.
for source_col in ('Full1', 'Full2'):
    df[source_col + 'vec'] = df[source_col].apply(VectorizeName)

In [13]:
# Scale both sets of name vectors by their shared per-position maximum.
full1_scaled, full2_scaled = Normalize(df['Full1vec'], df['Full2vec'])
df['Full1vec'] = full1_scaled
df['Full2vec'] = full2_scaled

Now we will make weighted versions of the vectors using each of the different model weights.

In [14]:
# Weight every name vector element-wise by each model's weights.
# Multiplying the whole column by the weight array would pair rows with weights (and fails when
# the number of rows differs from the number of features), so the weights are applied to each
# row's vector explicitly. ravel() flattens the logit coefficients, which scikit-learn stores
# as a (1, n_features) array, so every weighted vector stays one-dimensional.
model_weights = [
    ('lr', lr_coefs),   # linear regression coefficients
    ('lo', lo_coefs),   # logit coefficients
    ('rf', rf_coefs),   # random forest feature importances
    ('xg', xg_coefs),   # gradient boosting feature importances
]
for prefix, coefs in model_weights:
    weights = np.array(coefs).ravel()
    df[prefix + '1'] = df.Full1vec.apply(lambda vec, w=weights: vec * w)
    df[prefix + '2'] = df.Full2vec.apply(lambda vec, w=weights: vec * w)

Next we get the euclidean distances.

In [15]:
# Compute the euclidean distance between the two vectors of every pair, once for the
# unweighted vectors and once for each weighting.
score_columns = [
    ('score', 'Full1vec', 'Full2vec'),
    ('scorelr', 'lr1', 'lr2'),
    ('scorelo', 'lo1', 'lo2'),
    ('scorerf', 'rf1', 'rf2'),
    ('scorexg', 'xg1', 'xg2'),
]
for score_col, left_col, right_col in score_columns:
    df[score_col] = df.apply(
        lambda row, a=left_col, b=right_col: np.linalg.norm(row[a] - row[b]),
        axis=1,
    )

In order to find a cutoff for the distances we just heuristically look at the summary statistics for each set of scores and choose a cutoff that reasonably splits the matches from the non-matches.

In [16]:
# Show the distribution of each score separately for non-matches (Match == 1)
# and matches (Match == 0).
summaries = [
    ('Unweighted Non-Matches:\n', 'score', 1),
    ('Unweighted Matches:\n', 'score', 0),
    ('Linear Regression Non-Matches:\n', 'scorelr', 1),
    ('Linear Regression Matches:\n', 'scorelr', 0),
    ('Logit Non-Matches:\n', 'scorelo', 1),
    ('Logit Matches:\n', 'scorelo', 0),
    ('Random Forest Non-Matches:\n', 'scorerf', 1),
    ('Random Forest Matches:\n', 'scorerf', 0),
    ('XG Boost Non-Matches:\n', 'scorexg', 1),
    ('XG Boost Matches:\n', 'scorexg', 0),
]
for heading, score_col, match_code in summaries:
    print(heading, df[score_col].loc[df.Match == match_code].describe())

Unweighted Non-Matches:
 count    100000.000000
mean          1.199137
std           0.268797
min           0.086003
25%           1.015872
50%           1.197606
75%           1.381173
max           2.567467
Name: score, dtype: float64
Unweighted Matches:
 count    100000.000000
mean          1.171367
std           0.427775
min           0.000000
25%           0.895745
50%           1.144085
75%           1.449079
max           3.554618
Name: score, dtype: float64
Linear Regression Non-Matches:
 count    100000.000000
mean          0.764453
std           0.459306
min           0.005177
25%           0.546967
50%           0.665241
75%           0.839765
max          15.659578
Name: scorelr, dtype: float64
Linear Regression Matches:
 count    100000.000000
mean          3.847041
std           2.831711
min           0.000000
25%           1.447394
50%           4.146727
75%           6.221056
max          42.294875
Name: scorelr, dtype: float64
Logit Non-Matches:
 count    100000.000000

Below are our cutoffs. They are a little odd because the math did not quite work out as anticipated, but as you will see below, we ended up with a reasonable accuracy for the linear regression weighting.

In [17]:
# Flag a pair when its distance falls below the hand-picked cutoff.
df['un_pred'] = df['score'] < 1.5

# The weighted scores additionally require a non-zero distance.
weighted_cutoffs = [
    ('lr_pred', 'scorelr', 1.4),
    ('lo_pred', 'scorelo', 7.3),
    ('rf_pred', 'scorerf', 0.03),
    ('xg_pred', 'scorexg', 0.03),
]
for pred_col, score_col, cutoff in weighted_cutoffs:
    distances = df[score_col]
    df[pred_col] = (distances < cutoff) & (distances != 0.0)

In [18]:
# Report the share of rows where each prediction agrees with the Match column.
accuracy_reports = [
    ('Unweighted Accuracy:', 'un_pred'),
    ('Linear Regression Accuracy:', 'lr_pred'),
    ('Logit Accuracy:', 'lo_pred'),
    ('Random Forest Accuracy:', 'rf_pred'),
    ('XG Boost Accuracy:', 'xg_pred'),
]
n_rows = len(df)
for heading, pred_col in accuracy_reports:
    print(heading, (df[pred_col] == df['Match']).sum()/n_rows)

Unweighted Accuracy: 0.544545
Linear Regression Accuracy: 0.874245
Logit Accuracy: 0.725355
Random Forest Accuracy: 0.563765
XG Boost Accuracy: 0.642005


Great! There is still quite a bit of work to do before this would be useful for actually matching names, but we have made strides toward another name-matching method that might have some value.