Source: https://www.kaggle.com/datagov/usa-names/data
<br>Example Code from: https://www.kaggle.com/diamazov/unique-generation-names

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly as plo
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
names = pd.read_csv('../data/usnames/names_per_year.csv')
del names['Unnamed: 0']
names.gender = names['gender'].apply(lambda val: 0 if val == 'F' else 1)

In [3]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613845 entries, 0 to 613844
Data columns (total 4 columns):
year          613845 non-null int64
gender        613845 non-null int64
name          613845 non-null object
year_total    613845 non-null int64
dtypes: int64(3), object(1)
memory usage: 18.7+ MB


In [4]:
def vowelcheck(value):
    string=value.lower()
    vowels=0
    for i in string:
          if(i=='a' or i=='e' or i=='i' or i=='o' or i=='u'):
                vowels=vowels+1
    return vowels/len(string)

def consonantcluster(value):
    string=value.lower()
    end = len(string) - 1
    cc = 0
    for i, val in enumerate(string):
        if (i != end) and not (val=='a' or val=='e' or val=='i' or val=='o' or val=='u'):
                if string[i] == string[i+1]:
                    cc = cc + 1
                    
    return cc

def ycount(value):
    string=value.lower()
    y=0
    for i in string:
          if(i=='y'):
                y=y+1
    return y

In [5]:
names['len'] = names['name'].apply(lambda x: len(x))
names['vowelp'] = names['name'].apply(vowelcheck)
names['cc'] = names['name'].apply(consonantcluster)
names['yc'] = names['name'].apply(ycount)

In [6]:
names.head()

Unnamed: 0,year,gender,name,year_total,len,vowelp,cc,yc
0,1921,0,Ah,5,2,0.5,0,0
1,1915,0,Ah,5,2,0.5,0,0
2,2000,0,Ai,5,2,1.0,0,0
3,2002,0,Ai,7,2,1.0,0,0
4,1995,0,Ai,5,2,1.0,0,0


In [7]:
test = names.sample(frac=0.001)
testnames = test.name
testgender = test.gender
del test['year_total']
del test['name']
del test['gender']

In [23]:
test.head()

Unnamed: 0,year,len,vowelp,cc,yc
25045,2006,4,0.5,0,1
214438,1912,7,0.571429,0,0
219654,1944,7,0.285714,0,1
321398,2017,8,0.375,1,0
489816,2005,6,0.333333,1,0


In [8]:
namcheck = names.copy()
del namcheck['year_total']

In [9]:
features = namcheck.groupby(['year','gender']).mean().add_suffix('_avg').reset_index()
features.head()

Unnamed: 0,year,gender,len_avg,vowelp_avg,cc_avg,yc_avg
0,1910,0,5.930748,0.473171,0.232687,0.10434
1,1910,1,5.776012,0.38874,0.143064,0.141618
2,1911,0,5.964353,0.471266,0.234522,0.104128
3,1911,1,5.790451,0.390218,0.156499,0.136605
4,1912,0,5.994449,0.472529,0.234734,0.106265


In [10]:
# Sample data to play with.
features = features.sample(frac=1.0)

y = features['gender'].values.reshape(-1, 1)
X = features[['year','len_avg','vowelp_avg','cc_avg','yc_avg']]

print(np.shape(y))
print(np.shape(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)  

(216, 1)
(216, 5)


## Linear Regression (Ordinary Least Squares)

In [11]:
# Instantiate our model.
regr = linear_model.LinearRegression()

# Fit our model to our data.
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Display the attributes we calculated.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

Coefficients: 
 [[-1.16983636e-03 -5.96708474e-01 -9.40980267e+00 -4.23620396e+00
  -9.61035944e-01]]
Intercept: 
 [11.31124947]


In [13]:
regr.score(X_test,y_test)

0.9852108136632682

In [14]:
# For some reason predicted values are ranging from -1 to 5
l = regr.predict(test).ravel()
newl = []
for val in l:
    if (val >=1) or (val <= -1):
        newl.append(1)
    else:
        newl.append(0)
    

In [15]:
final = test.copy()
final['gender'] = testgender
final['names'] = testnames
final['guessval'] = newl

In [24]:
final

Unnamed: 0,year,len,vowelp,cc,yc,gender,names,guessval
25045,2006,4,0.500000,0,1,0,Laya,0
214438,1912,7,0.571429,0,0,0,America,0
219654,1944,7,0.285714,0,1,0,Beverly,0
321398,2017,8,0.375000,1,0,0,Sapphire,0
489816,2005,6,0.333333,1,0,1,Jerrod,0
356321,2012,3,0.333333,0,0,1,Tim,0
77082,1942,5,0.600000,0,0,0,Katie,1
599512,1946,8,0.375000,0,0,1,Valdemar,1
97639,2006,5,0.600000,0,0,0,Paula,0
132758,2009,6,0.500000,0,1,0,Caylie,1


In [16]:
print('{0:f}% of names were correctly identified by gender'.format(100*len(final.loc[final['gender'] == final['guessval']]) / len(final)))

52.605863% of names were correctly identified by gender


In [17]:
final.head()

Unnamed: 0,year,len,vowelp,cc,yc,gender,names,guessval
25045,2006,4,0.5,0,1,0,Laya,0
214438,1912,7,0.571429,0,0,0,America,0
219654,1944,7,0.285714,0,1,0,Beverly,1
321398,2017,8,0.375,1,0,0,Sapphire,1
489816,2005,6,0.333333,1,0,1,Jerrod,1


## K Nearest Neighbors

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
neighbors = KNeighborsClassifier(n_neighbors=5)
neighbors.fit(X_train, y_train)

print(neighbors.predict(test))
print(neighbors.predict_proba(test))

[0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1
 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 1 0 0
 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 1 0
 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 0
 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1
 1 0 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 1
 0 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1
 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1 1 0
 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0
 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1
 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1
 0 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0
 1 1 1 0 0 1 0 1 0 1 1 0 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [20]:
newl = neighbors.predict(test).ravel()
final = test.copy()
final['gender'] = testgender
final['names'] = testnames
final['guessval'] = newl

In [21]:
print('{0:f}% of names were correctly identified by gender'.format(100*len(final.loc[final['gender'] == final['guessval']]) / len(final)))

54.071661% of names were correctly identified by gender


In [22]:
final.head()

Unnamed: 0,year,len,vowelp,cc,yc,gender,names,guessval
25045,2006,4,0.5,0,1,0,Laya,0
214438,1912,7,0.571429,0,0,0,America,0
219654,1944,7,0.285714,0,1,0,Beverly,0
321398,2017,8,0.375,1,0,0,Sapphire,0
489816,2005,6,0.333333,1,0,1,Jerrod,0
