# **Gender Classification Of Names**

# Using Machine Learning To Detect/Predict Gender of Individuals From their Names
* Sklearn
* Pandas
* Text Extraction

In [7]:
# EDA packages
import pandas as pd
import numpy as np

In [8]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Load our data
df = pd.read_csv('names_dataset.csv')

In [None]:
df.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [None]:
df.size

285075

In [None]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [None]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [None]:
# Checking for Missing Values
df.isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [None]:
# Number of Female Names
df[df.sex == 'F'].size

181800

In [None]:
# Number of Male Names
df[df.sex == 'M'].size

103275

In [None]:
df_names = df

In [None]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [None]:
df_names.sex.unique()

array([0, 1])

In [None]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [None]:
Xfeatures =df_names['name']
Xfeatures

0             Mary
1             Anna
2             Emma
3        Elizabeth
4           Minnie
           ...    
95020     Zecharya
95021       Ziheng
95022         Ziyu
95023        Zykir
95024         Zyus
Name: name, Length: 95025, dtype: object

In [None]:
# Feature Extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [None]:
# Get feature names
feature_names = cv.get_feature_names_out()

# Print all feature names
for name in feature_names:
    print(name)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
vienne
vieno
viera
vieri
vierra
viesha
viet
vietta
vieva
viggo
viginia
vignesh
vigo
viha
vihaa
vihaan
vihaanreddy
vihaas
vihan
vihana
vihas
vija
vijay
vijaya
vika
vikas
vikash
vikesh
vikhyath
viki
vikie
vikita
vikki
vikkie
vikky
vikram
vikramjit
vikrant
vikranth
viktor
viktoria
viktorija
viktoriya
viktorya
viky
vila
vilas
vilate
vilda
vildan
vilena
vilene
viletta
vilho
vili
vilia
viliami
viliamu
vilija
vilinda
villa
villard
ville
villie
vilma
vilmarie
vimal
vimala
vin
vina
vinal
vinathi
vinay
vinaya
vinayak
vincci
vince
vincel
vincen
vincene
vincent
vincenta
vincente
vincentia
vincentina
vincentine
vincentmichael
vincenza
vincenzia
vincenzina
vincenzio
vincenzo
vincetta
vinchenzo
vinci
vincie
vincient
vincil
vincint
vincy
vinda
vindetta
vindhya
vine
vinecia
vineel
vineet
vineeta
vineeth
vineisha
vinell
vinesh
vinesha
vinessa
vineta
vinetta
vinette
viney
vinh
vinia
vinicio
vinicius
vinie
vinisha
vinit
vinita
vinn
vinna
vin

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features
X
# Labels
y = df_names.sex

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.6398163206734908

In [None]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 63.98163206734908 %


In [None]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 100.0 %


# **Sample Prediction**

In [None]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [None]:
vect

array([[0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Female is 0, Male is 1
clf.predict(vect)

array([0])

In [None]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [None]:
clf.predict(vect1)

array([1])

In [None]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [None]:
clf.predict(vect2)

array([0])

In [None]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [None]:
clf.predict(vect3)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [None]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [None]:
genderpredictor("Martha")

Female




* Features fxn
* apply the fxn
* vectorizer
* fit
* transform
* classifier
* fit
* predict

In [None]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


# Using a custom function for feature analysis

In [None]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [None]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [None]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [None]:
df_y = df_names['sex']

In [None]:
from sklearn.feature_extraction import DictVectorizer

corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [None]:
dv.get_feature_names_out()

array(['first-letter=j', 'first-letter=m', 'first2-letters=ju',
       'first2-letters=mi', 'first3-letters=jul', 'first3-letters=mik',
       'last-letter=a', 'last-letter=e', 'last2-letters=ia',
       'last2-letters=ke', 'last3-letters=ike', 'last3-letters=lia'],
      dtype=object)

In [None]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [None]:
dfX_train

array([{'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'ele', 'last-letter': 'a', 'last2-letters': 'ia', 'last3-letters': 'nia'},
       {'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'adi', 'last-letter': 'l', 'last2-letters': 'il', 'last3-letters': 'dil'},
       {'first-letter': 'k', 'first2-letters': 'ka', 'first3-letters': 'kad', 'last-letter': 'e', 'last2-letters': 'ze', 'last3-letters': 'nze'},
       ...,
       {'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jaz', 'last-letter': 'y', 'last2-letters': 'ly', 'last3-letters': 'zly'},
       {'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'elv', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ina'},
       {'first-letter': 'l', 'first2-letters': 'le', 'first3-letters': 'led', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ger'}],
      dtype=object)

In [None]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<63666x8194 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [None]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier

dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

print("Criterion:", dclf.criterion)
print("Max depth:", dclf.max_depth)
print("Min samples leaf:", dclf.min_samples_leaf)
print("Min samples split:", dclf.min_samples_split)
print("Splitter:", dclf.splitter)


Criterion: gini
Max depth: None
Min samples leaf: 1
Min samples split: 2
Splitter: best


In [None]:
# Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))

In [None]:
vect3 = transform_dv.toarray()

In [None]:
# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

array([1])

In [None]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [None]:
# Second Prediction With Nigerian Name
name_eg1 = ["Chioma"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [None]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [None]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]

In [None]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Male
None
Female
None
Male
None


In [None]:
## Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train))

0.9888951716771903


In [None]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8660033802098281


# Saving Our Model

In [None]:
import joblib

In [None]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [None]:
joblib.dump(dclf,decisiontreModel)

In [None]:
decisiontreModel.close

<function BufferedWriter.close>

In [None]:
#Alternative to Model Saving
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [None]:
pickle.dump(dclf,dctreeModel)

In [None]:
dctreeModel.close()

### Save Multinomial NB Model

In [None]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [None]:
joblib.dump(clf,NaiveBayesModel)

In [None]:
NaiveBayesModel.close()