# Gender Prediction By Name

In [116]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import DictVectorizer

In [117]:
dataset = pd.read_csv("name.csv")
dataset

Unnamed: 0,name,gender
0,alfiya,f
1,ardwin,m
2,henryka,f
3,preeti,f
4,jamaro,m
...,...,...
125226,loreatha,f
125227,deepa,f
125228,kinshasa,f
125229,charlianne,f


In [118]:
# dataset.size

In [119]:
dataset.columns

Index(['name', 'gender'], dtype='object')

In [120]:
dataset.dtypes

name      object
gender    object
dtype: object

In [121]:
#  Cheking the missing value in the datase.
dataset.isnull().sum()

name      56
gender     0
dtype: int64

In [122]:
# dataset["name"].fillna(dataset["name"].mode(), inplace=True)
# dataset
dataset.fillna({"name": dataset["name"].mode()[0]}, inplace = True)

In [123]:
dataset.isnull().sum()

name      0
gender    0
dtype: int64

In [124]:
#  Number of female Names.
dataset.groupby("gender").count()

#  we have the 75k+ female and rest all are males.

Unnamed: 0_level_0,name
gender,Unnamed: 1_level_1
f,75971
m,49260


In [125]:
#  Copy the dataset into the another variable..
copy_df = dataset
copy_df["gender"].replace({"f" : 0, "m": 1}, inplace=True)
copy_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  copy_df["gender"].replace({"f" : 0, "m": 1}, inplace=True)
  copy_df["gender"].replace({"f" : 0, "m": 1}, inplace=True)


Unnamed: 0,name,gender
0,alfiya,0
1,ardwin,1
2,henryka,0
3,preeti,0
4,jamaro,1
...,...,...
125226,loreatha,0
125227,deepa,0
125228,kinshasa,0
125229,charlianne,0


In [126]:
copy_df.head()

Unnamed: 0,name,gender
0,alfiya,0
1,ardwin,1
2,henryka,0
3,preeti,0
4,jamaro,1


In [127]:
#  check the unique value insode the dataset.
copy_df["gender"].value_counts()

gender
0    75971
1    49260
Name: count, dtype: int64

In [128]:
copy_df.dtypes

name      object
gender     int64
dtype: object

In [129]:
#  Divide into the dependent and the independent features.
x_feature = copy_df["name"]

In [130]:
#  Count Vectorizer..
cv =  CountVectorizer()
x = cv.fit_transform(x_feature.values.astype('U'))

In [133]:
x.shape

(125231, 101784)

In [134]:
#  Save the vector database onto that file.
gender_vectorizer = open("gender_vectorizer.pkl", "wb")
joblib.dump(cv, gender_vectorizer)

In [135]:
gender_vectorizer.close()

In [217]:
#  Find the array of names 
print(cv.get_feature_names_out())

['aaban' 'aabha' 'aabid' ... 'सर' 'सलम' 'हन']


In [137]:
y = copy_df.gender
y

0         0
1         1
2         0
3         0
4         1
         ..
125226    0
125227    0
125228    0
125229    0
125230    1
Name: gender, Length: 125231, dtype: int64

In [138]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [149]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)
model.score(x_test, y_test) * 100

71.04643270651175

In [150]:
print("Accuracyof model for the Testing -> {} %".format(model.score(x_test, y_test)))

Accuracyof model for the Testing -> 0.7104643270651175 %


In [151]:
print("Accuracyof model for the Training -> {} %".format(model.score(x_train, y_train)))

Accuracyof model for the Training -> 0.9913758684021401 %


In [152]:
#  Prediction 

In [174]:
sample_name = ["Suchitra"]
# vect = np.asarray(sample_name)
print(type(vect))

vect = cv.transform(sample_name).toarray()
model.predict(vect)

<class 'numpy.ndarray'>


array([0], dtype=int64)

In [188]:
sample_name = ["koko", "Tushar", "Lalit", "Animal", "Book", "Dhoni"]
# vect = np.asarray(sample_name)
# print(type(vect))

vect = cv.transform(sample_name).toarray()
# print(type(vect))
model.predict(vect)

array([0, 1, 1, 0, 0, 0], dtype=int64)

In [214]:
def gender_predictor(name):
    test_name = [name]
    vector = cv.transform(test_name).toarray()
    if model.predict(vector) == 0:
        return "Female"
    else:
        return "Male"

In [216]:
#  Call the gender predictor function..
# gender_predictor("Dhoni")
names = ["Tushar", "Arun", "Komal", "Lalit"]
for i in names:
    print(i, "->", gender_predictor(i))

Tushar -> Male
Arun -> Male
Komal -> Female
Lalit -> Male


In [222]:
# Custom Feature Analysis.
#  By Analogy Most Female names ends in A or I has the sound of A

def features(name):
    name = str(name)
    name = name.lower()
    return {
        'first-letter': name[0],
        'first2-letter': name[0:2],
        'first3-letter': name[0:3],
        'last-letter': name[-1],
        'last2-letter': name[-2:],
        'last3-letter': name[-3:],
    }

In [223]:
features = np.vectorize(features)
# print(features)
features(["Tushar", "Arun", "Komal", "Lalit"])

array([{'first-letter': 't', 'first2-letter': 'tu', 'first3-letter': 'tus', 'last-letter': 'r', 'last2-letter': 'ar', 'last3-letter': 'har'},
       {'first-letter': 'a', 'first2-letter': 'ar', 'first3-letter': 'aru', 'last-letter': 'n', 'last2-letter': 'un', 'last3-letter': 'run'},
       {'first-letter': 'k', 'first2-letter': 'ko', 'first3-letter': 'kom', 'last-letter': 'l', 'last2-letter': 'al', 'last3-letter': 'mal'},
       {'first-letter': 'l', 'first2-letter': 'la', 'first3-letter': 'lal', 'last-letter': 't', 'last2-letter': 'it', 'last3-letter': 'lit'}],
      dtype=object)

In [224]:
df_X = features(copy_df['name'])

In [225]:
df_Y = copy_df['gender']

In [226]:
corpus = features(["Aarav", "Chandni"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)

In [228]:
print(transformed)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12 stored elements and shape (2, 12)>
  Coords	Values
  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 11)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0


In [229]:
dv.get_feature_names_out()

array(['first-letter=a', 'first-letter=c', 'first2-letter=aa',
       'first2-letter=ch', 'first3-letter=aar', 'first3-letter=cha',
       'last-letter=i', 'last-letter=v', 'last2-letter=av',
       'last2-letter=ni', 'last3-letter=dni', 'last3-letter=rav'],
      dtype=object)

In [230]:
dfx_train, dfx_test, dfy_train, dfy_test = train_test_split(df_X, df_Y, test_size=0.2, random_state=42)

In [231]:
dfx_train

array([{'first-letter': 't', 'first2-letter': 'te', 'first3-letter': 'tem', 'last-letter': 'a', 'last2-letter': 'ca', 'last3-letter': 'eca'},
       {'first-letter': 'm', 'first2-letter': 'ma', 'first3-letter': 'mar', 'last-letter': 'd', 'last2-letter': 'id', 'last3-letter': 'rid'},
       {'first-letter': 'a', 'first2-letter': 'av', 'first3-letter': 'ava', 'last-letter': 'e', 'last2-letter': 'se', 'last3-letter': 'ose'},
       ...,
       {'first-letter': 'n', 'first2-letter': 'ny', 'first3-letter': 'nya', 'last-letter': 'l', 'last2-letter': 'al', 'last3-letter': 'yal'},
       {'first-letter': 't', 'first2-letter': 'ti', 'first3-letter': 'tin', 'last-letter': 'a', 'last2-letter': 'ya', 'last3-letter': 'iya'},
       {'first-letter': 'a', 'first2-letter': 'ab', 'first3-letter': 'abb', 'last-letter': 'l', 'last2-letter': 'el', 'last3-letter': 'ael'}],
      dtype=object)

In [232]:
dv = DictVectorizer()
dv.fit_transform(dfx_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 601104 stored elements and shape (100184, 9284)>

In [234]:
from sklearn.tree import DecisionTreeClassifier

In [235]:
dclf = DecisionTreeClassifier()
dv.transform(dfx_train)
x_features = dv.transform(dfx_train)
dclf.fit(x_features, dfy_train)

In [248]:
sample_name_eg = ["Priyanshu"]
transform_dv = dv.transform(features(sample_name_eg)).toarray()
dclf.predict(transform_dv)

array([0], dtype=int64)

In [259]:
sample_name_eg2 = ["Pooja"]
transform_dv2 = dv.transform(features(sample_name_eg2)).toarray()
vect_1 = dclf.predict(transform_dv2)
if vect_1 == 0:
    print("Female")
else:
    print("Male")

Female


In [261]:
decision_tree = open("decisiontree.pkl", "wb")

In [262]:
joblib.dump(dclf, decision_tree)

In [263]:
import pickle
decision_tree_01 = open("name_detector_model", "wb")
pickle.dump(dclf, decision_tree_01)
decision_tree_01.close()

In [265]:
naive_byes = open("naivebayes.pkl", "wb")
joblib.dump(model, naive_byes)
naive_byes.close()