In [2]:
#Necessary imports
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt

In [3]:
# Load the raw training data: one CSV per gender, read from the current
# working directory. The previews below show the columns name, gender, race.
male_df = pd.read_csv('training-male.csv')
female_df = pd.read_csv('training-female.csv')

In [4]:
# Preview the first rows of the male training data.
male_df.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [5]:
# Preview the first rows of the female training data.
female_df.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [6]:
# Stack the male and female frames into a single dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. Row labels are deliberately kept
# (no ignore_index), so they repeat across the two halves exactly as they
# did with append.
main_df = pd.concat([male_df, female_df])

# Drop the race column.
main_df = main_df.drop('race', axis = 1)

main_df.tail(10)

Unnamed: 0,name,gender
15372,anjum,f
15373,miss reena,f
15374,pooja,f
15375,rakhi,f
15376,musarrat,f
15377,saroj devi,f
15378,naina @ geeta,f
15379,manju d/0 baboo lal jatav,f
15380,shivani,f
15381,nayna,f


In [7]:
# Size of the combined dataset as (rows, columns).
main_df.shape

(30227, 2)

In [8]:
# Summary statistics (count / unique / top / freq) for each column.
main_df.describe()

Unnamed: 0,name,gender
count,30172,30227
unique,15034,2
top,pooja,f
freq,353,15382


In [9]:
# The name count in the summary above is lower than the gender count, i.e.
# some rows have no name. Drop every row that contains a missing value and
# re-check the summary.
main_df = main_df.dropna()
main_df.describe()

Unnamed: 0,name,gender
count,30172,30172
unique,15034,2
top,pooja,f
freq,353,15351


In [10]:
# Bar chart of the ten most frequent names.
# plt.subplots returns a (figure, axes) pair: unpack it and hand the axes to
# pandas explicitly instead of binding the whole tuple to `ax` and relying on
# the implicit "current axes".
fig, ax = plt.subplots(figsize = (20, 6))
main_df['name'].value_counts().head(10).plot(kind = 'bar', width = .5, ax = ax)
ax.set_ylabel('Frequency')
ax.set_xlabel('Names')
# Write each bar's count just above the bar.
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x()+0.1, p.get_height()+1.0))
plt.show()

In [11]:
# Keep a single row per distinct name. drop_duplicates keeps the first
# occurrence by default, and the male rows were placed first when the two
# frames were joined, so a name present in both files keeps its male row.
main_df = main_df.drop_duplicates('name')
main_df.describe()

Unnamed: 0,name,gender
count,15034,15034
unique,15034,2
top,aanamika misra,m
freq,1,8519


In [11]:
# Preprocessing data 
# removing special characters and numbers from the data
import re 
import string 

def preprocess_name(x):
    """Normalise a raw name string.

    Lower-cases the text, then removes non-ASCII characters, ASCII
    punctuation and digits, and finally strips surrounding whitespace.
    Whitespace inside the name is left untouched, so deleting a symbol that
    had a space on each side leaves two consecutive spaces.

    Parameters
    ----------
    x : str
        Raw name.

    Returns
    -------
    str
        The cleaned name; empty if nothing survives the cleaning.
    """
    x = x.lower()
    x = re.sub(r'[^\x00-\x7f]+', r'', x)
    # string.punctuation contains `\`, `]`, `^` and `-`, all of which are
    # special inside a character class. Left unescaped, the `\]` pair is read
    # as an escaped bracket and the backslash itself is never removed, so the
    # set is escaped before being wrapped in [...].
    x = re.sub("[" + re.escape(string.punctuation) + "]", "", x)
    x = re.sub(r'[0-9]+', r'', x)
    x = x.strip()
    return x

In [12]:
# Clean every name. `assign` returns a new frame instead of writing a column
# into main_df in place; main_df is derived from an earlier frame (via
# drop_duplicates), and in-place column writes on such a frame are what
# trigger pandas' SettingWithCopyWarning.
main_df = main_df.assign(name = main_df['name'].apply(preprocess_name))

In [14]:
# Keep only names longer than two characters, i.e. drop names whose length is
# 0, 1 or 2 after cleaning.
main_df = main_df[main_df.name.str.len() > 2]
main_df.describe()

Unnamed: 0,name,gender
count,14920,14920
unique,14830,2
top,km pooja,m
freq,3,8473


In [None]:
# Class distribution: number of names per gender.
# plt.subplots returns a (figure, axes) pair: unpack it and hand the axes to
# pandas explicitly instead of binding the whole tuple to `ax` and relying on
# the implicit "current axes".
fig, ax = plt.subplots(figsize = (12, 7))
main_df['gender'].value_counts().plot(kind = 'bar', width = .4, ax = ax)
ax.set_ylabel("Frequency")
ax.set_xlabel('Gender')
# Write each bar's count just above the bar.
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() +0.1, p.get_height()+1.0))
plt.show()

In [93]:
# Take the first space-separated token of each name as its prefix / first name.
# `assign` returns a new frame rather than adding a column in place to
# main_df, which is the result of a boolean row filter (the in-place write is
# the pattern behind pandas' SettingWithCopyWarning).
main_df = main_df.assign(firstname = main_df['name'].apply(lambda x: x.strip().split(" ")[0]))
main_df.firstname.value_counts().head(10)

smt       630
mohd      197
kumari    150
ram       109
km         93
md         77
pooja      59
ku         52
sanjay     48
ravi       47
Name: firstname, dtype: int64

In [110]:
# Prefixes (titles and abbreviations) that can appear as the first token of a
# name, as several did in the counts above (e.g. smt, mohd, kumari, km, md, ku).
prefix = ['mr','kumar','kr','ku','kum','kumari','km',
          'miss','mrs','mohd','md',
          'sri','shri','sh','smt','shree','shrimati','su','sushri']

# Show the rows whose first token is one of these prefixes.
df_with_prefix = main_df[main_df.firstname.isin(prefix)]
df_with_prefix.head(10)

Unnamed: 0,name,gender,firstname
34,md afsar,m,md
86,mohd ataullah,m,mohd
91,mohd shakib,m,mohd
97,md mustafa,m,md
105,mohd aakib,m,mohd
142,mohd khairul,m,mohd
194,mohd shahid,m,mohd
228,mohd mukhtaar,m,mohd
248,mohd kausar,m,mohd
311,mohd afzal,m,mohd


In [113]:
# Keep only the rows whose first token is NOT a known prefix. Rows that start
# with a prefix are discarded outright; the token following the prefix is not
# recovered.
main_df_not_prefix = main_df[~main_df.firstname.isin(prefix)]
main_df_not_prefix.describe()

Unnamed: 0,name,gender,firstname
count,13545,13545,13545
unique,13538,2,6605
top,monika,m,ram
freq,2,8087,109


In [114]:
# Keep one row per distinct first name (the first occurrence is the one kept).
main_df_not_prefix = main_df_not_prefix.drop_duplicates('firstname')


# Drop the full-name column, leaving only gender and firstname.
main_df_not_prefix = main_df_not_prefix.drop('name', axis = 1)

main_df_not_prefix.head()

Unnamed: 0,gender,firstname
0,m,barjraj
1,m,ramdin
2,m,sharat
3,m,birender
4,m,amit


In [115]:
# Save the processed (gender, firstname) table to CSV without the row index.
main_df_not_prefix.to_csv("names_processed.csv", index = False)

In [13]:
# Load the processed names table; the first line of the file is the header.
cleanedData = pd.read_csv('names_processed.csv', header =0)

In [14]:
#No. of vowels
def countVowels(string):
    """Return how many characters of `string` are one of a, e, i, o, u.

    The comparison is case-sensitive: upper-case vowels are not counted.
    """
    return sum(1 for letter in string if letter in "aeiou")

In [15]:
# Quick check on a sample name: 'Lakshya' contains two lower-case vowels.
countVowels('Lakshya')

2

In [16]:
#Total Number of 'e', 'i' in the name
def countEI(string):
    """Return how many characters of `string` are 'e' or 'i'.

    The comparison is case-sensitive: 'E' and 'I' are not counted.
    """
    matches = [letter for letter in string if letter in "ei"]
    return len(matches)

In [17]:
# Quick check on a sample name: 'Lakshya' contains no 'e' or 'i'.
countEI('Lakshya')

0

In [18]:
def _nameFeatures(name):
    """Build the feature dictionary for a single first name.

    The name is lower-cased first, so every feature is computed on the
    lower-case form. Features: the name itself, its last character, its last
    two characters, whether the last character is one of a/e/i/y, the vowel
    count, the count of 'e'/'i', the length, and the first character.

    Raises IndexError for an empty string, because the first and last
    characters are read by index.
    """
    lowered = name.lower()
    final_char = lowered[-1]

    features = {}
    features['firstname'] = lowered
    features['lastChar'] = final_char
    features['lastTwoChar'] = lowered[-2:]
    features['isLastAEIY'] = final_char in 'aeiy'
    features['NoOfVowels'] = countVowels(lowered)
    features['NumEI'] = countEI(lowered)
    features['length'] = len(lowered)
    features['firstChar'] = lowered[0]
    return features


In [19]:
# Inspect the feature dictionary produced for a sample name.
_nameFeatures('Lakshya')

{'NoOfVowels': 2,
 'NumEI': 0,
 'firstChar': 'l',
 'firstname': 'lakshya',
 'isLastAEIY': True,
 'lastChar': 'a',
 'lastTwoChar': 'ya',
 'length': 7}

In [20]:
def extractFeatures(dataframe):
    """Turn a names table into labelled feature sets.

    Reads the `firstname` and `gender` columns of `dataframe` row by row and
    returns a list of (feature_dict, gender) tuples, one per row, in row
    order. The feature dictionaries come from `_nameFeatures`.
    """
    return [
        (_nameFeatures(record['firstname']), record['gender'])
        for _, record in dataframe.iterrows()
    ]

In [21]:
from nltk import NaiveBayesClassifier,classify
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
import random

In [22]:
def _trainGenderClassifier(data_path='names_processed.csv'):
    """Train a LinearSVC-backed NLTK classifier on every row of the processed CSV."""
    cleanedData = pd.read_csv(data_path, header =0)
    featureSet = extractFeatures(cleanedData)
    random.shuffle(featureSet)
    return SklearnClassifier(LinearSVC()).train(featureSet)


def GenderPredictor(name, classifier=None):
    """Predict the gender label for `name`.

    Parameters
    ----------
    name : str
        The first name to classify.
    classifier : optional
        An already-trained classifier exposing `classify(features)`, for
        example the return value of `_trainGenderClassifier()`. Passing one
        avoids re-reading the CSV and retraining for every prediction. When
        omitted, a fresh model is trained from 'names_processed.csv' on each
        call, which is the original behaviour.

    Returns
    -------
    The label predicted by the classifier for the features of `name`.
    """
    if classifier is None:
        classifier = _trainGenderClassifier()
    return classifier.classify(_nameFeatures(name))

In [23]:
def TrainAndTestNB(dataframe):
    """Train and evaluate an NLTK Naive Bayes classifier on `dataframe`.

    The labelled feature sets are shuffled, the first 80% are used for
    training and the remaining 20% for testing. Prints the test accuracy and
    the five most informative features. Returns nothing.
    """
    featureSet = extractFeatures(dataframe)
    random.shuffle(featureSet)
    name_count = len(featureSet)
    cut = int(name_count*0.80)
    trainSet = featureSet[:cut]
    testSet = featureSet[cut:]
    clf = NaiveBayesClassifier.train(trainSet)
    print('Testing Accuracy: {} '.format(classify.accuracy(clf,testSet)))
    # show_most_informative_features prints its own "Most Informative
    # Features" header plus the table and returns None. Wrapping it in print()
    # emitted a stray "None", and printing a header beforehand duplicated the
    # heading, so it is called directly.
    clf.show_most_informative_features(5)



In [24]:
def TrainAndTestSVM(dataframe):
    """Train and evaluate a LinearSVC classifier (via NLTK's sklearn wrapper).

    The labelled feature sets are shuffled, the first 80% are used for
    training and the remaining 20% for testing. Prints the test accuracy.
    Returns nothing.
    """
    samples = extractFeatures(dataframe)
    random.shuffle(samples)
    split_at = int(len(samples)*0.80)
    training, held_out = samples[:split_at], samples[split_at:]
    model = SklearnClassifier(LinearSVC()).train(training)
    print('Testing Accuracy: {} '.format(classify.accuracy(model,held_out)))

In [30]:
# Predict the label for one name (a model is trained from names_processed.csv first).
GenderPredictor('Narendra')

'm'

In [31]:
# Evaluate Naive Bayes on a shuffled 80/20 split of the processed names.
TrainAndTestNB(cleanedData)

Testing Accuracy: 0.787282361847 
Most Informative Features
Most Informative Features
                lastChar = 'v'                 m : f      =     22.7 : 1.0
                lastChar = 'd'                 m : f      =     19.3 : 1.0
             lastTwoChar = 'th'                m : f      =     14.7 : 1.0
             lastTwoChar = 'nt'                m : f      =     14.2 : 1.0
             lastTwoChar = 'ir'                m : f      =     14.1 : 1.0
None


In [32]:
# Evaluate the linear SVM on a shuffled 80/20 split of the processed names.
TrainAndTestSVM(cleanedData)

Testing Accuracy: 0.81226343679 
