## Importing required libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import seaborn as sns

In [3]:
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline

## Loading Data

In [5]:
# Load the male-names dataset; the relative path is resolved against the
# notebook's working directory.
male_names = pd.read_csv("Indian-Male-Names.csv")

In [6]:
# Load the female-names dataset; the relative path is resolved against the
# notebook's working directory.
female_names = pd.read_csv("Indian-Female-Names.csv")

## Preprocessing Data

In [7]:
def clean_names(x):
    """Reduce a raw name value to a single lower-cased token.

    The value is split on single spaces. When the split yields more than two
    tokens, the second token is returned. Otherwise the first token is
    returned, unless it is shorter than three characters.

    Returns ``np.nan`` for non-string input and for a too-short first token.
    """
    if not isinstance(x, str):
        return np.nan

    tokens = x.split(" ")
    if len(tokens) > 2:
        return tokens[1].lower()

    head = tokens[0]
    return head.lower() if len(head) >= 3 else np.nan

In [8]:
def remove_spaces(x):
    """Strip leading and trailing whitespace from a name.

    Returns the stripped string, or ``np.nan`` when ``x`` is not a string or
    is empty once stripped.
    """
    if not isinstance(x, str):
        # The original evaluated ``np.nan`` here without returning it, so
        # non-string input (e.g. NaN from an earlier cleaning step) came back
        # as None instead of the missing-value marker used everywhere else.
        return np.nan

    stripped = x.strip()
    return stripped if stripped else np.nan

In [9]:
def remove_non_english(x):
    """Reject names containing any non-ASCII character.

    Returns ``x`` unchanged when every character has a code point below 128,
    otherwise ``np.nan``. Non-string input also yields ``np.nan``.
    """
    if not isinstance(x, str):
        # The original evaluated ``np.nan`` here without returning it, so
        # non-string input came back as None rather than NaN.
        return np.nan

    if all(ord(char) < 128 for char in x):
        return x
    return np.nan

In [10]:
def remove_special_charnames(x):
    """Keep only names composed entirely of the letters A-Z / a-z.

    Returns ``x`` unchanged when every character is an ASCII letter (an empty
    string passes trivially); returns ``np.nan`` for any other string and for
    non-string input.
    """
    if not isinstance(x, str):
        return np.nan

    letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')
    only_letters = all(ch in letters for ch in x)
    return x if only_letters else np.nan

In [11]:
# Run the male names through the four cleaning steps in order; each step
# maps rejected values to a missing value.
male_names["name"] = (
    male_names["name"]
    .apply(clean_names)
    .apply(remove_spaces)
    .apply(remove_non_english)
    .apply(remove_special_charnames)
)

# Drop fully duplicated rows first, then any row that still has a missing value.
male_names = male_names.drop_duplicates().dropna()

In [12]:
# Run the female names through the four cleaning steps in order; each step
# maps rejected values to a missing value.
female_names["name"] = (
    female_names["name"]
    .apply(clean_names)
    .apply(remove_spaces)
    .apply(remove_non_english)
    .apply(remove_special_charnames)
)

# Drop fully duplicated rows first, then any row that still has a missing value.
female_names = female_names.drop_duplicates().dropna()

In [13]:
# Row counts of the cleaned male and female frames.
len(male_names), len(female_names)

(3754, 3063)

In [14]:
# Stack male rows followed by female rows; ignore_index=True gives the
# combined frame a fresh 0..n-1 index.
names = pd.concat([male_names, female_names], ignore_index=True)

In [15]:
# Total number of rows after concatenation.
len(names)

6817

In [16]:
# Preview the first rows of the combined frame.
names.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin,m,indian
2,sharat,m,indian
3,birender,m,indian
4,amit,m,indian


## Adding Features to existing data

In [17]:
# Drop the "race" column; it is not used as a feature below.
# NOTE(review): this cell rebinds `names`, so re-running it without re-running
# the concat cell raises a KeyError because "race" is already gone.
names = names.drop(["race"], axis=1)

In [18]:
def first_char(x):
    """Return the first element of ``x`` (the first letter of a name).

    Returns ``np.nan`` when ``x`` cannot be indexed or is empty, matching the
    missing-value marker used by the cleaning helpers.
    """
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # The original used a bare ``except:`` (which also swallows
        # KeyboardInterrupt/SystemExit), printed the value and implicitly
        # returned None. Only indexing failures are handled now, and they map
        # to an explicit missing value.
        return np.nan

In [19]:
def last_char(x):
    """Return the last element of ``x`` (the last letter of a name).

    Returns ``np.nan`` when ``x`` cannot be indexed or is empty, matching the
    missing-value marker used by the cleaning helpers.
    """
    try:
        return x[-1]
    except (TypeError, IndexError, KeyError):
        # The original used a bare ``except:`` (which also swallows
        # KeyboardInterrupt/SystemExit), printed the value and implicitly
        # returned None. Only indexing failures are handled now, and they map
        # to an explicit missing value.
        return np.nan

In [20]:
# Add a column holding the first character of each name.
names["first_char"] = names["name"].apply(first_char)

In [21]:
# Add a column holding the last character of each name.
names["last_char"] = names["name"].apply(last_char)

In [22]:
def count_vowels(x):
    """Count the occurrences of the lower-case vowels a, e, i, o, u in ``x``.

    Upper-case vowels are not counted.
    """
    return sum(x.count(letter) for letter in "aeiou")

In [23]:
# Add a column with the number of lower-case vowels in each name.
names["vowel_counts"]=names["name"].apply(count_vowels)

In [24]:
# Sorted distinct first characters that occur in the data.
np.sort(names["first_char"].unique())

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z'], dtype=object)

In [25]:
# Sorted distinct last characters that occur in the data.
np.sort(names["last_char"].unique())

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'], dtype=object)

In [26]:
def name_has_a(x):
    """Return 1 if ``x`` contains the lower-case letter "a", else 0."""
    return int("a" in x)
def name_has_e(x):
    """Return 1 if ``x`` contains the lower-case letter "e", else 0."""
    return int("e" in x)
def name_has_i(x):
    """Return 1 if ``x`` contains the lower-case letter "i", else 0."""
    return int("i" in x)
def name_has_o(x):
    """Return 1 if ``x`` contains the lower-case letter "o", else 0."""
    return int("o" in x)
def name_has_u(x):
    """Return 1 if ``x`` contains the lower-case letter "u", else 0."""
    return int("u" in x)

In [27]:
# One 0/1 indicator column per vowel, marking whether the name contains it.
names["has_a"] = names["name"].apply(name_has_a)
names["has_e"] = names["name"].apply(name_has_e)
names["has_i"] = names["name"].apply(name_has_i)
names["has_o"] = names["name"].apply(name_has_o)
names["has_u"] = names["name"].apply(name_has_u)

In [28]:
def name_length(x):
    """Return the length of ``x`` as reported by ``len``."""
    length = len(x)
    return length

In [29]:
# Add a column with the character count of each name.
names["name_length"] = names["name"].apply(name_length)

In [30]:
# Keep only names that are at least 4 characters long.
names = names[names["name_length"] >= 4]

In [31]:
# Keep one row per distinct name (drop_duplicates keeps the first occurrence
# by default).
# NOTE(review): male rows precede female rows in the concatenated frame, so a
# name that appears in both lists keeps its male label — confirm this is intended.
names = names.drop_duplicates('name')

In [32]:
# Preview the frame with the engineered feature columns.
names.head()

Unnamed: 0,name,gender,first_char,last_char,vowel_counts,has_a,has_e,has_i,has_o,has_u,name_length
0,barjraj,m,b,j,2,1,0,0,0,0,7
1,ramdin,m,r,n,2,1,0,1,0,0,6
2,sharat,m,s,t,2,1,0,0,0,0,6
3,birender,m,b,r,3,0,1,1,0,0,8
4,amit,m,a,t,2,1,0,1,0,0,4


In [33]:
# Feature matrix: drop the raw name, the target column, and the two
# single-character string columns, leaving only the numeric features.
X = names.drop(["name", "gender", "first_char","last_char"], axis=1)

In [34]:
# Preview the feature matrix.
X.head()

Unnamed: 0,vowel_counts,has_a,has_e,has_i,has_o,has_u,name_length
0,2,1,0,0,0,0,7
1,2,1,0,1,0,0,6
2,2,1,0,0,0,0,6
3,3,0,1,1,0,0,8
4,2,1,0,1,0,0,4


In [35]:
# (rows, feature columns) of the feature matrix.
X.shape

(6211, 7)

In [36]:
# Target labels: the gender column.
y = names["gender"]

## Creating Machine Learning model

In [37]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [38]:
# Seed the shuffle so the same rows land in the train and test sets on every
# run; without random_state the split (and the reported accuracies) change
# each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Random Forest

In [39]:
# Baseline random forest with the library's default hyper-parameters.
# NOTE(review): random_state is not set, so the fitted forest differs between runs.
clf = RandomForestClassifier()

In [40]:
# Fit the baseline forest on the training split.
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

### Using accuracy_score as the metric; precision or recall could be used instead, depending on our needs

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
# Accuracy of the baseline forest on the held-out test split.
print(
    "Random Forest accuracy on test set: "
    + str(accuracy_score(y_test, clf.predict(X_test)))
)

Random Forest accuracy on test set: 0.6600128783


In [43]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV



In [44]:
# Fresh, unfitted forest that serves as the base estimator for the grid search.
new_clf = RandomForestClassifier()

## Optimizing hyperparameters using Grid Search

In [45]:
# Candidate numbers of trees to evaluate.
parameters = {"n_estimators": [10,50,100,200,300,400,500]}

In [46]:
# 5-fold cross-validated search over `parameters`; n_jobs=-1 runs the fits in
# parallel on all available cores.
grid = GridSearchCV(estimator=new_clf, cv=5, n_jobs=-1, param_grid=parameters)

In [47]:
# Run the search on the training split only; the test split stays held out.
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 50, 100, 200, 300, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [48]:
# Feature importances of the refit best forest, one value per column of X_train.
grid.best_estimator_.feature_importances_

array([ 0.33469584,  0.03485317,  0.05564254,  0.10222888,  0.03947877,
        0.04243881,  0.39066199])

In [49]:
# Show the best configuration found by the search.
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [50]:
# Accuracy of the grid-searched forest on the held-out test split
# (grid.predict delegates to the refit best estimator).
print(
    "Optimized Random Forest accuracy on test set: "
    + str(accuracy_score(y_test, grid.predict(X_test)))
)

Optimized Random Forest accuracy on test set: 0.660656793303


In [51]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV

### Trying Logistic Regression and Support Vector Machines

In [52]:
# Default support vector classifier, and a logistic regression that selects
# its regularisation strength by internal cross-validation.
svm = SVC()
lr = LogisticRegressionCV(n_jobs=-1)

In [53]:
# Fit both models on the training split.
svm.fit(X_train,y_train)
lr.fit(X_train,y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [54]:
# Accuracy of the default SVM on the held-out test split.
print(
    "SVM accuracy on test set: "
    + str(accuracy_score(y_test, svm.predict(X_test)))
)

SVM accuracy on test set: 0.663876368319


In [55]:
# Accuracy of the logistic regression model on the held-out test split.
# The label previously said "SVM" although the predictions come from `lr`.
print("Logistic Regression accuracy on test set: " + str(accuracy_score(y_test, lr.predict(X_test))))

SVM accuracy on test set: 0.662588538313


## Creating more features
#### One hot encoding first_char and last_char

In [56]:
# Work on a copy so the one-hot columns are not added to `names`.
new_names = names.copy()

In [57]:
# One-hot encode the first and last characters. The prefix already ends in "_"
# and get_dummies inserts its own "_" separator, so the resulting columns carry
# a double underscore (e.g. is_last_char__q).
new_names_first_char = pd.get_dummies(new_names['first_char'], prefix="is_first_char_")
new_names_last_char = pd.get_dummies(new_names['last_char'], prefix="is_last_char_")

In [58]:
# Append the indicator columns side by side (axis=1), aligned on the row index.
new_names = pd.concat([new_names, new_names_first_char, new_names_last_char], axis=1)

In [59]:
# Preview the frame with the one-hot columns attached.
new_names.head()

Unnamed: 0,name,gender,first_char,last_char,vowel_counts,has_a,has_e,has_i,has_o,has_u,...,is_last_char__q,is_last_char__r,is_last_char__s,is_last_char__t,is_last_char__u,is_last_char__v,is_last_char__w,is_last_char__x,is_last_char__y,is_last_char__z
0,barjraj,m,b,j,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ramdin,m,r,n,2,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,sharat,m,s,t,2,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,birender,m,b,r,3,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,amit,m,a,t,2,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [60]:
# Extended feature matrix: the numeric features plus the one-hot columns.
new_x = new_names.drop(["name", "gender", "first_char","last_char"], axis=1)
y = new_names["gender"]
# Seed the shuffle so the same rows land in the train and test sets on every
# run; without random_state the reported accuracies change between executions.
xtrain, xtest, ytrain, ytest = train_test_split(new_x, y, random_state=42)

## SVM and Logistic Regression on new modified data

In [61]:
# Default SVM trained on the extended (one-hot) feature set.
svm2 = SVC()
svm2.fit(xtrain,ytrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [62]:
# Accuracy of the SVM trained on the extended features, on the held-out split.
print(
    "SVM accuracy on test set: "
    + str(accuracy_score(ytest, svm2.predict(xtest)))
)

SVM accuracy on test set: 0.773341918867


In [63]:
# Cross-validated logistic regression trained on the extended feature set.
lr_new = LogisticRegressionCV(n_jobs=-1)
lr_new.fit(xtrain,ytrain)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [64]:
# Accuracy of the logistic regression trained on the extended features,
# on the held-out split.
print(
    "Logistic Regression accuracy on test set: "
    + str(accuracy_score(ytest, lr_new.predict(xtest)))
)

Logistic Regression accuracy on test set: 0.775273663876


In [65]:
# Grid search over the number of trees, this time on the extended feature set.
# NOTE(review): the forest has no random_state, so the selected n_estimators
# and best score can vary between runs.
rf = RandomForestClassifier()
parameters = {"n_estimators": [10,50,100,200,300,400,500]}
new_grid = GridSearchCV(estimator=rf, cv=5, n_jobs=-1, param_grid=parameters)
new_grid.fit(xtrain,ytrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 50, 100, 200, 300, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [66]:
# Show the best forest configuration found on the extended features.
new_grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [67]:
# Cross-validation score of the best forest (measured on training folds,
# not on the held-out test split).
print(
    "Optimized RF cross val score: "
    + str(new_grid.best_score_)
)

Optimized RF cross val score: 0.7855302705023616


In [68]:
# Grid search over the SVM penalty C and the kernel type, with 3-fold CV.
# NOTE(review): `svm` is rebound here, replacing the model fitted earlier on
# the original feature set.
svm = SVC()
svm_param = {"C":[0.1,0.5,1,1.5,2,5],"kernel":["linear","rbf"]}
svm_grid = GridSearchCV(estimator=svm, param_grid=svm_param, cv=3, n_jobs=-1)
svm_grid.fit(xtrain,ytrain)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.1, 0.5, 1, 1.5, 2, 5], 'kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [69]:
# Cross-validation score of the best SVM (measured on training folds,
# not on the held-out test split).
print(
    "Optimized SVM cross val score on new modified data: "
    + str(svm_grid.best_score_)
)

Optimized SVM cross val score on new modified data: 0.802490339201374


In [73]:
# Accuracy of the grid-searched SVM on the held-out test split
# (svm_grid.predict delegates to the refit best estimator).
print(
    "Optimized SVM accuracy on test set: "
    + str(accuracy_score(ytest, svm_grid.predict(xtest)))
)

Optimized SVM accuracy on test set: 0.775273663876


In [70]:
# Show the best SVM configuration found by the search.
svm_grid.best_estimator_

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

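## Result: We achieved a test accuracy of around 78% using SVM (cross-validation score of about 0.80). SVM's cross-validation score beat the tuned Random Forest (about 0.79), although Logistic Regression matched SVM on the held-out test set; in general, SVM performs well on high-dimensional data.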