### 1 - Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Pipeline
from sklearn.pipeline import Pipeline

## Encoding, Decoding, Text Transformation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Train Test Split
from sklearn.model_selection import train_test_split

## Grid Search CV
from sklearn.model_selection import GridSearchCV

## Models
from sklearn.neighbors import KNeighborsClassifier                     # KNN
from sklearn.linear_model import LogisticRegression                    # Logistic Regression
from sklearn.svm import SVC                                            # Support Vector Machine
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB # Navie Bayes

## Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

### 2 - Understanding the Data

In [None]:
## Reading data from text files as CSV
## The column name is supplied explicitly via `names`, so the first line of
## each file is read as data rather than as a header.
df_1 = pd.read_csv('sorted females.txt', names=['Name'])
df_2 = pd.read_csv('sorted males.txt', names=['Name'])

## Creating gender column
## Labels are kept as the strings '0' / '1' because the metric calls in the
## model cells pass pos_label='0' and pos_label='1'.
df_1['Gender'] = '0'                    # 0 for female
df_2['Gender'] = '1'                    # 1 for male

## Combining the two dataframes
## ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
## each half keeps its own 0-based labels and index labels repeat across halves.
df = pd.concat([df_1, df_2], axis=0, ignore_index=True)

print(df.sample(5))
print('\nRows & Columns : ', df.shape)

### 3 - Data-cleaning and preprocessing

In [None]:
## Checking and handling nulls
print('Null :\n' ,df.isnull().sum())

## Cleaning is done as one chained expression producing a new frame:
##   1. drop rows with missing values
##   2. lower-case the names
##   3. drop duplicate rows
## Lower-casing must come BEFORE de-duplication: otherwise names that differ
## only by letter case survive as separate rows and the same name can end up
## in both the training and the testing split.
df = (
    df.dropna()
      .assign(Name=lambda frame: frame['Name'].str.lower())
      .drop_duplicates()
)
print('\nRows and Columns : ',df.shape)

## Feature selection: the name is the only feature, the gender label is the target
y = df.Gender.values
X = df.Name.values

## Train Test Split (40% held out for testing, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40, random_state=42)

### 4 - Model Development and evaluation

##### 4 . 1 - KNN

In [None]:
## Pipeline: token counts (unigrams + bigrams) -> TF-IDF weighting -> KNN
## NOTE(review): CountVectorizer is left on its default word-level analyzer;
## if each name is a single word, every feature is a whole name - confirm
## this is the intended representation.
steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    # n_neighbors=3 is only a starting value; the grid search below sets it
    ('knn', KNeighborsClassifier(n_neighbors=3)),
]
pipeline = Pipeline(steps)

## Grid Search CV: odd k from 3 to 19, 5 folds, selected on accuracy
parameters = {'knn__n_neighbors': list(range(3, 20, 2))}
knn_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

## Fitting, then reporting the selected k
knn_pipeline.fit(X_train, y_train)
print("Best parameters:", knn_pipeline.best_params_)

## Evaluation on the training split
print('KNN')
print('---Training Results---')
ypred_train = knn_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)
print(mat_clf_train, report_clf_train, sep='\n')
# ROC AUC is computed from the second predict_proba column
ypred_trainP = knn_pipeline.predict_proba(X_train)
auc_train_knn = roc_auc_score(y_train, ypred_trainP[:, 1])
print(auc_train_knn)

## Evaluation on the testing split
print('\n---Testing Results---')
ypred_test = knn_pipeline.predict(X_test)
mat_clf_test = confusion_matrix(y_test, ypred_test)
report_clf_test = classification_report(y_test, ypred_test)
print(mat_clf_test, report_clf_test, sep='\n')
# ROC AUC is computed from the second predict_proba column
ypred_testP = knn_pipeline.predict_proba(X_test)
auc_test_knn = roc_auc_score(y_test, ypred_testP[:, 1])
print(auc_test_knn)

## Per-class test scores kept for the comparison section
## (pos_label '0' = Female, '1' = Male)
f1_test_0_knn, f1_test_1_knn = (
    f1_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
precision_test_0_knn, precision_test_1_knn = (
    precision_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
recall_test_0_knn, recall_test_1_knn = (
    recall_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
accuracy_test_knn = accuracy_score(y_test, ypred_test)

##### 4 . 2 - Logistic Regression

In [None]:
## Pipeline: token counts (unigrams + bigrams) -> TF-IDF weighting -> L2 logistic regression
steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    # C=10 is only a starting value; the grid search below sets it
    ('logReg', LogisticRegression(penalty="l2", C=10)),
]
pipeline = Pipeline(steps)

## Grid Search CV over the regularisation strength C, 5 folds
## NOTE(review): this search selects on 'roc_auc' while the KNN and Naive
## Bayes searches select on 'accuracy' - confirm the difference is deliberate.
para = {'logReg__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.00]}
lr_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=para,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)

## Fitting, then reporting the selected C
lr_pipeline.fit(X_train, y_train)
print("\nBest parameters:", lr_pipeline.best_params_)

## Evaluation on the training split
print('Logistic Regression')
print('---Training Results---')
ypred_train = lr_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)
print(mat_clf_train, report_clf_train, sep='\n')
# ROC AUC is computed from the second predict_proba column
ypred_trainP = lr_pipeline.predict_proba(X_train)
auc_train_lr = roc_auc_score(y_train, ypred_trainP[:, 1])
print(auc_train_lr)

## Evaluation on the testing split
print('\n---Testing Results---')
ypred_test = lr_pipeline.predict(X_test)
mat_clf_test = confusion_matrix(y_test, ypred_test)
report_clf_test = classification_report(y_test, ypred_test)
print(mat_clf_test, report_clf_test, sep='\n')
# ROC AUC is computed from the second predict_proba column
ypred_testP = lr_pipeline.predict_proba(X_test)
auc_test_lr = roc_auc_score(y_test, ypred_testP[:, 1])
print(auc_test_lr)

## Per-class test scores kept for the comparison section
## (pos_label '0' = Female, '1' = Male)
f1_test_0_lr, f1_test_1_lr = (
    f1_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
precision_test_0_lr, precision_test_1_lr = (
    precision_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
recall_test_0_lr, recall_test_1_lr = (
    recall_score(y_test, ypred_test, pos_label=label) for label in ('0', '1')
)
accuracy_test_lr = accuracy_score(y_test, ypred_test)

##### 4 . 3 - Support Vector Machine

In [None]:
## Pipeline: token counts (unigrams + bigrams) -> TF-IDF weighting -> linear SVC
## probability=True is required for predict_proba (used for the ROC AUC below).
## scikit-learn documents that random_state controls the data shuffling used
## for those probability estimates, so it is pinned here to keep the reported
## AUC repeatable between runs.
## Alternative kernels to experiment with (swap into the SVC call):
##   kernel='poly', degree=5    |    kernel='rbf', gamma='scale'
steps = [('vect', CountVectorizer(ngram_range=(1,2))),
         ('tfidf', TfidfTransformer()),
         ('svc', SVC(kernel='linear',
                     class_weight='balanced',
                     probability=True,
                     random_state=42))]
svc_pipeline = Pipeline(steps)

## Fitting (no grid search for this model)
svc_pipeline.fit(X_train, y_train)


## Evaluation
## Training
print('Support Vector Machine')
print('---Training Results---')
ypred_train = svc_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)
print(mat_clf_train)
print(report_clf_train)
# ROC AUC is computed from the second predict_proba column
ypred_trainP = svc_pipeline.predict_proba(X_train)
auc_train_svc = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc_train_svc)

## Testing
print('\n---Testing Results---')
ypred_test = svc_pipeline.predict(X_test)
mat_clf_test = confusion_matrix(y_test, ypred_test)
report_clf_test = classification_report(y_test, ypred_test)
print(mat_clf_test)
print(report_clf_test)
# ROC AUC is computed from the second predict_proba column
ypred_testP = svc_pipeline.predict_proba(X_test)
auc_test_svc = roc_auc_score(y_test, ypred_testP[:,1])
print(auc_test_svc)


## Storing f1, precision, recall scores for comparison
## (pos_label '0' = Female, '1' = Male)
# f1
f1_test_0_svc = f1_score(y_test, ypred_test, pos_label='0')
f1_test_1_svc = f1_score(y_test, ypred_test, pos_label='1')
# precision
precision_test_0_svc = precision_score(y_test, ypred_test, pos_label='0')
precision_test_1_svc = precision_score(y_test, ypred_test, pos_label='1')
# recall
recall_test_0_svc = recall_score(y_test, ypred_test, pos_label='0')
recall_test_1_svc = recall_score(y_test, ypred_test, pos_label='1')
# accuracy
accuracy_test_svc = accuracy_score(y_test, ypred_test)

##### 4 . 4 - Naive Bayes

In [None]:
## Pipeline: token counts (unigrams + bigrams) -> TF-IDF weighting -> Multinomial Naive Bayes
## Alternative estimators to experiment with (swap the last step and the
## matching grid key): GaussianNB() as 'gnb', BernoulliNB(alpha=1.0) as 'bnb'.
steps = [('vect', CountVectorizer(ngram_range=(1,2))),
         ('tfidf', TfidfTransformer()),
         # alpha=0.1 is only a starting value; the grid search below sets it
         ('mnb', MultinomialNB(alpha=0.1))]
pipeline = Pipeline(steps)

## Grid Search CV over the smoothing parameter alpha, 5 folds, selected on accuracy
para = {'mnb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
nb_pipeline = GridSearchCV(pipeline, para, cv=5, scoring='accuracy', n_jobs=-1)

## Fitting
nb_pipeline.fit(X_train, y_train)

## Best Parameter
print("\nBest parameters:", nb_pipeline.best_params_)

## Evaluation
## Training
print('Naive Bayes')
print('---Training Results---')
ypred_train = nb_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)
print(mat_clf_train)
print(report_clf_train)
# ROC AUC is computed from the second predict_proba column
ypred_trainP = nb_pipeline.predict_proba(X_train)
auc_train_nb = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc_train_nb)

## Testing
print('\n---Testing Results---')
ypred_test = nb_pipeline.predict(X_test)
mat_clf_test = confusion_matrix(y_test, ypred_test)
report_clf_test = classification_report(y_test, ypred_test)
print(mat_clf_test)
print(report_clf_test)
# ROC AUC is computed from the second predict_proba column
ypred_testP = nb_pipeline.predict_proba(X_test)
auc_test_nb = roc_auc_score(y_test, ypred_testP[:,1])
print(auc_test_nb)


## Storing f1, precision, recall scores for comparison
## (pos_label '0' = Female, '1' = Male)
# f1
f1_test_0_nb = f1_score(y_test, ypred_test, pos_label='0')
f1_test_1_nb = f1_score(y_test, ypred_test, pos_label='1')
# precision
precision_test_0_nb = precision_score(y_test, ypred_test, pos_label='0')
precision_test_1_nb = precision_score(y_test, ypred_test, pos_label='1')
# recall
recall_test_0_nb = recall_score(y_test, ypred_test, pos_label='0')
recall_test_1_nb = recall_score(y_test, ypred_test, pos_label='1')
# accuracy
accuracy_test_nb = accuracy_score(y_test, ypred_test)

### 5 - Model comparison

In [None]:
## ROC AUC scores: one horizontal bar chart per data split.
## The plot data gets its own names so that the target array `y` created in
## the preprocessing cell is not overwritten by a list of model names.
model_names = ['KNN', 'Logistic Regression', 'Support Vector Machine', 'Naive Bayes']
auc_by_split = {
    'Training': [auc_train_knn, auc_train_lr, auc_train_svc, auc_train_nb],
    'Testing': [auc_test_knn, auc_test_lr, auc_test_svc, auc_test_nb],
}

# Same figure layout for both splits; only the scores and the title differ.
for split_name, auc_scores in auc_by_split.items():
    plt.figure(figsize=(5, 2))
    plt.barh(model_names, auc_scores, color='darkblue', height=0.5)
    plt.title(f"ROC Score Comparison ({split_name} Data)")
    plt.show()



In [None]:
## Per-class test metrics (F1, precision, recall): one grouped bar chart each.
## Every entry is (axis/title label, female scores, male scores), with the
## scores ordered KNN, LR, SVM, NB to match the index below.
model_index = ["KNN", "LR", "SVM", "NB"]
metric_panels = [
    ("F1-Score",
     [f1_test_0_knn, f1_test_0_lr, f1_test_0_svc, f1_test_0_nb],
     [f1_test_1_knn, f1_test_1_lr, f1_test_1_svc, f1_test_1_nb]),
    ("Precision",
     [precision_test_0_knn, precision_test_0_lr, precision_test_0_svc, precision_test_0_nb],
     [precision_test_1_knn, precision_test_1_lr, precision_test_1_svc, precision_test_1_nb]),
    ("Recall",
     [recall_test_0_knn, recall_test_0_lr, recall_test_0_svc, recall_test_0_nb],
     [recall_test_1_knn, recall_test_1_lr, recall_test_1_svc, recall_test_1_nb]),
]

for position, (metric_label, female_scores, male_scores) in enumerate(metric_panels):
    # A separator line is printed between charts, not before the first one.
    if position > 0:
        print('\n-----------\n')

    plotdata = pd.DataFrame(
        {"Female": female_scores, "Male": male_scores},
        index=model_index,
    )
    plotdata.plot(kind="bar", figsize=(6, 3), color=['Orange', 'Blue'])
    plt.legend(loc='lower left')
    plt.title(f"{metric_label} Comparison(Testing)")
    plt.xlabel("Testing")
    plt.ylabel(metric_label)
    plt.show()

In [None]:
## Accuracy on the testing split, one bar per model.
## The plot data gets its own names so that the target array `y` created in
## the preprocessing cell is not overwritten by a list of model names.
model_labels = ['KNN', 'LR', 'SVM', 'NB']
accuracy_scores = [accuracy_test_knn, accuracy_test_lr, accuracy_test_svc, accuracy_test_nb]
bar_colors = ['skyblue', 'salmon', 'limegreen', 'orange']

plt.figure(figsize=(5, 3))
plt.bar(model_labels, accuracy_scores, color=bar_colors, width=0.5)
plt.xlabel("Testing")
plt.ylabel("Accuracy Score")
plt.title("Accuracy (Testing Data)")
# Write each score (two decimals) at the top of its bar; va='top' hangs the
# text below the anchor point, i.e. just inside the bar.
for position, score in enumerate(accuracy_scores):
    plt.text(position, score, f"{score:.2f}", ha='center', va='top')
plt.show()

### 6 - Check Your Name Here

In [None]:
## Enter a Name
name = input('Enter a name : ')
name = name.lower()     # the training names were lower-cased, so match that here

## Predict
## Each model is queried through its own fitted pipeline, so the KNN line
## reports the KNN model's answer rather than repeating Naive Bayes.
prediction_knn = knn_pipeline.predict([name])
prediction_lr = lr_pipeline.predict([name])
prediction_svc = svc_pipeline.predict([name])
prediction_nb = nb_pipeline.predict([name])

## Decode the predicted label ('1' / '0') of the single input into a readable gender
gender_labels = {'1': 'Male', '0': 'Female'}
gender_knn = gender_labels[prediction_knn[0]]
gender_lr = gender_labels[prediction_lr[0]]
gender_svc = gender_labels[prediction_svc[0]]
gender_nb = gender_labels[prediction_nb[0]]

print('KNN : ', gender_knn)
print('LR  : ', gender_lr)
print('SVC : ', gender_svc)
print('NB  : ', gender_nb)