# Gender Prediction by Name

In [86]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [87]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [209]:
train = pd.read_excel("/content/drive/Shareddrives/tk1-anamedsos/df_training_labeled_structured.xlsx")

In [210]:
train = train[['name', 'gender']]

In [211]:
additional = pd.read_csv("/content/drive/Shareddrives/tk1-anamedsos/name_gender.csv")

In [212]:
def gender_converter(gender):
  """Map a raw gender code to the label 'pria' or 'wanita'.

  Codes are compared after trimming surrounding whitespace and lower-casing:
  'm' / 'lk' -> 'pria', 'f' / 'p' -> 'wanita'.

  Returns None for missing values (None, NaN or any other non-string) and for
  codes that are not recognised, so that such rows can be removed with
  ``dropna`` instead of being silently labelled 'wanita'.
  """
  # NaN read from a CSV is a float; calling .strip() on it would raise.
  if not isinstance(gender, str):
    return None
  code = gender.strip().lower()
  if code in ('m', 'lk'):
    return 'pria'
  if code in ('f', 'p'):
    return 'wanita'
  # Unknown code: leave the row unlabelled rather than guessing a class.
  return None

In [213]:
additional['gender'].value_counts()

m     1092
f      865
 m       1
LK       1
P        1
Name: gender, dtype: int64

In [214]:
additional['gender'] = additional['gender'].apply(gender_converter)

In [215]:
additional.gender.value_counts()

pria      1094
wanita     866
Name: gender, dtype: int64

In [216]:
df = pd.concat([train, additional], ignore_index=True)

In [217]:
df.dropna(inplace=True)

In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12825 entries, 0 to 12826
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    12825 non-null  object
 1   gender  12825 non-null  object
dtypes: object(2)
memory usage: 300.6+ KB


In [219]:
# df.to_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name.csv")

In [220]:
import re
def clean_name(name):
  """Normalise a display name into lower-case, space-separated tokens.

  Steps, in order: runs of digits become a space, the text is lower-cased,
  every character that is neither a word character nor whitespace becomes a
  space, whitespace runs are collapsed to one space, and the ends are stripped.

  Non-string input (None, NaN, numbers) returns an empty string instead of
  raising TypeError inside ``re.sub``.
  """
  if not isinstance(name, str):
    return ""
  name_clean = re.sub(r"\d+", " ", name)
  name_clean = name_clean.lower()
  name_clean = re.sub(r"[^\w\s]", " ", name_clean)
  name_clean = re.sub(r"\s+", " ", name_clean)
  return name_clean.strip()

In [221]:
names = df['name'].tolist()

In [222]:
names = [clean_name(name) for name in tqdm(names)]

100%|██████████| 12825/12825 [00:00<00:00, 180753.73it/s]


In [223]:
df['name'] = names


In [224]:
# df.to_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name.csv", index=False)

<hr>

In [225]:
df = pd.read_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name.csv")
df

Unnamed: 0,name,gender
0,atika mutiara,wanita
1,lolita agustine,wanita
2,henry manampiring,pria
3,lukman h saifuddin,pria
4,ismail fahmi,pria
...,...,...
12820,hadimah,wanita
12821,aprillia rahma wati,wanita
12822,muhamad khoirunisa m,pria
12823,sukandi,pria


In [226]:
from sklearn.model_selection import train_test_split

In [227]:
df.dropna(inplace=True)

In [229]:
df = df[df['gender'] != '-']

In [230]:
# Hold out 30% of the rows as a validation set; the fixed random_state keeps
# the split identical between runs.
train, val = train_test_split(df, test_size=0.3, random_state=123)

In [231]:
X_train = train['name']
y_train = train['gender']

In [232]:
# Bag-of-words features: unigram counts only, vocabulary capped at 2000 terms.
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=2000)

In [233]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12593 entries, 0 to 12824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    12593 non-null  object
 1   gender  12593 non-null  object
dtypes: object(2)
memory usage: 295.1+ KB


In [234]:
# Fit the vocabulary on the training names only and build the count matrix.
# .toarray() yields a plain ndarray; .todense() would return numpy.matrix,
# which recent scikit-learn estimators refuse as input.
# Reading from train['name'] (not from X_train itself) keeps this cell re-runnable.
X_train = vectorizer.fit_transform(train['name']).toarray()

In [235]:
X_val = val['name']
y_val = val['gender']

In [236]:
# Encode the validation names with the vocabulary learned from the training set.
# .toarray() yields a plain ndarray; .todense() would return numpy.matrix,
# which recent scikit-learn estimators refuse as input.
# Reading from val['name'] (not from X_val itself) keeps this cell re-runnable.
X_val = vectorizer.transform(val['name']).toarray()

In [237]:
print(X_train.shape)
print(X_val.shape)

(8815, 2000)
(3778, 2000)


In [238]:
from sklearn.svm import SVC, LinearSVC

# Train a support vector classifier with an RBF kernel on the name count features.
svc = SVC(kernel="rbf")
svc.fit(X_train, y_train)

SVC()

In [239]:
from sklearn.metrics import classification_report, f1_score

In [240]:
y_pred = svc.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

        pria       0.81      0.57      0.67      1891
      wanita       0.67      0.87      0.75      1887

    accuracy                           0.72      3778
   macro avg       0.74      0.72      0.71      3778
weighted avg       0.74      0.72      0.71      3778



In [241]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Seed the forest so its bootstrap samples and feature subsets, and therefore
# the reported metrics, are reproducible between runs (123 matches the seed
# used for the train/validation split).
rf = RandomForestClassifier(class_weight='balanced', random_state=123)
rf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced')

In [242]:
y_pred_rf = rf.predict(X_val)
print(classification_report(y_val, y_pred_rf))

              precision    recall  f1-score   support

        pria       0.83      0.56      0.67      1891
      wanita       0.67      0.88      0.76      1887

    accuracy                           0.72      3778
   macro avg       0.75      0.72      0.71      3778
weighted avg       0.75      0.72      0.71      3778



Test Data

In [243]:
test = pd.read_excel("/content/drive/Shareddrives/tk1-anamedsos/df_testing_structured.xlsx")

In [244]:
test

Unnamed: 0,id,created_at,description,followers_count,following_count,listed_count,location,name,tweet_count,username,verified
0,1.0,2017-05-12,A place to record the insights I receive on th...,94952.0,427.0,59.0,Inquiry: aisykaspol@gmail.com,‏َ,26004.0,lilithkis,0.0
1,2.0,2012-03-09,Official Twitter of Mario Teguh. \nAdmin@exnal...,9181064.0,0.0,3007.0,"Jakarta, Indonesia",Mario Teguh,55031.0,marioteguh,1.0
2,3.0,2019-11-15,Apprentice Python programmer :)\n\nhttps://t.c...,25.0,217.0,1.0,,Florentin Anggraini Purnama,51.0,flo_and_behold,0.0
3,4.0,2013-04-10,A proud Indonesian. Bangga Berbangsa.,1049638.0,309.0,402.0,Indonesia,Gita Wirjawan,17653.0,gwirjawan,1.0
4,5.0,2012-01-11,Chairman MNC Group | Ketum Partai Perindo | Ke...,551980.0,374.0,375.0,Jakarta Capital Region,Hary Tanoesoedibjo,9185.0,hary_tanoe,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1049,1050.0,2013-11-16,"S.W.I.M.M.E.R || KA bandung,indonesia",75.0,261.0,0.0,,farrel tangkas,18.0,farreltangkas,0.0
1050,1051.0,2011-07-25,IG: @fithrisyamsu,2706.0,148.0,4.0,,Fithri Syamsu,6085.0,fithrisyamsu,0.0
1051,1052.0,2021-06-28,Jonathan Xavier Hartono - Class of 2022 - Golf...,1.0,17.0,0.0,"Jakarta Capital Region, Indone",Jonathan Xavier Hartono,1.0,jojoxh_,0.0
1052,1053.0,2012-05-06,seethegooder.,1072.0,710.0,1.0,Indonesia,Dewi Putri Sungging,9892.0,dpsungging,0.0


In [245]:
# Keep only the name column as an independent frame, so the later
# `test['name'] = ...` assignment does not write into a slice of the original.
# Missing names are replaced with a blank here, before cleaning, because
# clean_name passes its argument straight to re.sub, which needs a string.
test = test[['name']].fillna(' ').copy()

In [246]:
test

Unnamed: 0,name
0,‏َ
1,Mario Teguh
2,Florentin Anggraini Purnama
3,Gita Wirjawan
4,Hary Tanoesoedibjo
...,...
1049,farrel tangkas
1050,Fithri Syamsu
1051,Jonathan Xavier Hartono
1052,Dewi Putri Sungging


In [247]:
names = test['name'].tolist()

In [248]:
names = [clean_name(name) for name in names]

In [249]:
test['name'] = names

In [250]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    1054 non-null   object
dtypes: object(1)
memory usage: 8.4+ KB


In [251]:
test.fillna(' ', inplace=True)

In [252]:
# Encode the test names with the vocabulary learned from the training set.
# .toarray() yields a plain ndarray; .todense() would return numpy.matrix,
# which recent scikit-learn estimators refuse as input.
X_test = vectorizer.transform(test['name']).toarray()

In [253]:
y_test = svc.predict(X_test)

In [254]:
y_test

array(['wanita', 'pria', 'pria', ..., 'pria', 'wanita', 'wanita'],
      dtype=object)

In [255]:
df_test = pd.DataFrame(y_test)

In [256]:
df_test.index = range(1,len(df_test)+1)

In [257]:
df_test

Unnamed: 0,0
1,wanita
2,pria
3,pria
4,pria
5,wanita
...,...
1050,wanita
1051,wanita
1052,pria
1053,wanita


In [258]:
df_test.to_csv("resultGenderByName.csv", header=False)

In [261]:
additional['gender'].value_counts()

pria      1094
wanita     866
Name: gender, dtype: int64