# Gender Prediction by Name and Description

In [4]:
!pip install --upgrade --no-cache-dir gdown
!pip install PySastrawi
!pip install sentence-transformers
!pip install sklearn-pycrfsuite

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 5.1 MB/s 
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 11.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |█████████████████████████

In [5]:
!pip install nltk
!pip install Sastrawi
!pip install python-crfsuite

# Data Wrangling
import pandas as pd
import numpy as np
import zipfile
import nltk
import csv
import re
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data pre-processing and modeling.
import Sastrawi
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
nltk.download('punkt')

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 5.3 MB/s 
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 5.3 MB/s 
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.8
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Pre-process

In [25]:
# Stopwords 
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1JhzvbijTZ3BlZN_xI_w2UjyXkwjX9r5A' -O stopwords.csv

# Slang dan singkatan
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=19NOzXA8Voturopg_DTuiMns3s4M2IAUz' -O kamus_singkatan.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VjgivEr1pxyRCuyhVifPnaReFz0yd8Us' -O colloquial-indonesian-lexicon.csv

In [26]:
def clean_tweet(tweet):
  clean_tw = re.sub(r"\d+", " ", tweet) # remove digit
  clean_tw = clean_tw.lower()
  clean_tw = re.sub("url", " ", clean_tw)
  clean_tw = re.sub("username", " ", clean_tw)
  clean_tw = re.sub('rt', " ", clean_tw)
  clean_tw = re.sub("sensitive-no", " ", clean_tw)
  clean_tw = re.sub("sep", " ", clean_tw)
  clean_tw = re.sub(r"https?:\/[^\s]*", " ", clean_tw) # remove http tag
  clean_tw = re.sub(r"[^\w\s]", " ", clean_tw)
  clean_tw = re.sub(r"#[^\s]*", " ", clean_tw)
  clean_tw = re.sub(r"@[^\s]*", " ", clean_tw)
  normal_regex = re.compile(r"(.)\1{1,}")
  clean_tw = normal_regex.sub(r"\1\1", clean_tw) # remove consecutive word
  clean_tw = re.sub(r"\s+", " ", clean_tw) # remove excessive space
  clean_tw = clean_tw.strip()
  return clean_tw

In [27]:
df_kamus_singkatan = pd.read_csv('kamus_singkatan.csv')
df_kamus_alay = pd.read_csv('colloquial-indonesian-lexicon.csv')

In [28]:
def remove_stopwords(tweet, stopwords):
  tokens = nltk.word_tokenize(tweet)
  tokens_after = []
  for token in tokens:
    if token in df_kamus_singkatan['singkatan'].values:
      token = df_kamus_singkatan.loc[df_kamus_singkatan['singkatan'] == token, 'asli'].values[0]              
    if token in df_kamus_alay['slang'].values:
      token = df_kamus_alay.loc[df_kamus_alay['slang'] == token, 'formal'].values[0]
    if token not in stopwords:
      tokens_after.append(token)
  clean_tw = " ".join(tokens_after)
  return clean_tw

In [29]:
def stemming(tweet, stemmer):
  tokens = nltk.word_tokenize(tweet)
  stemmed_tokens = []
  for token in tokens:
    stemmed_token = stemmer.stem(token)
    stemmed_tokens.append(stemmed_token)
  stemmed_tweet = " ".join(stemmed_tokens)
  return stemmed_tweet

In [30]:
# List stopword
stopwords = pd.read_csv("stopwords.csv", header=None)
stopwords = stopwords[0].values

# Stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [31]:
def pre_processing(tweets):
  clean_tweets = []
  for tweet in tqdm(tweets):
    clean_tw = clean_tweet(tweet)
    clean_tw = remove_stopwords(clean_tw, stopwords)
    clean_tw = stemming(clean_tw, stemmer)
    clean_tweets.append(clean_tw)
  return clean_tweets

In [13]:
train = pd.read_excel("/content/drive/Shareddrives/tk1-anamedsos/df_training_labeled_structured.xlsx")

In [16]:
train = train[['description', 'gender', 'name']]

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10867 entries, 0 to 10866
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  8222 non-null   object
 1   gender       10867 non-null  object
 2   name         10865 non-null  object
dtypes: object(3)
memory usage: 254.8+ KB


In [19]:
train = train[train['gender'] != '-']

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10741 entries, 0 to 10866
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  8129 non-null   object
 1   gender       10741 non-null  object
 2   name         10740 non-null  object
dtypes: object(3)
memory usage: 335.7+ KB


In [21]:
train['description'] = train.description.fillna('None')

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10741 entries, 0 to 10866
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10741 non-null  object
 1   gender       10741 non-null  object
 2   name         10740 non-null  object
dtypes: object(3)
memory usage: 335.7+ KB


In [23]:
train.dropna(inplace=True)

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10740 entries, 0 to 10866
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10740 non-null  object
 1   gender       10740 non-null  object
 2   name         10740 non-null  object
dtypes: object(3)
memory usage: 335.6+ KB


In [32]:
desc =  train['description'].tolist() 

In [33]:
desc = pre_processing(desc)

100%|██████████| 10740/10740 [34:37<00:00,  5.17it/s]


In [34]:
train['description'] = desc

In [35]:
# train.to_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name_desc.csv", index=False)

In [36]:
def clean_name(name):
  name_clean = re.sub(r"\d+", " ", name)
  name_clean = name_clean.lower()
  name_clean = re.sub(r"[^\w\s]", " ", name_clean)
  name_clean = re.sub(r"\s+", " ", name_clean)
  name_clean = name_clean.strip()
  return name_clean

In [37]:
names = train['name'].tolist()

In [38]:
names = [clean_name(name) for name in tqdm(names)]

100%|██████████| 10740/10740 [00:00<00:00, 40694.51it/s]


In [39]:
train['name'] = names

In [40]:
# train.to_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name_desc.csv", index=False)

Modeling

In [166]:
desc_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=4000)

In [167]:
name_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=4000)

In [168]:
from sklearn.model_selection import train_test_split

In [207]:
train = pd.read_csv("/content/drive/Shareddrives/tk1-anamedsos/df_name_desc.csv")

In [208]:
train.dropna(inplace=True)

In [209]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10419 entries, 0 to 10739
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10419 non-null  object
 1   gender       10419 non-null  object
 2   name         10419 non-null  object
dtypes: object(3)
memory usage: 325.6+ KB


In [210]:
train, val = train_test_split(train, test_size=0.5, random_state=123)

In [211]:
train_name = train['name']
train_desc = train['description']

In [212]:
val_name = val['name']
val_desc = val['description']

In [175]:
# train_vec_name = name_vectorizer.fit_transform(np.array(train_name)).todense()

In [164]:
# train_vec_desc = desc_vectorizer.fit_transform(np.array(train_desc)).todense()

In [213]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('indobenchmark/indobert-large-p1')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229k [00:00<?, ?B/s]



In [214]:
train_vec_desc = model.encode(train_desc.tolist())

In [183]:
# val_vec_name = name_vectorizer.transform(np.array(val_name)).todense()

In [215]:
val_vec_desc = model.encode(val_desc.tolist())

In [216]:
print(train_vec_name.shape)
print(train_vec_desc.shape)

(5209, 4000)
(5209, 1024)


In [217]:
print(val_vec_name.shape)
print(val_vec_desc.shape)

(5210, 4000)
(5210, 1024)


In [223]:
# X_train = np.append(train_vec_name, train_vec_desc, axis=1)
X_train = train_vec_desc

In [224]:
# X_val = np.append(val_vec_name, val_vec_desc, axis=1)
X_val = val_vec_desc

In [225]:
print(X_train.shape)
print(X_val.shape)

(5209, 1024)
(5210, 1024)


In [226]:
y_train = train.gender
y_val = val.gender

In [227]:
print(y_train.shape)
print(y_val.shape)

(5209,)
(5210,)


In [234]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score

svc = SVC(kernel="poly", class_weight='balanced')
svc.fit(X_train, y_train)

y_pred = svc.predict(X_val)
print(f1_score(y_val, y_pred, pos_label='pria'))

0.5676126878130218


In [235]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

        pria       0.55      0.59      0.57      2588
      wanita       0.56      0.51      0.54      2622

    accuracy                           0.55      5210
   macro avg       0.55      0.55      0.55      5210
weighted avg       0.55      0.55      0.55      5210



In [238]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(class_weight='balanced', n_estimators=100)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_val)
print(f1_score(y_val, y_pred_rf, pos_label='pria'))

0.5714285714285714


In [239]:
print(classification_report(y_val, y_pred_rf))

              precision    recall  f1-score   support

        pria       0.54      0.61      0.57      2588
      wanita       0.56      0.48      0.52      2622

    accuracy                           0.55      5210
   macro avg       0.55      0.55      0.54      5210
weighted avg       0.55      0.55      0.54      5210



Predict unseen data

In [240]:
test = pd.read_excel("/content/drive/Shareddrives/tk1-anamedsos/df_testing_structured.xlsx")

In [241]:
test

Unnamed: 0,id,created_at,description,followers_count,following_count,listed_count,location,name,tweet_count,username,verified
0,1.0,2017-05-12,A place to record the insights I receive on th...,94952.0,427.0,59.0,Inquiry: aisykaspol@gmail.com,‏َ,26004.0,lilithkis,0.0
1,2.0,2012-03-09,Official Twitter of Mario Teguh. \nAdmin@exnal...,9181064.0,0.0,3007.0,"Jakarta, Indonesia",Mario Teguh,55031.0,marioteguh,1.0
2,3.0,2019-11-15,Apprentice Python programmer :)\n\nhttps://t.c...,25.0,217.0,1.0,,Florentin Anggraini Purnama,51.0,flo_and_behold,0.0
3,4.0,2013-04-10,A proud Indonesian. Bangga Berbangsa.,1049638.0,309.0,402.0,Indonesia,Gita Wirjawan,17653.0,gwirjawan,1.0
4,5.0,2012-01-11,Chairman MNC Group | Ketum Partai Perindo | Ke...,551980.0,374.0,375.0,Jakarta Capital Region,Hary Tanoesoedibjo,9185.0,hary_tanoe,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1049,1050.0,2013-11-16,"S.W.I.M.M.E.R || KA bandung,indonesia",75.0,261.0,0.0,,farrel tangkas,18.0,farreltangkas,0.0
1050,1051.0,2011-07-25,IG: @fithrisyamsu,2706.0,148.0,4.0,,Fithri Syamsu,6085.0,fithrisyamsu,0.0
1051,1052.0,2021-06-28,Jonathan Xavier Hartono - Class of 2022 - Golf...,1.0,17.0,0.0,"Jakarta Capital Region, Indone",Jonathan Xavier Hartono,1.0,jojoxh_,0.0
1052,1053.0,2012-05-06,seethegooder.,1072.0,710.0,1.0,Indonesia,Dewi Putri Sungging,9892.0,dpsungging,0.0


In [242]:
test = test[['name', 'description']]

In [243]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1054 non-null   object
 1   description  893 non-null    object
dtypes: object(2)
memory usage: 16.6+ KB


In [244]:
test.fillna("None", inplace=True)

In [245]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1054 non-null   object
 1   description  1054 non-null   object
dtypes: object(2)
memory usage: 16.6+ KB


In [246]:
test_name = test['name'].tolist()

In [81]:
test_name = [clean_name(name) for name in test_name]
test['name'] = test_name

In [82]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1054 non-null   object
 1   description  1054 non-null   object
dtypes: object(2)
memory usage: 16.6+ KB


In [83]:
test_desc = test['description'].tolist()
test_desc = pre_processing(test_desc)

test['description'] = test_desc

100%|██████████| 1054/1054 [05:07<00:00,  3.43it/s]


In [84]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1054 non-null   object
 1   description  1054 non-null   object
dtypes: object(2)
memory usage: 16.6+ KB


In [125]:
test.to_csv("/content/drive/Shareddrives/tk1-anamedsos/test_name_desc.csv", index=False)

In [247]:
test = pd.read_csv("/content/drive/Shareddrives/tk1-anamedsos/test_name_desc.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1046 non-null   object
 1   description  1041 non-null   object
dtypes: object(2)
memory usage: 16.6+ KB


In [248]:
test.fillna('', inplace=True)

In [252]:
# test_vec_name = name_vectorizer.transform(np.array(test['name'])).todense()
test_vec_desc = model.encode(test['description'])

KeyboardInterrupt: ignored

In [253]:
X_test = test_vec_desc

print(X_test.shape)

(1054, 1024)


In [254]:
y_test = rf.predict(X_test)

In [255]:
y_test

array(['pria', 'pria', 'pria', ..., 'pria', 'wanita', 'pria'],
      dtype=object)

In [256]:
df_test = pd.DataFrame(y_test)

In [257]:
df_test.index = range(1,len(df_test)+1)

In [258]:
df_test

Unnamed: 0,0
1,pria
2,pria
3,pria
4,wanita
5,pria
...,...
1050,pria
1051,pria
1052,pria
1053,wanita


In [259]:
df_test.to_csv("result_name_desc.csv", header=False)