In [5]:
import pandas as pd  # DataFrame library used for loading and inspecting the data

# Read the voter CSV; the "utf-8-sig" codec skips a UTF-8 byte-order mark if one is present.
df = pd.read_csv("data-pemilih-kpu.csv", encoding="utf-8-sig")

In [6]:
# Dataset dimensions as (rows, columns); the raw file gives (13137, 2).
(len(df.index), len(df.columns))

(13137, 2)

In [7]:
# Show the first 5 rows of the dataset.
df.head(n=5)

Unnamed: 0,nama,jenis_kelamin
0,ERWIN TJAHJONO,Laki-Laki
1,DAVIANDRIE ANDIKA BAHROENY,Laki-Laki
2,ELAN KURNIAWAN,Laki-Laki
3,AYU DWI CAHYANING MUKTI,Perempuan
4,WAHYOEDIN,Laki-Laki


In [8]:
# Show the last 5 rows of the dataset.
df.tail(n=5)

Unnamed: 0,nama,jenis_kelamin
13132,HERMANSYAH,Laki-Laki
13133,SITA.HJ,Perempuan
13134,MASNI TAMBUNAN,Perempuan
13135,MARJANEDI,Laki-Laki
13136,NGALIMAN,Laki-Laki


In [9]:
# True if at least one cell anywhere in the frame is null.
pd.isnull(df).values.any()

True

In [10]:
# Number of rows that contain at least one null value
# (sum over the per-row boolean mask; int() keeps the result a plain Python int).
int(df.isnull().any(axis=1).sum())

187

In [11]:
# Drop every row that has a missing value in ANY column, then re-count the
# incomplete rows to confirm that none remain.
# how='any' matches the check below: how='all' would only remove rows in which
# every column is null, so a partially filled row (name without gender, or the
# reverse) would survive and the count could stay above zero.
df = df.dropna(how='any')
len(df[pd.isnull(df).any(axis=1)])

0

In [12]:
# mengecek dimensi dataset
df.shape

(12950, 2)

In [13]:
# Encode the gender label as an integer target (Laki-Laki = 1, Perempuan = 0).
jk_map = {"Laki-Laki": 1, "Perempuan": 0}
jk_encoded = df["jenis_kelamin"].map(jk_map)

# Series.map() yields NaN for every value that is not a key of jk_map
# (different casing, stray whitespace, or this cell being re-run on data that
# is already encoded). Fail loudly instead of silently writing NaN targets.
unmapped = df.loc[jk_encoded.isnull(), "jenis_kelamin"].unique()
if len(unmapped) > 0:
    raise ValueError(
        "Unexpected jenis_kelamin values (not in jk_map): {0}".format(list(unmapped))
    )

df["jenis_kelamin"] = jk_encoded

In [14]:
# Re-inspect the first 5 rows to confirm the gender column is now numeric.
df.head(n=5)

Unnamed: 0,nama,jenis_kelamin
0,ERWIN TJAHJONO,1
1,DAVIANDRIE ANDIKA BAHROENY,1
2,ELAN KURNIAWAN,1
3,AYU DWI CAHYANING MUKTI,0
4,WAHYOEDIN,1


In [15]:
# Class balance of the dataset: rows labelled male (1) versus female (0).

num_obs = df.shape[0]
# Summing the boolean comparison counts the matching rows.
num_true = int((df['jenis_kelamin'] == 1).sum())
num_false = int((df['jenis_kelamin'] == 0).sum())

print("Jumlah Pria:  {0} ({1:2.2f}%)".format(num_true, num_true / num_obs * 100))
print("Jumlah Wanita: {0} ({1:2.2f}%)".format(num_false, num_false / num_obs * 100))

Jumlah Pria:  6162 (47.58%)
Jumlah Wanita: 6788 (52.42%)


In [16]:
from sklearn.model_selection import train_test_split

# Column holding the input text, and column holding the encoded target.
feature_col_names = ["nama"]
predicted_class_names = ["jenis_kelamin"]

# Selecting with a list of column names keeps both arrays 2-D, shape (n_rows, 1).
X = df.loc[:, feature_col_names].values
y = df.loc[:, predicted_class_names].values
split_test_size = 0.30  # fraction of rows held out for testing

# Stratify on y so both partitions keep the same class ratio;
# random_state fixes the shuffle so the split is reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_test_size,
    stratify=y,
    random_state=42,
)


In [17]:
def _print_class_share(caption, labels, class_value):
    """Print how many entries of `labels` equal `class_value` and their share in percent.

    caption     -- text shown before the colon (left-aligned to 23 characters so
                   the colons of all report lines line up)
    labels      -- pandas Series or NumPy array of class labels
    class_value -- the label to count (1 = male, 0 = female in this notebook)
    """
    count = int((labels == class_value).sum())
    share = count / len(labels) * 100.0
    print("{0:<23} : {1} ({2:0.2f}%)".format(caption, count, share))


# Compare the class balance of the full dataset with both partitions to verify
# that the stratified split preserved the male/female ratio.
_print_class_share("Dataset Asli Pria", df['jenis_kelamin'], 1)
_print_class_share("Dataset Asli Wanita", df['jenis_kelamin'], 0)
print("")
_print_class_share("Dataset Training Pria", y_train, 1)
_print_class_share("Dataset Training Wanita", y_train, 0)
print("")
_print_class_share("Dataset Test Pria", y_test, 1)
_print_class_share("Dataset Test Wanita", y_test, 0)

Dataset Asli Pria       : 6162 (47.58%)
Dataset Asli Wanita     : 6788 (52.42%)

Dataset Training Pria   : 4313 (47.58%)
Dataset Training Wanita : 4752 (52.42%)

Dataset Test Pria       : 1849 (47.59%)
Dataset Test Wanita     : 2036 (52.41%)


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over character n-grams of length 2 to 6 ('char_wb' builds the
# n-grams only from text inside word boundaries).
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))

# Learn the n-gram vocabulary from the training names and encode them in one
# pass; the test names are then encoded with that same vocabulary.
X_train = vectorizer.fit_transform(text_train.ravel())
X_test = vectorizer.transform(text_test.ravel())


In [20]:
from sklearn.linear_model import LogisticRegression

# Name the solver explicitly: scikit-learn < 0.22 defaulted to 'liblinear'
# (emitting a FutureWarning), while newer releases default to 'lbfgs'.
# Pinning it silences the warning and keeps the fitted model the same
# regardless of the installed scikit-learn version.
clf = LogisticRegression(solver='liblinear')

# The target is stored as an (n, 1) column; ravel() flattens it to the 1-D
# shape fit() expects.
clf.fit(X_train, y_train.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
# Mean accuracy on the training split first, then on the held-out test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))

0.9966905681191396
0.9364221364221365


In [23]:
from sklearn import metrics

# Predicted classes for the held-out test split.
clf_predict = clf.predict(X_test)

# Test-set metrics. labels=[1, 0] puts class 1 (male) first in both the
# confusion matrix and the classification report.
test_accuracy = metrics.accuracy_score(y_test, clf_predict)
test_confusion = metrics.confusion_matrix(y_test, clf_predict, labels=[1, 0])
test_report = metrics.classification_report(y_test, clf_predict, labels=[1, 0])

print("Accuracy: {0:.4f}".format(test_accuracy))
print(test_confusion)
print("")
print("Classification Report")
print(test_report)

Accuracy: 0.9364
[[1727  122]
 [ 125 1911]]

Classification Report
              precision    recall  f1-score   support

           1       0.93      0.93      0.93      1849
           0       0.94      0.94      0.94      2036

   micro avg       0.94      0.94      0.94      3885
   macro avg       0.94      0.94      0.94      3885
weighted avg       0.94      0.94      0.94      3885



In [24]:
# Map the integer class back to its human-readable label.
jk_label = {1: "Laki-Laki", 0: "Perempuan"}

# Encode one sample name with the fitted vectorizer and classify it.
test_predict = vectorizer.transform(["niky felina"])
res = clf.predict(test_predict)

# predict() returns a 1-element array. Index it explicitly: calling int() on an
# array with ndim > 0 is deprecated since NumPy 1.25.
print(jk_label[int(res[0])])

Perempuan


In [None]:
import numpy as np  # needed for np.mean below; no other cell imports it
from sklearn.pipeline import Pipeline

# Vectorizer and logistic regression bundled into a single estimator that
# accepts raw name strings.
# The solver is named explicitly because the scikit-learn default changed from
# 'liblinear' to 'lbfgs' in 0.22; pinning it keeps results version-independent.
clf_lg = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Assign the return value to `_` so the fitted pipeline's repr is not echoed.
_ = clf_lg.fit(text_train.ravel(), y_train.ravel())
predicted = clf_lg.predict(text_test.ravel())

# Fraction of test names classified correctly (accuracy).
np.mean(predicted == y_test.ravel())