In [25]:
import random as rd
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import missingno as msno
import warnings
from matplotlib import style
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [26]:
df = pd.read_csv('adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


<h3>1. Cek Null Values</h3>
<p>Langkah pertama yang kami lakukan adalah melakukan pengecekan terhadap null values</p>

In [27]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [28]:
df.workclass.unique()

array(['?', 'Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc',
       'Self-emp-inc', 'Local-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

<p>Setelah mengeksplorasi dataset, kami menemukan kalau null values pada dataset ini memiliki value "?". Maka dari itu kami akan mengganti value "?" ini dengan None atau NaN agar null value menjadi lebih mudah diolah</p>

In [29]:
#Ganti value "?" menjadi none agar dapat lebih mudah mengolah null valuenya
df = df.replace({'?': None})
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

<p>Setelah kami cek null valuenya, ternyata atribut yang memiliki null value adalah atribut workclass, occupation, dan native.country. Langkah selanjutnya kami akan melakukan replace null value tersebut. Dikarenakan atribut-atribut tersebut merupakan atribut categorical, maka null value nya akan kami ganti menjadi nilai modus masing masing atribut</p>

In [30]:
df.fillna(df.mode().iloc[0], inplace=True)

In [31]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

<p>Null value pada dataset sudah tidak ada</p>

<h3>2. Encode Atribut Categorical</h3>
<p>Pada tahap ini kami mengkonversi atribut categorical menjadi numerical</p>

In [32]:
categorical = df.dtypes==object
categorical_cols = df.columns[categorical].tolist()
df[categorical_cols] = df[categorical_cols].apply(lambda col: LabelEncoder().fit_transform(col))
df[categorical_cols].head()
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,3,77053,11,9,6,9,1,4,0,0,4356,40,38,0
1,82,3,132870,11,9,6,3,1,4,0,0,4356,18,38,0
2,66,3,186061,15,10,6,9,4,2,0,0,4356,40,38,0
3,54,3,140359,5,4,0,6,4,4,0,0,3900,40,38,0
4,41,3,264663,15,10,5,9,3,4,0,0,3900,40,38,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,3,310152,15,10,4,10,1,4,1,0,0,40,38,0
32557,27,3,257302,7,12,2,12,5,4,0,0,0,38,38,0
32558,40,3,154374,11,9,2,6,0,4,1,0,0,40,38,1
32559,58,3,151910,11,9,6,0,4,4,0,0,0,40,38,0


<h3>3. Feature Scaling</h3>
<p>Langkah terakhir yang kammi lakukan adalah melakukan standarisasi pada tiap atribut</p>

In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)

In [38]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,3.769612,-0.085296,-1.067997,0.181332,-0.420060,2.249480,0.720237,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-0.035429,0.26137,-0.563199
1,3.183112,-0.085296,-0.539169,0.181332,-0.420060,2.249480,-0.790092,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-1.817204,0.26137,-0.563199
2,2.010110,-0.085296,-0.035220,1.214869,-0.031360,2.249480,0.720237,1.589322,-1.962621,-1.422331,-0.14592,10.593507,-0.035429,0.26137,-0.563199
3,1.130359,-0.085296,-0.468215,-1.368974,-2.363558,-1.734058,-0.034928,1.589322,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.26137,-0.563199
4,0.177296,-0.085296,0.709482,1.214869,-0.031360,1.585557,0.720237,0.966947,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.26137,-0.563199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-1.215643,-0.085296,1.140460,1.214869,-0.031360,0.921634,0.971958,-0.277805,0.393668,0.703071,-0.14592,-0.216660,-0.035429,0.26137,-0.563199
32557,-0.849080,-0.085296,0.639741,-0.852205,0.746039,-0.406212,1.475401,2.211698,0.393668,-1.422331,-0.14592,-0.216660,-0.197409,0.26137,-0.563199
32558,0.103983,-0.085296,-0.335433,0.181332,-0.420060,-0.406212,-0.034928,-0.900181,0.393668,0.703071,-0.14592,-0.216660,-0.035429,0.26137,1.775573
32559,1.423610,-0.085296,-0.358777,0.181332,-0.420060,2.249480,-1.545256,1.589322,0.393668,-1.422331,-0.14592,-0.216660,-0.035429,0.26137,-0.563199
