In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer # melakukan transformasi (fit transform = transformer)
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectPercentile, RFE

# Utilities
import warnings
warnings.filterwarnings("ignore")
from sklearn.utils.testing import ignore_warnings

In [2]:
df = pd.read_csv('8. adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [3]:
df.replace("?",np.nan,inplace=True)

## Data Preprocessing

In [4]:
pipeline_binary = Pipeline([
    ('simple imputer',SimpleImputer(strategy='constant',fill_value='NC')),
    ('Binary encoding',ce.BinaryEncoder())
])

In [5]:
transformer = ColumnTransformer([
    ('one hot',OneHotEncoder(drop='first'),['relationship','race','sex']),
    ('binary',pipeline_binary,['workclass','marital.status','occupation','native.country']),
],remainder='passthrough')

## Data Splitting

In [6]:
X = df.drop(['income','fnlwgt','education'],axis = 1)
y = np.where(df['income']=='>50K',1,0)

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                stratify=y,
                                                random_state=10,
                                                test_size=0.3)

## Data Transform

In [8]:
X_train_prep = transformer.fit_transform(X_train)
X_test_prep = transformer.transform(X_test)

In [9]:
X_train_prep=pd.DataFrame(X_train_prep)
X_test_prep=pd.DataFrame(X_test_prep)

In [10]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_Not-in-family', 'x0_Other-relative', 'x0_Own-child',
       'x0_Unmarried', 'x0_Wife', 'x1_Asian-Pac-Islander', 'x1_Black',
       'x1_Other', 'x1_White', 'x2_Male'], dtype=object)

In [11]:
transformer.transformers_[1][1]

Pipeline(steps=[('simple imputer',
                 SimpleImputer(fill_value='NC', strategy='constant')),
                ('Binary encoding', BinaryEncoder())])

In [12]:
transformer.transformers_[1][1]['Binary encoding'].get_feature_names()

['0_0',
 '0_1',
 '0_2',
 '0_3',
 '0_4',
 '1_0',
 '1_1',
 '1_2',
 '1_3',
 '2_0',
 '2_1',
 '2_2',
 '2_3',
 '2_4',
 '3_0',
 '3_1',
 '3_2',
 '3_3',
 '3_4',
 '3_5',
 '3_6']

In [13]:
feature = list(transformer.transformers_[0][1].get_feature_names())+transformer.transformers_[1][1]['Binary encoding'].get_feature_names()+['age','education_num','capital.gain','capital.loss','hours.per.week']

In [14]:
X_train_prep.columns=feature
X_test_prep.columns=feature

## Select Percentile

In [15]:
select = SelectPercentile(percentile=50)
select.fit(X_train_prep,y_train)

SelectPercentile(percentile=50)

In [16]:
X_train_prep_selected = select.transform(X_train_prep)
X_test_prep_selected = select.transform(X_test_prep)

In [17]:
print('before selection',X_train_prep.shape)
print('after selection',X_train_prep_selected.shape)

before selection (22792, 36)
after selection (22792, 18)


feature yang diambil sebanyak 18, 50% dari base feature

melihat model apa saja yang terpilih

In [18]:
mask=select.get_support() #membuka feature mana saja yang terpilih berdasarkan select percentile

In [19]:
selected_features = pd.DataFrame({'indicator':mask,'features':feature})
selected_features[selected_features['indicator']==True]

Unnamed: 0,indicator,features
0,True,x0_Not-in-family
1,True,x0_Other-relative
2,True,x0_Own-child
3,True,x0_Unmarried
4,True,x0_Wife
6,True,x1_Black
9,True,x2_Male
12,True,0_2
16,True,1_1
17,True,1_2


## Model Fitting and evaluation
menggunakan logistic regression

### Sebelum dilakukan univariate feature selection

In [20]:
log = LogisticRegression()
log.fit(X_test_prep,y_test)
print('score =',accuracy_score(y_test,log.predict(X_test_prep)))

score = 0.8448152318558706


### Setelah dilakukan univariate feature selection

In [21]:
log = LogisticRegression()
log.fit(X_test_prep_selected,y_test)
print('score =',accuracy_score(y_test,log.predict(X_test_prep_selected)))

score = 0.8390828129798341


terdapat penurunan performa setelah dilakukan univariate feature selection, hal ini bisa disebabkan oleh feature yang dihilangkan kemungkinan memiliki keterkaitan dengan feature lainnya, sehingga saat feature tersebut hilang, feature yang masih ada menjadi tidak terlalu berpangaruh terhadap model dibandingkan sebelum feature yang dihilangkan hilang