# Target Variable: Tip

In [1]:
import pandas as pd
import numpy as np
# 결측치 제거
from sklearn.impute import SimpleImputer
# 정규화, 표준화
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
# PCA
from sklearn.decomposition import PCA
# regression feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [2]:
# Read the tips dataset; the first CSV column becomes the row index.
tips = pd.read_csv("tips2.csv", index_col=0)
# Preview the last five rows (five is the default for tail()).
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.0,Female,No,Thur,Dinner,2.0
244,16.99,,Female,No,Sun,Dinner,2.0
245,20.34,1.66,,No,Sun,Dinner,3.0
246,13.23,2.66,Male,Yes,Sat,,
247,26.34,2.2,Female,No,Fri,Lunch,4.0


In [3]:
# Convert the categorical text columns to integer codes.
#
# The result is assigned back to `tips` rather than calling
# `tips[col].replace(..., inplace=True)`: that form mutates a temporary
# Series obtained by chained indexing, which pandas warns about and which
# leaves `tips` untouched once Copy-on-Write is active.
category_codes = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'No': 0, 'Yes': 1},
    'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    'time': {'Lunch': 0, 'Dinner': 1},
}
# Nested-dict form: each outer key names the column its mapping applies to.
# Missing entries match no key and stay NaN. infer_objects() makes the
# resulting numeric dtypes explicit instead of relying on replace()'s
# implicit (deprecated) downcasting of object columns.
tips = tips.replace(category_codes).infer_objects()
tips.tail(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.0,0.0,0,0,1.0,2.0
244,16.99,,0.0,0,3,1.0,2.0
245,20.34,1.66,,0,3,1.0,3.0
246,13.23,2.66,1.0,1,2,,
247,26.34,2.2,0.0,0,1,0.0,4.0


In [4]:
# Impute missing values: every NaN is replaced by the median of its column.
# NOTE(review): this also fills missing values of the target column `tip`
# and of the encoded categorical columns with a median -- confirm that this
# is intended (dropping rows without a target, or `most_frequent` for the
# categorical codes, may be more appropriate).
data = tips.values
imputer = SimpleImputer(strategy='median')
data_trans = imputer.fit_transform(data)
# Take the labels from `tips` itself instead of repeating the column names
# by hand, so the frame stays correctly labelled if the CSV schema changes,
# and keep the original row index.
tips_imp = pd.DataFrame(data_trans, columns=tips.columns, index=tips.index)
tips_imp.tail(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.0,0.0,0.0,0.0,1.0,2.0
244,16.99,2.83,0.0,0.0,3.0,1.0,2.0
245,20.34,1.66,1.0,0.0,3.0,1.0,3.0
246,13.23,2.66,1.0,1.0,2.0,1.0,2.0
247,26.34,2.2,0.0,0.0,1.0,0.0,4.0


In [5]:
# Verify that no missing values remain: count the NaNs in each column.
tips_imp.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

## RFE (Recursive Feature Elimination)

In [6]:
# Target variable: tip (column position 1); every other column is a feature.
n = 1
target_name = tips_imp.columns[n]
y = tips_imp.iloc[:, n]
X = tips_imp.drop(columns=[target_name])

# Feature extraction: recursive feature elimination around a linear SVR,
# keeping four features.
estimator = SVR(kernel="linear")
rfe = RFE(estimator, n_features_to_select=4)
rfe.fit(X, y)

# Report, per feature position in X, its elimination rank and whether it
# was kept.
for position, (rank, kept) in enumerate(zip(rfe.ranking_, rfe.support_)):
    print('Column: %d, Rank: %d, Selected=%s' % (position, rank, kept))

Column: 0, Rank: 1, Selected=True
Column: 1, Rank: 2, Selected=False
Column: 2, Rank: 3, Selected=False
Column: 3, Rank: 1, Selected=True
Column: 4, Rank: 1, Selected=True
Column: 5, Rank: 1, Selected=True


In [7]:
# Build the RFE-reduced frame: keep the target plus every feature RFE
# selected. X holds the feature columns in the same order as rfe.support_,
# so inverting the mask yields the labels of the rejected features.
rejected_features = X.columns[~rfe.support_]
tips_RFE = tips_imp.drop(columns=rejected_features)
tips_RFE

Unnamed: 0,total_bill,tip,day,time,size
0,16.99,1.01,3.0,1.0,2.0
1,10.34,1.66,3.0,1.0,3.0
2,21.01,3.50,3.0,1.0,3.0
3,23.68,3.31,3.0,1.0,2.0
4,24.59,3.61,3.0,1.0,4.0
...,...,...,...,...,...
243,18.78,3.00,0.0,1.0,2.0
244,16.99,2.83,3.0,1.0,2.0
245,20.34,1.66,3.0,1.0,3.0
246,13.23,2.66,2.0,1.0,2.0


In [8]:
# Normalization: min-max scale every column of the RFE-reduced frame.
trans = MinMaxScaler()
tips_N = trans.fit_transform(tips_RFE)
# Pass the column Index itself. Wrapping it in a list (`[tips_RFE.columns]`)
# makes pandas build a one-level MultiIndex, so the labels become 1-tuples
# such as ('tip',) instead of plain strings. The row index is carried over
# as well so the scaled frame stays aligned with tips_RFE.
tips_RFE_norm = pd.DataFrame(tips_N, columns=tips_RFE.columns, index=tips_RFE.index)
tips_RFE_norm.describe()

Unnamed: 0,total_bill,tip,day,time,size
count,248.0,248.0,248.0,248.0,248.0
mean,0.349956,0.220847,0.577957,0.721774,0.314516
std,0.185412,0.152904,0.384375,0.449031,0.189905
min,0.0,0.0,0.0,0.0,0.0
25%,0.215281,0.111111,0.25,0.0,0.2
50%,0.308442,0.203333,0.666667,1.0,0.2
75%,0.441087,0.28,1.0,1.0,0.4
max,1.0,1.0,1.0,1.0,1.0


In [9]:
# Standardization: scale every column of the RFE-reduced frame with
# StandardScaler.
sc = StandardScaler()
tips_S = sc.fit_transform(tips_RFE)
# Pass the column Index itself. Wrapping it in a list (`[tips_RFE.columns]`)
# makes pandas build a one-level MultiIndex, so the labels become 1-tuples
# such as ('tip',) instead of plain strings. The row index is carried over
# as well so the scaled frame stays aligned with tips_RFE.
tips_RFE_stan = pd.DataFrame(tips_S, columns=tips_RFE.columns, index=tips_RFE.index)
# Round to whole numbers so the summary table is easy to scan.
tips_RFE_stan.describe().round()

Unnamed: 0,total_bill,tip,day,time,size
count,248.0,248.0,248.0,248.0,248.0
mean,-0.0,0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0
min,-2.0,-1.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-2.0,-1.0
50%,-0.0,-0.0,0.0,1.0,-1.0
75%,0.0,0.0,1.0,1.0,0.0
max,4.0,5.0,1.0,1.0,4.0


## PCA (Principal Component Analysis)

In [10]:
# Project the imputed frame onto four principal components.
# NOTE(review): the input still contains the target column `tip` and is not
# scaled beforehand -- confirm that both are intended.
trans = PCA(n_components=4)
tips_PCA = trans.fit_transform(tips_imp)
# Show the first three projected rows.
tips_PCA[:3]

array([[-2.98500905, -1.14716027,  1.78842197,  0.01180337],
       [-9.44192098, -1.5997611 ,  0.23424027,  1.04198106],
       [ 1.33645028, -1.33819469, -0.34697112,  0.2621155 ]])

## Regression feature selection (SelectKBest with f_regression)

In [11]:
# Target variable: tip (column position 1); every other column is a feature.
n = 1
target_name = tips_imp.columns[n]
y = tips_imp.iloc[:, n]
X = tips_imp.drop(columns=[target_name])

# Feature extraction: keep the four features with the best univariate
# f_regression score against the target.
fs = SelectKBest(score_func=f_regression, k=4)
tips_RFS = fs.fit_transform(X, y)
# Positions (within X) of the selected features.
fs.get_support(indices=True)

array([0, 3, 4, 5])

In [12]:
# Build the SelectKBest-reduced frame: keep the target plus every selected
# feature. selectC holds positions within X, so the rejected labels are the
# X columns whose position is not listed there.
selectC = fs.get_support(indices=True)
rejected_features = [
    label for position, label in enumerate(X.columns)
    if position not in selectC
]
tips_RFS = tips_imp.drop(columns=rejected_features)
tips_RFS

Unnamed: 0,total_bill,tip,day,time,size
0,16.99,1.01,3.0,1.0,2.0
1,10.34,1.66,3.0,1.0,3.0
2,21.01,3.50,3.0,1.0,3.0
3,23.68,3.31,3.0,1.0,2.0
4,24.59,3.61,3.0,1.0,4.0
...,...,...,...,...,...
243,18.78,3.00,0.0,1.0,2.0
244,16.99,2.83,3.0,1.0,2.0
245,20.34,1.66,3.0,1.0,3.0
246,13.23,2.66,2.0,1.0,2.0


In [13]:
# Normalization: min-max scale every column of the SelectKBest-reduced frame.
trans = MinMaxScaler()
tips_N = trans.fit_transform(tips_RFS)
# Label the result with tips_RFS's own columns. The earlier
# `[tips_RFE.columns]` borrowed the labels from the RFE section, which is
# only right while both selectors happen to pick the same features, and the
# surrounding list turned them into a one-level MultiIndex of 1-tuples.
# The row index is carried over so the frame stays aligned with tips_RFS.
tips_RFS_norm = pd.DataFrame(tips_N, columns=tips_RFS.columns, index=tips_RFS.index)
tips_RFS_norm.describe()

Unnamed: 0,total_bill,tip,day,time,size
count,248.0,248.0,248.0,248.0,248.0
mean,0.349956,0.220847,0.577957,0.721774,0.314516
std,0.185412,0.152904,0.384375,0.449031,0.189905
min,0.0,0.0,0.0,0.0,0.0
25%,0.215281,0.111111,0.25,0.0,0.2
50%,0.308442,0.203333,0.666667,1.0,0.2
75%,0.441087,0.28,1.0,1.0,0.4
max,1.0,1.0,1.0,1.0,1.0


In [14]:
# Standardization: scale every column of the SelectKBest-reduced frame with
# StandardScaler.
sc = StandardScaler()
tips_S = sc.fit_transform(tips_RFS)
# Label the result with tips_RFS's own columns. The earlier
# `[tips_RFE.columns]` borrowed the labels from the RFE section, which is
# only right while both selectors happen to pick the same features, and the
# surrounding list turned them into a one-level MultiIndex of 1-tuples.
# The row index is carried over so the frame stays aligned with tips_RFS.
tips_RFS_stan = pd.DataFrame(tips_S, columns=tips_RFS.columns, index=tips_RFS.index)
# Round to whole numbers so the summary table is easy to scan.
tips_RFS_stan.describe().round()

Unnamed: 0,total_bill,tip,day,time,size
count,248.0,248.0,248.0,248.0,248.0
mean,-0.0,0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0
min,-2.0,-1.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-2.0,-1.0
50%,-0.0,-0.0,0.0,1.0,-1.0
75%,0.0,0.0,1.0,1.0,0.0
max,4.0,5.0,1.0,1.0,4.0
