In [1]:
# 필요한 모듈 import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# 데이터 파일 업로드

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sub = pd.read_csv('./sample_submission.csv')

In [3]:
# EDA 및 데이터 전처리

In [5]:
train.head()

Unnamed: 0,id,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,1,10,2392,Ex,3,968,Ex,2392,2392,Ex,2,2003,2003,2003,386250
1,2,7,1352,Gd,2,466,Gd,1352,1352,Ex,2,2006,2007,2006,194000
2,3,5,900,TA,1,288,TA,864,900,TA,1,1967,1967,1967,123000
3,4,5,1174,TA,2,576,Gd,680,680,TA,1,1900,2006,2000,135000
4,5,7,1958,Gd,3,936,Gd,1026,1026,Gd,2,2005,2005,2005,250000


In [6]:
test.head()

Unnamed: 0,id,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt
0,1,9,1800,Gd,2,702,Ex,1800,1800,Ex,2,2007,2007,2007
1,2,6,1082,TA,1,240,TA,1082,1082,TA,1,1948,1950,1948
2,3,6,1573,Gd,2,440,Gd,756,769,Gd,2,2000,2000,2000
3,4,6,2443,Gd,3,744,Gd,1158,1158,Gd,2,2004,2004,2004
4,5,5,1040,TA,2,686,TA,1040,1040,TA,1,1968,1968,1991


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 15 columns):
id                1350 non-null int64
Overall Qual      1350 non-null int64
Gr Liv Area       1350 non-null int64
Exter Qual        1350 non-null object
Garage Cars       1350 non-null int64
Garage Area       1350 non-null int64
Kitchen Qual      1350 non-null object
Total Bsmt SF     1350 non-null int64
1st Flr SF        1350 non-null int64
Bsmt Qual         1350 non-null object
Full Bath         1350 non-null int64
Year Built        1350 non-null int64
Year Remod/Add    1350 non-null int64
Garage Yr Blt     1350 non-null int64
target            1350 non-null int64
dtypes: int64(12), object(3)
memory usage: 158.3+ KB


In [8]:
test.info() # train, test 모두 결측치 없음, train과 test 비율 1:1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 14 columns):
id                1350 non-null int64
Overall Qual      1350 non-null int64
Gr Liv Area       1350 non-null int64
Exter Qual        1350 non-null object
Garage Cars       1350 non-null int64
Garage Area       1350 non-null int64
Kitchen Qual      1350 non-null object
Total Bsmt SF     1350 non-null int64
1st Flr SF        1350 non-null int64
Bsmt Qual         1350 non-null object
Full Bath         1350 non-null int64
Year Built        1350 non-null int64
Year Remod/Add    1350 non-null int64
Garage Yr Blt     1350 non-null int64
dtypes: int64(11), object(3)
memory usage: 147.8+ KB


In [8]:
# 품질 부분 encoding
# 나머지 scaling 후 회귀분석

In [4]:
# encoding을 위해 품질 부분 고유값 확인

print(train['Exter Qual'].unique(),train['Kitchen Qual'].unique(),train['Bsmt Qual'].unique())
print(test['Exter Qual'].unique(),test['Kitchen Qual'].unique(),test['Bsmt Qual'].unique())
# train과 test의 kichen qual 부분 고유값이 다름 -> label encoding, 나머지는 one hot encoding

['Ex' 'Gd' 'TA' 'Fa'] ['Ex' 'Gd' 'TA' 'Fa'] ['Ex' 'TA' 'Gd' 'Fa' 'Po']
['Gd' 'TA' 'Ex' 'Fa'] ['Ex' 'TA' 'Gd' 'Fa' 'Po'] ['Ex' 'TA' 'Gd' 'Fa' 'Po']


In [5]:
# label encoding (for문 이용)

train_kichen = []
test_kichen = []

# train data
for i in range(len(train)):
    if train['Kitchen Qual'][i]=='Ex':
        train_kichen.append(0)
    elif train['Kitchen Qual'][i]=='Gd':
        train_kichen.append(1)
    elif train['Kitchen Qual'][i]=='TA':
        train_kichen.append(2)
    elif train['Kitchen Qual'][i]=='Fa':
        train_kichen.append(3)
    else:
        train_kichen.append(4)

# test data
for i in range(len(train)):
    if test['Kitchen Qual'][i]=='Ex':
        test_kichen.append(0)
    elif test['Kitchen Qual'][i]=='Gd':
        test_kichen.append(1)
    elif test['Kitchen Qual'][i]=='TA':
        test_kichen.append(2)
    elif test['Kitchen Qual'][i]=='Fa':
        test_kichen.append(3)
    else:
        test_kichen.append(4)
        
train['Kitchen Qual'] = train_kichen
test['Kitchen Qual'] = test_kichen

In [6]:
# One Hot encoding

train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [7]:
# X, y 정의
X = train.drop(['id', 'target'],axis=1)
y = train.target
X_t = test.drop(['id'], axis=1)

In [8]:
# scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_t_scaled = scaler.transform(X_t)

In [9]:
# 학습, 검증 데이터 분리

X_train, X_val, y_train, y_val = train_test_split(X_scaled,y,test_size=0.3,random_state=42)

In [18]:
# SGD reg 사용
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()
sgd.fit(X_train, y_train)
y_val_pred = sgd.predict(X_val)

In [10]:
# 평가식 정의
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [20]:
# 모델 학습 검증
NMAE(y_val, y_val_pred)

0.10656182796714901

In [22]:
# 제출 파일 생성
y_pred = sgd.predict(X_t_scaled)
sub['target'] = y_pred
sub.to_csv('./sample_submission.csv', index=False)

In [33]:
# 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

params = {'penalty':['l2', 'l1', 'elasticnet'], 'loss':['squared_error', 'huber','epsilon_insensitive','squared_epsilon_insensitive'],
         'alpha':np.arange(0,0.01,0.00005)}
grid = GridSearchCV(sgd, param_grid=params, n_jobs=-1,scoring=mean_absolute_error,cv=10, verbose=1)
grid.fit(X_train, y_train)

Fitting 10 folds for each of 2400 candidates, totalling 24000 fits




GridSearchCV(cv=10, estimator=SGDRegressor(), n_jobs=-1,
             param_grid={'alpha': array([0.00e+00, 5.00e-05, 1.00e-04, 1.50e-04, 2.00e-04, 2.50e-04,
       3.00e-04, 3.50e-04, 4.00e-04, 4.50e-04, 5.00e-04, 5.50e-04,
       6.00e-04, 6.50e-04, 7.00e-04, 7.50e-04, 8.00e-04, 8.50e-04,
       9.00e-04, 9.50e-04, 1.00e-03, 1.05e-03, 1.10e-03, 1.15e-03,
       1.20e-03, 1.25e-03, 1.30e-03, 1.35e-03, 1.40e-03, 1.45e-03...
       9.00e-03, 9.05e-03, 9.10e-03, 9.15e-03, 9.20e-03, 9.25e-03,
       9.30e-03, 9.35e-03, 9.40e-03, 9.45e-03, 9.50e-03, 9.55e-03,
       9.60e-03, 9.65e-03, 9.70e-03, 9.75e-03, 9.80e-03, 9.85e-03,
       9.90e-03, 9.95e-03]),
                         'loss': ['squared_error', 'huber',
                                  'epsilon_insensitive',
                                  'squared_epsilon_insensitive'],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             scoring=<function mean_absolute_error at 0x000002ADFC1D2E58>,
             verbos

In [34]:
grid.best_params_ # default값...

{'alpha': 0.0, 'loss': 'squared_error', 'penalty': 'l2'}

In [37]:
# 새로운 모델 생성 및 학습

sgd2 = SGDRegressor(alpha=0) # alpha를 아예 0으로 줌(default=0.0001)
sgd2.fit(X_train, y_train)
y_val_pred = sgd2.predict(X_val)

NMAE(y_val, y_val_pred)

0.10648460462217416

In [39]:
# 제출 파일 생성
y_pred = sgd2.predict(X_t_scaled)
sub['target'] = y_pred
sub.to_csv('./sample_submission2.csv', index=False) # 제출 점수: NMAE 0.1140391906

In [41]:
# 다른 모델 선택
# DT
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
y_val_pred = dt.predict(X_val)

NMAE(y_val, y_val_pred)

0.1323365796662401

In [11]:
# RF
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_val_pred = rf.predict(X_val)

NMAE(y_val, y_val_pred) # 오 성능 좋은데

0.09942843716480464

In [12]:
# 제출 파일 생성 - RF
y_pred = rf.predict(X_t_scaled)
sub['target'] = y_pred
sub.to_csv('./sample_submission3.csv', index=False) # 제출 점수: NMAE 0.1053759727

In [13]:
# RF 하이퍼 파라미터 튜닝

In [31]:
# max_depth 튜닝 -> 11 or 106 or 203 

for depth in range(3,30):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = depth )
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('depth = {}, NMAE = {}'.format(depth,NMAE(y_val, y_val_pred)))

depth = 3, NMAE = 0.12936213631368698
depth = 4, NMAE = 0.11427749375546589
depth = 5, NMAE = 0.10282244255366577
depth = 6, NMAE = 0.10029363735247898
depth = 7, NMAE = 0.09822006498004565
depth = 8, NMAE = 0.09743499891660537
depth = 9, NMAE = 0.0969205028605097
depth = 10, NMAE = 0.09691024865038443
depth = 11, NMAE = 0.096131816270191
depth = 12, NMAE = 0.09867870854899755
depth = 13, NMAE = 0.09690236090752563
depth = 14, NMAE = 0.0972625454699172
depth = 15, NMAE = 0.09715078936176738
depth = 16, NMAE = 0.097629384392278
depth = 17, NMAE = 0.0973514517651783
depth = 18, NMAE = 0.09671835862534069
depth = 19, NMAE = 0.09832978387311507
depth = 20, NMAE = 0.09704967369178534
depth = 21, NMAE = 0.09851691621719957
depth = 22, NMAE = 0.09782395576152932
depth = 23, NMAE = 0.09689896926868011
depth = 24, NMAE = 0.09690364798425254
depth = 25, NMAE = 0.09688630631887854
depth = 26, NMAE = 0.0969560509377287
depth = 27, NMAE = 0.09802472397724561
depth = 28, NMAE = 0.09639915900222157
d

In [32]:
for depth in range(100,111):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = depth )
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('depth = {}, NMAE = {}'.format(depth,NMAE(y_val, y_val_pred)))

depth = 100, NMAE = 0.09734943686782971
depth = 101, NMAE = 0.0973226507977965
depth = 102, NMAE = 0.09703212368382004
depth = 103, NMAE = 0.09659045471919271
depth = 104, NMAE = 0.09604716373463092
depth = 105, NMAE = 0.09750162102220078
depth = 106, NMAE = 0.09587289513429463
depth = 107, NMAE = 0.09682103305761527
depth = 108, NMAE = 0.09739221254252589
depth = 109, NMAE = 0.09703443685109138
depth = 110, NMAE = 0.09800696372311751


In [33]:
for depth in range(200,221):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = depth )
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('depth = {}, NMAE = {}'.format(depth,NMAE(y_val, y_val_pred)))

depth = 200, NMAE = 0.09682073222771738
depth = 201, NMAE = 0.09751444387198738
depth = 202, NMAE = 0.09730029499446753
depth = 203, NMAE = 0.09546182266422118
depth = 204, NMAE = 0.09702621289816563
depth = 205, NMAE = 0.09671707870341589
depth = 206, NMAE = 0.09777560439535918
depth = 207, NMAE = 0.09714241909641678
depth = 208, NMAE = 0.09838688444659538
depth = 209, NMAE = 0.09598972124036957
depth = 210, NMAE = 0.09721038866789589
depth = 211, NMAE = 0.09606596589863692
depth = 212, NMAE = 0.09716202305776162
depth = 213, NMAE = 0.09795587088614462
depth = 214, NMAE = 0.09712540733963244
depth = 215, NMAE = 0.0966026366562314
depth = 216, NMAE = 0.09623314494323182
depth = 217, NMAE = 0.09787556919241906
depth = 218, NMAE = 0.0971811032088107
depth = 219, NMAE = 0.09667874064714733
depth = 220, NMAE = 0.09718798475016244


In [35]:
# max_features 튜닝 -> log2 결정
for feature in ['auto', 'sqrt', 'log2']:
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = 203, max_features = feature)
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('feature = {}, NMAE = {}'.format(feature,NMAE(y_val, y_val_pred)))

feature = auto, NMAE = 0.09640615449528678
feature = sqrt, NMAE = 0.0948209639382868
feature = log2, NMAE = 0.091994318890063


In [39]:
# min_samples_split 튜닝 -> 9 결정
for n in range(2,51):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = 203, max_features = 'log2',
                               min_samples_split = n)
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('n= {}, NMAE = {}'.format(n,NMAE(y_val, y_val_pred)))

n= 2, NMAE = 0.09548448717429484
n= 3, NMAE = 0.0927526539557556
n= 4, NMAE = 0.0929193401066775
n= 5, NMAE = 0.09346709627862985
n= 6, NMAE = 0.09389443141738014
n= 7, NMAE = 0.09409705220176566
n= 8, NMAE = 0.09330072966511647
n= 9, NMAE = 0.09213736688703672
n= 10, NMAE = 0.0935987740476023
n= 11, NMAE = 0.09446793266845305
n= 12, NMAE = 0.09427156222744877
n= 13, NMAE = 0.09571440287300102
n= 14, NMAE = 0.09457859284468567
n= 15, NMAE = 0.0952828107581708
n= 16, NMAE = 0.09550004696569755
n= 17, NMAE = 0.09629495790580382
n= 18, NMAE = 0.09381233964463478
n= 19, NMAE = 0.09643338600099265
n= 20, NMAE = 0.09703679366922086
n= 21, NMAE = 0.09721490459530632
n= 22, NMAE = 0.09590295377150972
n= 23, NMAE = 0.09567721916859487
n= 24, NMAE = 0.09661448487753768
n= 25, NMAE = 0.09703256143956719
n= 26, NMAE = 0.09779299135260096
n= 27, NMAE = 0.09697990065064718
n= 28, NMAE = 0.09822227948981396
n= 29, NMAE = 0.09758629863238907
n= 30, NMAE = 0.09751027091560395
n= 31, NMAE = 0.1007485761

In [40]:
# min_samples_leaf 튜닝 -> 1 결정
for n in range(1,51):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = 203, max_features = 'log2',
                               min_samples_split = 9, min_samples_leaf = n)
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('n= {}, NMAE = {}'.format(n,NMAE(y_val, y_val_pred)))

n= 1, NMAE = 0.09431679370597948
n= 2, NMAE = 0.09526239928877316
n= 3, NMAE = 0.09502730279116711
n= 4, NMAE = 0.09527785753449579
n= 5, NMAE = 0.09433067553214503
n= 6, NMAE = 0.09695131209556088
n= 7, NMAE = 0.09716602187019574
n= 8, NMAE = 0.09958211031615495
n= 9, NMAE = 0.09994172275692516
n= 10, NMAE = 0.10026170632319766
n= 11, NMAE = 0.10110382379546762
n= 12, NMAE = 0.1023607542553187
n= 13, NMAE = 0.10063615106295076
n= 14, NMAE = 0.10320397901529305
n= 15, NMAE = 0.10481972294111211
n= 16, NMAE = 0.10272400553663527
n= 17, NMAE = 0.104348317502294
n= 18, NMAE = 0.10582469552870148
n= 19, NMAE = 0.10751811966807232
n= 20, NMAE = 0.10951869874757478
n= 21, NMAE = 0.10936356884951366
n= 22, NMAE = 0.10932079258405396
n= 23, NMAE = 0.1113695614230916
n= 24, NMAE = 0.11291863854221096
n= 25, NMAE = 0.11117458098294249
n= 26, NMAE = 0.11391127460964802
n= 27, NMAE = 0.11465947499636181
n= 28, NMAE = 0.11436494756022214
n= 29, NMAE = 0.11634210687134738
n= 30, NMAE = 0.11654537690

In [42]:
# min_impurity_decrease 튜닝 -> 0.8 결정
for n in np.arange(0,5,0.1):
    rf2 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', max_depth = 203, max_features = 'log2',
                               min_samples_split = 9, min_samples_leaf = 1, min_impurity_decrease = n)
    rf2.fit(X_train, y_train)
    y_val_pred = rf2.predict(X_val)
    print('n= {}, NMAE = {}'.format(n,NMAE(y_val, y_val_pred)))

n= 0.0, NMAE = 0.09435490956953495
n= 0.1, NMAE = 0.09379587867634581
n= 0.2, NMAE = 0.09376517617256545
n= 0.30000000000000004, NMAE = 0.09485072030151516
n= 0.4, NMAE = 0.09446946629047724
n= 0.5, NMAE = 0.09496719437818965
n= 0.6000000000000001, NMAE = 0.09276306826278713
n= 0.7000000000000001, NMAE = 0.09252383944019776
n= 0.8, NMAE = 0.09180366329815516
n= 0.9, NMAE = 0.09474892564736026
n= 1.0, NMAE = 0.09436089827032984
n= 1.1, NMAE = 0.09443545761163637
n= 1.2000000000000002, NMAE = 0.09232728153024548
n= 1.3, NMAE = 0.09455933073848126
n= 1.4000000000000001, NMAE = 0.09362935793920164
n= 1.5, NMAE = 0.0937652023630801
n= 1.6, NMAE = 0.09213385689857172
n= 1.7000000000000002, NMAE = 0.09511184944878158
n= 1.8, NMAE = 0.09347493932038431
n= 1.9000000000000001, NMAE = 0.09281381346826541
n= 2.0, NMAE = 0.09220695239358662
n= 2.1, NMAE = 0.09236979438082833
n= 2.2, NMAE = 0.09539868383145304
n= 2.3000000000000003, NMAE = 0.09339334858072022
n= 2.4000000000000004, NMAE = 0.09397794

In [56]:
# max_depth = 203ver.
rf2 = RandomForestRegressor(n_estimators=1000, criterion='absolute_error', max_depth = 203, max_features = 'log2',
                               min_samples_split = 9, min_samples_leaf = 1, min_impurity_decrease = 0.8)
rf2.fit(X_train, y_train)
y_val_pred = rf2.predict(X_val)
y_train_pred = rf2.predict(X_train)
print(NMAE(y_val, y_val_pred))
print(NMAE(y_train, y_train_pred))

0.09331686565440872
0.06459896689407055


In [70]:
# max_depth = 11ver.
rf2 = RandomForestRegressor(n_estimators=1000, criterion='absolute_error', max_depth = 11, max_features = 'log2',
                               min_samples_split = 9, min_samples_leaf = 1, min_impurity_decrease = 0.8)
rf2.fit(X_train, y_train)
y_val_pred = rf2.predict(X_val)
y_train_pred = rf2.predict(X_train)
print(NMAE(y_val, y_val_pred))
print(NMAE(y_train, y_train_pred))

0.09330387959636144
0.06815443330063535


In [47]:
# 제출 파일 생성 - max_depth = 203ver.
y_pred = rf2.predict(X_t_scaled)
sub['target'] = y_pred
sub.to_csv('./sample_submission4.csv', index=False) # 제출 점수: NMAE 0.1030980356