In [2]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
train_dataset = pd.read_csv("./resources/train.csv")
test_dataset = pd.read_csv("./resources/test.csv")

train_label_dataset = train_dataset['SalePrice'] 

In [4]:
train_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [275]:
train_dataset[['SalePrice']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SalePrice,1460.0,180921.19589,79442.502883,34900.0,129975.0,163000.0,214000.0,755000.0


In [276]:
remove_columns = set()
remove_columns.add('SalePrice')

In [277]:
def sale_price_by_column(column: str, is_remove: bool = False):
    if is_remove:
        remove_columns.add(column)
    return train_dataset[[column,'SalePrice']].groupby([column]).describe()

In [278]:
# 차도 유형 - 포장 or 비포장
# 특성 제거 : Grvl 이 6개밖에 없어서 비율이 현저히 적다고 판단 
sale_price_by_column('Street', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Street,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Grvl,6.0,130190.5,65446.253991,55993.0,88250.0,114250.0,169650.0,228950.0
Pave,1454.0,181130.538514,79446.597317,34900.0,130000.0,163000.0,214000.0,755000.0


In [279]:
# 골목길 유형 - 포장 or 비포장
sale_price_by_column('Alley')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Alley,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Grvl,50.0,122219.08,34780.780734,52500.0,105312.5,119500.0,136750.0,256000.0
Pave,41.0,168000.585366,38370.375243,40000.0,151000.0,172500.0,185000.0,265979.0


In [280]:
# LotShape
# 특성 제거 : IR1 min이 52000이고, max가 75만이기 때문에 해당 특성이 price에 영향을 준다는 신뢰가 낮다고 판단
sale_price_by_column('LotShape', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
LotShape,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
IR1,484.0,206101.665289,85858.489559,52000.0,150000.0,189000.0,239000.0,755000.0
IR2,41.0,239833.365854,99669.427362,110000.0,175000.0,221000.0,250000.0,538000.0
IR3,10.0,216036.5,82540.334855,73000.0,167875.0,203570.0,265000.0,375000.0
Reg,925.0,164754.818378,69673.427215,34900.0,120000.0,146000.0,188000.0,582933.0


In [281]:
# LandContour - 평탄함 정도
sale_price_by_column('LandContour')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
LandContour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bnk,63.0,143104.079365,49361.244074,52500.0,113000.0,139400.0,171250.0,315000.0
HLS,50.0,231533.94,101790.139741,82500.0,151750.0,222250.0,281347.25,538000.0
Low,36.0,203661.111111,83935.35362,39300.0,143000.0,190000.0,263750.0,385000.0
Lvl,1311.0,180183.746758,78463.567918,34900.0,130000.0,162900.0,212000.0,755000.0


In [282]:
# Utilities 
# 특성 제거 : NoSeWa 카운트가 1밖에 안돼서 너무 적다고 판단
sale_price_by_column('Utilities', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Utilities,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AllPub,1459.0,180950.95682,79461.599814,34900.0,129950.0,163000.0,214000.0,755000.0
NoSeWa,1.0,137500.0,,137500.0,137500.0,137500.0,137500.0,137500.0


In [283]:
# LotConfig 
# 특성 제거 : min과 Max 차이 
sale_price_by_column('LotConfig', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
LotConfig,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Corner,263.0,181623.425856,84466.041222,52500.0,129250.0,160000.0,220000.0,755000.0
CulDSac,94.0,223854.617021,93117.546586,84000.0,156475.0,199262.0,269342.5,625000.0
FR2,47.0,177934.574468,62788.926829,81000.0,143500.0,165000.0,194500.0,394617.0
FR3,4.0,208475.0,78379.222374,128000.0,169925.0,195450.0,234000.0,315000.0
Inside,1052.0,176938.047529,76426.805787,34900.0,128000.0,159697.5,207125.0,611657.0


In [284]:
# LandSlope - 경사도
# 특성 제거 : min과 Max 차이 
sale_price_by_column('LandSlope', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
LandSlope,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Gtl,1382.0,179956.799566,78669.369151,34900.0,129900.0,161875.0,212000.0,755000.0
Mod,65.0,196734.138462,92375.358116,39300.0,130000.0,186700.0,259500.0,538000.0
Sev,13.0,204379.230769,86729.922409,61000.0,143000.0,185000.0,260000.0,375000.0


In [285]:
# Neighborhood - 중간값 확인해보면 max 가 높은 순으로 정렬돼있음 
sale_price_by_column('Neighborhood')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Neighborhood,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Blmngtn,17.0,194870.882353,30393.229219,159895.0,174000.0,191000.0,213490.0,264561.0
Blueste,2.0,137500.0,19091.883092,124000.0,130750.0,137500.0,144250.0,151000.0
BrDale,16.0,104493.75,14330.176493,83000.0,91000.0,106000.0,118000.0,125000.0
BrkSide,58.0,124834.051724,40348.68927,39300.0,100500.0,124300.0,141175.0,223500.0
ClearCr,28.0,212565.428571,50231.538993,130000.0,183750.0,200250.0,242225.0,328000.0
CollgCr,150.0,197965.773333,51403.666438,110000.0,152958.75,197200.0,225725.0,424870.0
Crawfor,51.0,210624.72549,68866.395472,90350.0,159250.0,200624.0,239000.0,392500.0
Edwards,100.0,128219.7,43208.616459,58500.0,101500.0,121750.0,145225.0,320000.0
Gilbert,79.0,192854.506329,35986.779085,141000.0,174000.0,181000.0,197200.0,377500.0
IDOTRR,37.0,100123.783784,33376.710117,34900.0,81000.0,103000.0,120500.0,169500.0


In [286]:
# Condition1 
# 특성 제거 : 중간값 및 max 값 랭크가 불일치
sale_price_by_column('Condition1', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Condition1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Artery,48.0,135091.666667,66226.660548,66500.0,105000.0,119550.0,143000.0,475000.0
Feedr,81.0,142475.481481,42157.290117,40000.0,121600.0,140000.0,167500.0,244600.0
Norm,1260.0,184495.492063,81256.980125,34900.0,131500.0,166500.0,219500.0,755000.0
PosA,8.0,225875.0,52348.932313,180000.0,188750.0,212500.0,244000.0,335000.0
PosN,19.0,215184.210526,65256.536609,109500.0,167250.0,200000.0,252250.0,385000.0
RRAe,11.0,138400.0,24030.813553,87000.0,127750.0,142500.0,156500.0,171000.0
RRAn,26.0,184396.615385,66177.92207,79500.0,152393.75,171495.0,190105.0,423000.0
RRNe,2.0,190750.0,5303.300859,187000.0,188875.0,190750.0,192625.0,194500.0
RRNn,5.0,212400.0,93823.23806,110000.0,128000.0,214000.0,290000.0,320000.0


In [287]:
# Condition2 
# 특성 제거 : Norm 제외 개수가 너무 없음 
sale_price_by_column('Condition2', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Condition2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Artery,2.0,106500.0,16263.455967,95000.0,100750.0,106500.0,112250.0,118000.0
Feedr,6.0,121166.666667,33544.994659,79500.0,95500.0,127500.0,137000.0,167500.0
Norm,1445.0,181169.405536,79337.735547,34900.0,130000.0,163500.0,214000.0,755000.0
PosA,1.0,325000.0,,325000.0,325000.0,325000.0,325000.0,325000.0
PosN,2.0,284875.0,141598.132933,184750.0,234812.5,284875.0,334937.5,385000.0
RRAe,1.0,190000.0,,190000.0,190000.0,190000.0,190000.0,190000.0
RRAn,1.0,136905.0,,136905.0,136905.0,136905.0,136905.0,136905.0
RRNn,2.0,96750.0,39951.533137,68500.0,82625.0,96750.0,110875.0,125000.0


In [288]:
# BldgType -
sale_price_by_column('BldgType')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
BldgType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1Fam,1220.0,185763.807377,82648.502922,34900.0,131475.0,167900.0,222000.0,755000.0
2fmCon,31.0,128432.258065,35458.545158,55000.0,106875.0,127500.0,142500.0,228950.0
Duplex,52.0,133541.076923,27833.249197,82000.0,118375.0,135980.0,145000.0,206300.0
Twnhs,43.0,135911.627907,41013.22208,75000.0,95750.0,137500.0,168750.0,230000.0
TwnhsE,114.0,181959.342105,60626.108918,75500.0,143187.5,172200.0,207375.0,392500.0


In [289]:
# HouseStyle - 
sale_price_by_column('HouseStyle')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
HouseStyle,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1.5Fin,154.0,143116.74026,54277.941119,37900.0,114625.0,132000.0,159325.5,410000.0
1.5Unf,14.0,110150.0,19036.47309,76000.0,98175.0,111250.0,120500.0,139400.0
1Story,726.0,175985.477961,77055.715011,34900.0,127000.0,154750.0,209350.0,611657.0
2.5Fin,8.0,220000.0,118211.976671,104000.0,164250.0,194000.0,223750.0,475000.0
2.5Unf,11.0,157354.545455,63934.128032,101000.0,125000.0,133900.0,163500.0,325000.0
2Story,445.0,210051.764045,87339.21357,40000.0,159500.0,190000.0,240000.0,755000.0
SFoyer,37.0,135074.486486,30480.898192,75500.0,127500.0,135960.0,148000.0,206300.0
SLvl,65.0,166703.384615,38305.161339,91000.0,145000.0,164500.0,178000.0,345000.0


In [290]:
# RoofStyle
# 특성 제거 : min과 Max 차이 
sale_price_by_column('RoofStyle', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
RoofStyle,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Flat,13.0,194690.0,62522.963515,82000.0,143000.0,185000.0,242000.0,274970.0
Gable,1141.0,171483.956179,66331.237296,34900.0,128000.0,160000.0,202500.0,755000.0
Gambrel,11.0,148909.090909,67013.527807,40000.0,105500.0,139000.0,193750.0,259500.0
Hip,286.0,218876.933566,111549.603563,55000.0,139675.0,176500.0,277875.0,745000.0
Mansard,7.0,180568.428571,58057.624627,100000.0,145500.0,175000.0,216000.0,265979.0
Shed,2.0,225000.0,49497.474683,190000.0,207500.0,225000.0,242500.0,260000.0


In [291]:
# RoofMatl
# 특성 제거 : WdShngl 제외 개수가 너무 없음 
sale_price_by_column('RoofMatl', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
RoofMatl,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ClyTile,1.0,160000.0,,160000.0,160000.0,160000.0,160000.0,160000.0
CompShg,1434.0,179803.679219,77722.388636,34900.0,129900.0,162000.0,213000.0,745000.0
Membran,1.0,241500.0,,241500.0,241500.0,241500.0,241500.0,241500.0
Metal,1.0,180000.0,,180000.0,180000.0,180000.0,180000.0,180000.0
Roll,1.0,137000.0,,137000.0,137000.0,137000.0,137000.0,137000.0
Tar&Grv,11.0,185406.363636,65430.14172,82000.0,136000.0,167000.0,249000.0,274970.0
WdShake,5.0,241400.0,36218.779659,190000.0,228000.0,242000.0,260000.0,287000.0
WdShngl,6.0,390250.0,206969.019421,168500.0,278500.0,332500.0,452500.0,755000.0


In [292]:
# Exterior1st - 중간값 비슷
sale_price_by_column('Exterior1st')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Exterior1st,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AsbShng,20.0,107385.55,33756.48066,35311.0,85750.0,108000.0,133500.0,165500.0
AsphShn,1.0,100000.0,,100000.0,100000.0,100000.0,100000.0,100000.0
BrkComm,2.0,71000.0,15556.349186,60000.0,65500.0,71000.0,76500.0,82000.0
BrkFace,50.0,194573.0,82841.91437,40000.0,134525.0,165750.0,245125.0,430000.0
CBlock,1.0,105000.0,,105000.0,105000.0,105000.0,105000.0,105000.0
CemntBd,61.0,231690.655738,120575.621318,75000.0,119500.0,236500.0,303477.0,556581.0
HdBoard,222.0,163077.45045,66305.714164,83000.0,129625.0,149900.0,179900.0,755000.0
ImStucc,1.0,262000.0,,262000.0,262000.0,262000.0,262000.0,262000.0
MetalSd,220.0,149422.177273,54776.40899,62383.0,117750.0,139000.0,164775.0,392000.0
Plywood,108.0,175942.37963,49497.383293,82500.0,143437.5,167450.0,197500.0,345000.0


In [293]:
# Exterior2nd
# 특성 제거 : 카운트가 비교적 적은 것이 영향을 많이 주고, 표준편차 큼
sale_price_by_column('Exterior2nd', True)

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Exterior2nd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AsbShng,20.0,114060.55,42315.043608,35311.0,94000.0,111000.0,135500.0,225000.0
AsphShn,3.0,138000.0,37509.998667,100000.0,119500.0,139000.0,157000.0,175000.0
Brk Cmn,7.0,126714.285714,38693.084161,60000.0,114000.0,147000.0,148500.0,155000.0
BrkFace,25.0,195818.0,95097.551756,40000.0,137000.0,160000.0,250000.0,430000.0
CBlock,1.0,105000.0,,105000.0,105000.0,105000.0,105000.0,105000.0
CmentBd,60.0,230093.833333,116140.396221,75000.0,118375.0,238750.0,304082.75,556581.0
HdBoard,207.0,167661.565217,70061.078357,83000.0,130500.0,155000.0,182500.0,755000.0
ImStucc,10.0,252070.0,193176.672447,88000.0,131250.0,187600.0,305500.0,745000.0
MetalSd,214.0,149803.172897,55078.59531,62383.0,118625.0,138750.0,164525.0,392000.0
Other,1.0,319000.0,,319000.0,319000.0,319000.0,319000.0,319000.0


In [294]:
# MasVnrType
sale_price_by_column('MasVnrType')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
MasVnrType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BrkCmn,15.0,146318.066667,46187.583632,89471.0,114250.0,139000.0,163950.0,277000.0
BrkFace,445.0,204691.87191,81214.293554,75000.0,149300.0,181000.0,236000.0,755000.0
Stone,128.0,265583.625,99940.156577,119000.0,194650.0,246839.0,312779.0,611657.0


In [295]:
# ExterQual - max, min , 중간값 명확함
sale_price_by_column('ExterQual')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
ExterQual,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Ex,52.0,367360.961538,116401.2642,160000.0,311404.0,364606.5,428788.5,755000.0
Fa,14.0,87985.214286,39826.918794,39300.0,60250.0,82250.0,102000.0,200000.0
Gd,488.0,231633.510246,71188.873899,52000.0,185000.0,220000.0,265984.25,745000.0
TA,906.0,144341.313466,42471.815703,34900.0,118589.5,139450.0,165500.0,381000.0


In [296]:
# ExterCond
sale_price_by_column('ExterCond')

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
ExterCond,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Ex,3.0,201333.333333,109235.220205,118000.0,139500.0,161000.0,243000.0,325000.0
Fa,28.0,102595.142857,40094.38394,39300.0,65500.0,95750.0,137750.0,169500.0
Gd,146.0,168897.568493,72608.303632,68400.0,128625.0,151250.0,187375.0,625000.0
Po,1.0,76500.0,,76500.0,76500.0,76500.0,76500.0,76500.0
TA,1282.0,184034.896256,79806.257233,34900.0,131100.0,167370.0,217334.25,755000.0


In [297]:
from sklearn.model_selection import train_test_split

##########데이터 로드

x_data = np.array([
    [2, 1],
    [3, 2],
    [3, 4],
    [5, 5],
    [7, 5],
    [2, 5],
    [8, 9],
    [9, 10],
    [6, 12],
    [9, 2],
    [6, 10],
    [2, 4]
])
y_data = np.array([3, 5, 7, 10, 12, 7, 13, 13, 12, 13, 12, 6])

##########데이터 분석

##########데이터 전처리

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=777)

##########모델 생성

model = GradientBoostingRegressor()

##########모델 학습

model.fit(x_train, y_train)

##########모델 검증

print(model.score(x_train, y_train)) #

print(model.score(x_test, y_test)) #0.7421680021828538

##########모델 예측

x_test = np.array([
    [4, 6]
])

y_predict = model.predict(x_test)

print(y_predict[0]) #6.497766320678856

0.9999999992944921
0.7309323996910557
6.449858454393457


### Encoding

In [314]:
from sklearn.preprocessing import LabelEncoder


label_encoder= LabelEncoder()
all_dataset = train_dataset._append(test_dataset)

for column in all_dataset.columns:
    if(column == 'Id'):
        continue
    encoded = label_encoder.fit_transform(all_dataset[column])
    all_dataset[column] = encoded
    

In [315]:
train_length = 1460
encoded_train_dataset = all_dataset.iloc[:train_length]
encoded_test_dataset = all_dataset.iloc[train_length:]

In [316]:
filtered_train_dataset = encoded_train_dataset.drop(columns=list(remove_columns))
filtered_test_dataset = encoded_test_dataset.drop(columns=list(remove_columns))

In [317]:
encoded_test_dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,0,2,56,1329,1,2,3,3,0,...,0,3,2,4,0,5,4,8,4,663
1,1462,0,3,57,1671,1,2,0,3,0,...,0,3,4,0,35,5,4,8,4,663
2,1463,5,3,50,1635,1,2,0,3,0,...,0,3,2,4,0,2,4,8,4,663
3,1464,5,3,54,979,1,2,0,3,0,...,0,3,4,4,0,5,4,8,4,663
4,1465,11,3,19,178,1,2,0,1,0,...,0,3,4,4,0,0,4,8,4,663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,13,4,0,23,1,2,3,3,0,...,0,3,4,4,0,5,0,8,4,663
1455,2916,13,4,0,20,1,2,3,3,0,...,0,3,4,4,0,3,0,8,0,663
1456,2917,0,3,121,1863,1,2,3,3,0,...,0,3,4,4,0,8,0,8,0,663
1457,2918,9,3,38,1090,1,2,3,3,0,...,0,3,2,2,17,6,0,8,4,663


### 결측값 채우기

In [318]:
filtered_train_dataset.fillna('')
encoded_test_dataset.fillna('')

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,0,2,56,1329,1,2,3,3,0,...,0,3,2,4,0,5,4,8,4,663
1,1462,0,3,57,1671,1,2,0,3,0,...,0,3,4,0,35,5,4,8,4,663
2,1463,5,3,50,1635,1,2,0,3,0,...,0,3,2,4,0,2,4,8,4,663
3,1464,5,3,54,979,1,2,0,3,0,...,0,3,4,4,0,5,4,8,4,663
4,1465,11,3,19,178,1,2,0,1,0,...,0,3,4,4,0,0,4,8,4,663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,13,4,0,23,1,2,3,3,0,...,0,3,4,4,0,5,0,8,4,663
1455,2916,13,4,0,20,1,2,3,3,0,...,0,3,4,4,0,3,0,8,0,663
1456,2917,0,3,121,1863,1,2,3,3,0,...,0,3,4,4,0,8,0,8,0,663
1457,2918,9,3,38,1090,1,2,3,3,0,...,0,3,2,2,17,6,0,8,4,663


In [319]:
model = GradientBoostingRegressor()

model.fit(filtered_train_dataset, train_label_dataset)
# print(model.score(x_train, y_train))

In [320]:
model.predict(filtered_test_dataset)

array([126112.61035801, 162544.82095622, 180373.09477691, ...,
       169584.50897565, 111628.36749524, 241594.90305487])