In [1]:
import os
import pandas as pd
import warnings 
import numpy as np
from matplotlib import rc, rcParams
import matplotlib.pyplot as plt

warnings.filterwarnings(action='ignore')
rc('font',family='AppleGothic')
rcParams['axes.unicode_minus']=False

In [5]:
path = '../data/'

train = pd.read_csv(path+'5_train.csv')
test = pd.read_csv(path+'5_test.csv')

train.shape, test.shape

((1437, 91), (518, 89))

In [6]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '공가수', '자격유형', '버스정류장',
       '단지내주차면수', 'ERROR1', '임대건물구분_lb', '상가', '아파트', '지역_lb', '강원도', '경기도',
       '경상남도', '경상북도', '광주광역시', '대구광역시', '대전광역시', '부산광역시', '서울특별시', '세종특별자치시',
       '울산광역시', '전라남도', '전라북도', '제주특별자치도', '충청남도', '충청북도', '공급유형_lb', '공공분양',
       '공공임대(10년)', '공공임대(50년)', '공공임대(5년)', '공공임대(분납)', '국민임대', '영구임대',
       '임대상가', '장기전세', '행복주택', '자격유형_lb', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'ERROR1_lb', '나머지', '분양상가',
       '분양아파트', '총세대수_lb', '총세대수_lb_A1', '총세대수_lb_B1', '총세대수_lb_C1',
       '총세대수_lb_D1', '총세대수_lb_E1', '전용면적_5', '전용면적_15', '전용면적_20', '전용면적_25',
       '전용면적_30', '전용면적_35', '전용면적_40', '전용면적_45', '전용면적_50', '전용면적_55',
       '전용면적_60', '전용면적_65', '전용면적_70', '전용면적_75', '전용면적_80', '전용면적_100',
       '지역_cat', '등록차량수', '전용면적별세대수', '면적_세대수_곱', '면적_세대수_곱_총합', '면적_세대수_비율',
       '전용면적_비례_차량수'],
      dtype='object')

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import cross_val_score

In [21]:
sel = ['단지내주차면수', '전용면적별세대수', '면적_세대수_비율', '면적_세대수_곱_총합']
X = train[sel]
y = train[['전용면적_비례_차량수']]

# 피처스케일링
scaler = MinMaxScaler() # MinMaxScaler StandardScaler 
X_nor = scaler.fit_transform(X)
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X_nor)  

# X_all = np.concatenate((X_poly, X_nor[:,6:]), axis=1)

# 모델 
lr = LinearRegression()
rf = RandomForestRegressor()
lasso = Lasso()
ridge = Ridge()

# 회귀는 교차검증
rf_scores = cross_val_score(rf, X_poly, y,scoring='neg_mean_absolute_error', cv=5)
lr_scores = cross_val_score(lr, X_poly, y,scoring='neg_mean_absolute_error', cv=5)
lasso_scores = cross_val_score(lasso, X_poly, y,scoring='neg_mean_absolute_error', cv=5)
ridge_scores = cross_val_score(ridge, X_poly, y,scoring='neg_mean_absolute_error', cv=5)


rf_score = np.abs(rf_scores.mean())
lr_score = np.abs(lr_scores.mean())
lasso_score = np.abs(lasso_scores.mean())
ridge_score = np.abs(ridge_scores.mean())


print("RandomForestRegressor : ", rf_score)
print("LinearRegression :", lr_score)
print("Lasso (alpha=defalt) : ", lasso_score)
print("Ridge (alpha=defalt) : ", ridge_score)


RandomForestRegressor :  38.42388146996767
LinearRegression : 39.23168482944034
Lasso (alpha=defalt) :  42.2018215302114
Ridge (alpha=defalt) :  41.507053807807516


In [22]:
sel = ['단지내주차면수', '전용면적별세대수', '면적_세대수_비율', '면적_세대수_곱_총합', 
      '상가','아파트']
X = train[sel]
y = train[['전용면적_비례_차량수']]

# 피처스케일링
scaler = MinMaxScaler() # MinMaxScaler StandardScaler 
X_nor = scaler.fit_transform(X)
tmp_X_nor = X_nor[:,:4]
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(tmp_X_nor)  

X_all = np.concatenate((X_poly, X_nor[:,4:]), axis=1)

# 모델 
lr = LinearRegression()
rf = RandomForestRegressor()
lasso = Lasso()
ridge = Ridge()

# 회귀는 교차검증
rf_scores = cross_val_score(rf, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lr_scores = cross_val_score(lr, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lasso_scores = cross_val_score(lasso, X_all, y,scoring='neg_mean_absolute_error', cv=5)
ridge_scores = cross_val_score(ridge, X_all, y,scoring='neg_mean_absolute_error', cv=5)


rf_score = np.abs(rf_scores.mean())
lr_score = np.abs(lr_scores.mean())
lasso_score = np.abs(lasso_scores.mean())
ridge_score = np.abs(ridge_scores.mean())


print("RandomForestRegressor : ", rf_score)
print("LinearRegression :", lr_score)
print("Lasso (alpha=defalt) : ", lasso_score)
print("Ridge (alpha=defalt) : ", ridge_score)


RandomForestRegressor :  38.43752415714088
LinearRegression : 39.04197667137092
Lasso (alpha=defalt) :  42.20213487950447
Ridge (alpha=defalt) :  41.31244823363296


In [23]:
sel = ['단지내주차면수', '전용면적별세대수', '면적_세대수_비율', '면적_세대수_곱_총합', 
      '상가','아파트',
      '공공분양']
X = train[sel]
y = train[['전용면적_비례_차량수']]

# 피처스케일링
scaler = MinMaxScaler() # MinMaxScaler StandardScaler 
X_nor = scaler.fit_transform(X)
tmp_X_nor = X_nor[:,:4]
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(tmp_X_nor)  

X_all = np.concatenate((X_poly, X_nor[:,4:]), axis=1)

# 모델 
lr = LinearRegression()
rf = RandomForestRegressor()
lasso = Lasso()
ridge = Ridge()

# 회귀는 교차검증
rf_scores = cross_val_score(rf, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lr_scores = cross_val_score(lr, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lasso_scores = cross_val_score(lasso, X_all, y,scoring='neg_mean_absolute_error', cv=5)
ridge_scores = cross_val_score(ridge, X_all, y,scoring='neg_mean_absolute_error', cv=5)


rf_score = np.abs(rf_scores.mean())
lr_score = np.abs(lr_scores.mean())
lasso_score = np.abs(lasso_scores.mean())
ridge_score = np.abs(ridge_scores.mean())


print("RandomForestRegressor : ", rf_score)
print("LinearRegression :", lr_score)
print("Lasso (alpha=defalt) : ", lasso_score)
print("Ridge (alpha=defalt) : ", ridge_score)


RandomForestRegressor :  38.32662095070479
LinearRegression : 38.893508699604425
Lasso (alpha=defalt) :  42.20213487950447
Ridge (alpha=defalt) :  41.2534070621646


In [34]:
train['총세대수_lb'].shape

(1437,)

In [37]:
# train['총세대수_lb'].unique()

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
lbs = encoder.fit_transform(train['총세대수_lb'])
train['총세대수lb'] = lbs
train['총세대수lb'].dtype

dtype('int64')

In [38]:
sel = ['단지내주차면수', '전용면적별세대수', '면적_세대수_비율', '면적_세대수_곱_총합', 
      '상가','아파트',
      '공공분양','총세대수lb']

X = train[sel]
y = train[['전용면적_비례_차량수']]

# 피처스케일링
scaler = MinMaxScaler() # MinMaxScaler StandardScaler 
X_nor = scaler.fit_transform(X)
tmp_X_nor = X_nor[:,:4]
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(tmp_X_nor)  

X_all = np.concatenate((X_poly, X_nor[:,4:]), axis=1)

# 모델 
lr = LinearRegression()
rf = RandomForestRegressor()
lasso = Lasso()
ridge = Ridge()

# 회귀는 교차검증
rf_scores = cross_val_score(rf, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lr_scores = cross_val_score(lr, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lasso_scores = cross_val_score(lasso, X_all, y,scoring='neg_mean_absolute_error', cv=5)
ridge_scores = cross_val_score(ridge, X_all, y,scoring='neg_mean_absolute_error', cv=5)


rf_score = np.abs(rf_scores.mean())
lr_score = np.abs(lr_scores.mean())
lasso_score = np.abs(lasso_scores.mean())
ridge_score = np.abs(ridge_scores.mean())


print("RandomForestRegressor : ", rf_score)
print("LinearRegression :", lr_score)
print("Lasso (alpha=defalt) : ", lasso_score)
print("Ridge (alpha=defalt) : ", ridge_score)


RandomForestRegressor :  38.44793215328566
LinearRegression : 39.01854796859032
Lasso (alpha=defalt) :  42.20190381817118
Ridge (alpha=defalt) :  41.385639428783925


In [39]:
sel = ['단지내주차면수', '전용면적별세대수', '면적_세대수_비율', '면적_세대수_곱_총합', 
      '상가','아파트',
      '공공분양', '총세대수_lb_A1', '총세대수_lb_B1', '총세대수_lb_C1',
       '총세대수_lb_D1', '총세대수_lb_E1']

X = train[sel]
y = train[['전용면적_비례_차량수']]

# 피처스케일링
scaler = MinMaxScaler() # MinMaxScaler StandardScaler 
X_nor = scaler.fit_transform(X)
tmp_X_nor = X_nor[:,:4]
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(tmp_X_nor)  

X_all = np.concatenate((X_poly, X_nor[:,4:]), axis=1)

# 모델 
lr = LinearRegression()
rf = RandomForestRegressor()
lasso = Lasso()
ridge = Ridge()

# 회귀는 교차검증
rf_scores = cross_val_score(rf, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lr_scores = cross_val_score(lr, X_all, y,scoring='neg_mean_absolute_error', cv=5)
lasso_scores = cross_val_score(lasso, X_all, y,scoring='neg_mean_absolute_error', cv=5)
ridge_scores = cross_val_score(ridge, X_all, y,scoring='neg_mean_absolute_error', cv=5)


rf_score = np.abs(rf_scores.mean())
lr_score = np.abs(lr_scores.mean())
lasso_score = np.abs(lasso_scores.mean())
ridge_score = np.abs(ridge_scores.mean())


print("RandomForestRegressor : ", rf_score)
print("LinearRegression :", lr_score)
print("Lasso (alpha=defalt) : ", lasso_score)
print("Ridge (alpha=defalt) : ", ridge_score)


RandomForestRegressor :  38.393839834869894
LinearRegression : 5010119289.548132
Lasso (alpha=defalt) :  42.142698275384475
Ridge (alpha=defalt) :  41.90796763749767
