#### 이번 대회에서는 범주형 변수 전처리를 위해 One-Hot Encoding과 for문을 사용했습니다.

#### 이는 train data로 fit한 One-Hot Encoder로 test data를 transform할 경우,
#### train data에 없는 범주(category) 값이 test data에 존재하면 에러가 발생할 수 있기 때문입니다.

#### 이를 방지하기 위해 예외적인 상황에 대처할 수 있는 코드를 삽입해서 이중 for문을 작성했습니다.
#### 참고해 주시길 바랍니다.

# Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge

# Fixed RandomSeed

In [2]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Value handed to Python's ``random`` module and NumPy's global
            generator, and exported (as a string) through ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # Exported as an environment variable; it is stored as text.
    os.environ.update({'PYTHONHASHSEED': f'{seed}'})
    np.random.seed(seed)


# Fix the seed once, up front, so that later cells are repeatable.
seed_everything(42)

# Data Load

In [3]:
# Load the training data from the working directory. Later cells read the
# 'ID' and 'monthlyRent(us_dollar)' columns from this frame.
train_df = pd.read_csv('./train.csv')

In [4]:
# Display the loaded training frame as the cell output (inspection only).
train_df

Unnamed: 0,ID,propertyType,bedrooms,latitude,longitude,suburbName,distanceMetro(km),distanceAirport(km),distanceHospital(km),distanceRailway(km),area(square_meters),monthlyRent(us_dollar)
0,Train_0000,Apartment,3,28.638710,77.295822,Delhi East,0.312579,22.646032,11.726966,7.352495,83.61,307
1,Train_0001,Independent Floor,1,28.498940,77.207191,Delhi South,2.486167,13.500583,7.527761,15.877066,83.61,110
2,Train_0002,Independent Floor,3,28.714123,77.154404,Delhi North,1.528794,18.918243,17.135939,10.315737,78.97,369
3,Train_0003,Independent Floor,3,28.704330,77.149956,Other,0.967121,17.749252,16.251937,9.797817,162.58,676
4,Train_0004,Apartment,4,28.577915,77.049446,Dwarka,0.834506,4.288189,15.541840,18.179806,218.32,418
...,...,...,...,...,...,...,...,...,...,...,...,...
8687,Train_8687,Apartment,1,28.602234,77.026001,Dwarka,0.005681,7.776390,18.212199,19.535831,46.45,159
8688,Train_8688,Apartment,1,28.644989,77.169296,Delhi Central,0.007987,12.969368,9.442664,5.039023,81.29,172
8689,Train_8689,Independent Floor,3,28.547377,77.259155,Delhi South,0.203502,17.094466,5.468956,11.109941,148.64,738
8690,Train_8690,Independent Floor,1,28.630501,77.277382,Delhi East,0.248603,20.628700,9.801128,5.679541,41.90,184


In [5]:
# Split the training frame into the regression target and the predictors.
# 'ID' is dropped together with the target so that only feature columns remain.
target_col = 'monthlyRent(us_dollar)'
train_y = train_df[target_col]
train_x = train_df.drop(columns=['ID', target_col])

In [6]:
# Load the test data and drop the 'ID' column, mirroring what is done for train_x.
raw_test = pd.read_csv('./test.csv')
test_x = raw_test.drop(columns=['ID'])

In [7]:
# Display the test feature frame as the cell output (inspection only).
test_x

Unnamed: 0,propertyType,bedrooms,latitude,longitude,suburbName,distanceMetro(km),distanceAirport(km),distanceHospital(km),distanceRailway(km),area(square_meters)
0,Independent House,1,28.644199,77.162407,Delhi Central,0.675741,12.480545,9.653289,5.706151,55.74
1,Independent Floor,3,28.724508,77.087051,West Delhi,1.992787,18.799390,21.086929,15.963967,88.26
2,Independent Floor,3,28.672693,77.103973,West Delhi,0.678442,13.190977,15.514042,11.916831,111.48
3,Independent Floor,1,28.645021,77.169235,Delhi Central,0.003422,12.968327,9.448341,5.045234,59.92
4,Independent Floor,1,28.607435,77.289627,Delhi East,0.299808,20.878677,9.180785,7.736478,27.87
...,...,...,...,...,...,...,...,...,...,...
8688,Apartment,3,28.586784,77.071671,Dwarka,1.060178,3.756045,13.500350,15.794588,148.64
8689,Independent Floor,3,28.646486,77.165459,Delhi Central,0.403543,12.863598,9.748500,5.426976,88.26
8690,Apartment,2,28.727852,77.086617,West Delhi,2.138730,19.169468,21.418370,16.214014,83.61
8691,Apartment,1,28.698778,77.145348,North Delhi,0.449243,17.014223,15.844727,9.709162,51.10


# Data Pre-processing

#### 범주형 변수는 크게 명목형 변수와 순서형 변수로 나눌 수 있습니다.
#### 순서형 변수의 경우 그 순서대로 수치값을 레이블로 부여하여 간단히 수치화 할 수 있지만,
#### 명목형 변수의 경우 값들의 순서 관계가 없어 수치 레이블링으로는 그 관계를 정확히 표현할 수 없습니다.
#### 그렇기에 명목형 변수의 경우 값들 각각을 새로운 컬럼으로 만들고, 원래 해당하던 값에는 1을,
#### 아닐 경우 0을 부여하는 One-Hot Encoding 방법이 존재합니다. 

In [8]:
# # qualitative column one-hot encoding
# qual_col = ['propertyType','suburbName']
# ohe = OneHotEncoder(sparse=False)

# for i in qual_col:
#     train_x = pd.concat([train_x, pd.DataFrame(ohe.fit_transform(train_x[[i]]), columns=ohe.categories_[0])], axis=1)
    
#     for qual_value in np.unique(test_x[i]): 
#         if qual_value not in np.unique(ohe.categories_): 
#             ohe.categories_ = np.append(ohe.categories_, qual_value)
#     # One Hot Encoder가 Test 데이터로부터 Fitting되는 것은 Data Leakage이므로, Test 데이터에는 Train 데이터로 Fitting된 One Hot Encoder로부터 transform만 수행되어야 합니다.
#     test_x = pd.concat([test_x, pd.DataFrame(ohe.transform(test_x[[i]]), columns=ohe.categories_[0])], axis=1)
    
# # train_x = train_x.drop(qual_col, axis=1)
# # test_x = test_x.drop(qual_col, axis=1)
# print('Done.')

In [9]:
from sklearn.preprocessing import LabelEncoder

# Categorical (nominal) columns that must be turned into integer codes.
qual_col = ['propertyType', 'suburbName']

# Code assigned to test categories that never occurred in the training data.
UNSEEN_CODE = -1

for i in qual_col:
    # A fresh encoder per column, fitted on the TRAINING data only.
    le = LabelEncoder()
    train_x[i] = le.fit_transform(train_x[i])

    # Re-use the training mapping for the test data. Re-fitting on the test
    # column (as the previous version did) assigns codes from the test set's
    # own sorted labels, so the same integer could stand for different
    # categories in train and test.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_x[i] = (
        test_x[i]
        .map(code_by_label)      # labels unknown to the encoder become NaN
        .fillna(UNSEEN_CODE)     # ...and are given a dedicated sentinel code
        .astype('int64')
    )

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 10% of the training rows.
# NOTE(review): no random_state is passed here, and the same four names are
# reassigned by the later train_test_split call (test_size=0.03,
# random_state=10), so this split is superseded before it is used.
x_train,x_test,y_train,y_test = train_test_split(train_x,train_y,test_size=0.1)

# Model Hyperparameter Setting

#### 대부분의 모델들은 사람이 직접 설정할 수 있는 Hyperparameter를 가지고 있습니다.
#### 이런 Hyperparameter에 어떤 값이 설정되는가에 따라 모델의 성능은 크게 차이나게 됩니다. 
#### 본 Baseline에서 제공한 Ridge Regression 모델에서는 alpha를 Hyperparameter로 제공했습니다. 
#### alpha는 규제항의 세기를 조절하는 계수로, 값이 클수록 규제가 강해져 모델의 오버피팅을 방지하는 역할을 합니다.


In [12]:
# Display the feature column names that will be fed to the model.
train_x.columns

Index(['propertyType', 'bedrooms', 'latitude', 'longitude', 'suburbName',
       'distanceMetro(km)', 'distanceAirport(km)', 'distanceHospital(km)',
       'distanceRailway(km)', 'area(square_meters)'],
      dtype='object')

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [25]:
from sklearn.metrics import mean_absolute_error

# Kept as an (empty) list because a later cell still displays `bag`; the
# multi-seed ensemble loop that used to fill it is no longer active.
bag = []

# Hold-out split (3% of the rows). The final model below is trained on the
# full training set, so these parts are only needed for ad-hoc validation,
# e.g. mean_absolute_error(y_test, some_model.predict(x_test)).
x_train, x_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.03, random_state=10
)

# Use an explicit seed. The previous version passed `random_state=i`, where
# `i` was whatever an earlier loop had left behind (the column name
# 'suburbName' when the notebook is run top to bottom), because the
# `for i in range(10)` loop it belonged to had been commented out.
# 42 matches the seed given to seed_everything().
Model = GradientBoostingRegressor(random_state=42)
Model.fit(train_x, train_y)


    

In [15]:
# Display the list of fitted ensemble members.
# NOTE(review): the saved output below (ten GradientBoostingRegressor entries)
# comes from an earlier execution (In [15]); the current training cell
# (In [25]) sets `bag = []` and never appends to it.
bag

[GradientBoostingRegressor(random_state=0),
 GradientBoostingRegressor(random_state=1),
 GradientBoostingRegressor(random_state=2),
 GradientBoostingRegressor(random_state=3),
 GradientBoostingRegressor(random_state=4),
 GradientBoostingRegressor(random_state=5),
 GradientBoostingRegressor(random_state=6),
 GradientBoostingRegressor(random_state=7),
 GradientBoostingRegressor(random_state=8),
 GradientBoostingRegressor(random_state=9)]

# Model Fit

In [26]:
# Build the submission: take the sample file as a template, overwrite the
# target column with the model's predictions for the test features, and
# write it out with 'ID' as the index column.
template = pd.read_csv('./sample_submission.csv')
predictions = Model.predict(test_x)
submit = template.assign(**{'monthlyRent(us_dollar)': predictions}).set_index('ID')
submit.to_csv('./submit.csv')

In [17]:
# for i in range(len(bag)):
#     submit[f'{i}'] = bag[i].predict(test_x)

In [18]:
# submit.loc[:,[str(i) for i in range(len(bag))]].mean(axis=1)

In [19]:
# submit['monthlyRent(us_dollar)'] = submit.loc[:,[str(i) for i in range(len(bag))]].mean(axis=1)

In [20]:
# submit = submit.loc[:,['ID','monthlyRent(us_dollar)']]
# submit = submit.set_index('ID')
# submit.to_csv('./submit.csv')

# Prediction

# Submit

In [21]:
# submit = pd.read_csv('./sample_submission.csv')

In [22]:
# submit['monthlyRent(us_dollar)'] = preds
# submit.head()

In [23]:
# submit.to_csv('./submit.csv', index=False)