# Airbnb 데이터를 이용한 회귀분석
#### (https://www.kaggle.com/stevezhenghp/airbnb-price-prediction)

# 1. 데이터 불러오기 및 전처리

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw Airbnb listings from the Excel workbook in the working directory.
data = pd.read_excel(io='airbnb.xlsx')
# Bare expression: rendered as the cell output in a notebook.
data

### 1.1 “number_of_reviews”가 11개 이상인 데이터만 추출하시오.

In [None]:
# Work on a copy so the raw `data` frame is not modified by the steps below.
data2 = data.copy()

In [None]:
# Keep only listings with at least 11 reviews and renumber the rows from 0.
data2 = data2.loc[data2['number_of_reviews'].ge(11)].reset_index(drop=True)
# (rows, columns) after filtering.
data2.shape

### 1.2 “property_type”은 ‘House’, ‘Apartment’, ‘Other’의 3범주로 변환하시오. I_house, I_apt의 2개 더미변수를 생성하고, “property_type”은 삭제하시오.

In [None]:
# Indicator columns for the two named property types; any other type (or a
# missing value) gets 0 in both columns and therefore acts as the baseline.
data2['I_house'] = data2['property_type'].eq('House').astype(int)
data2['I_apt'] = data2['property_type'].eq('Apartment').astype(int)

In [None]:
# Show the 0/1 frequencies of both indicators, then remove the source column.
print(data2['I_house'].value_counts(), data2['I_apt'].value_counts(), sep='\n')
data2.drop(columns=['property_type'], inplace=True)

### 1.3 “room_type”은 ‘Shared room’=1, ‘Private room’=2, ‘Entire home/apt’=3 의 정수형으로 변환하시오.

In [None]:
# Encode room_type as an ordinal integer (1 = Shared room, 2 = Private room,
# 3 = Entire home/apt).
# `replace` on an object column relies on implicit downcasting to produce an
# integer dtype; recent pandas deprecates that downcast, which can leave the
# column as object dtype. `pd.to_numeric` makes the conversion explicit and
# raises if an unmapped label is still present, instead of silently keeping
# strings in a column the models expect to be numeric.
data2['room_type'] = pd.to_numeric(
    data2['room_type'].replace({'Shared room': 1, 'Private room': 2, 'Entire home/apt': 3})
)
print(data2['room_type'].value_counts())

### 1.4 “amenities”는 amenities에 포함된 편의시설의 개수로 정의하시오.

In [None]:
# Replace the amenities text with the number of listed amenities.
# Counting separators and adding 1 reports one amenity for an empty list, so
# the surrounding braces are stripped first and an empty remainder is set to 0.
# NOTE(review): assumes the raw text looks like '{a,b,c}' with '{}' meaning
# "no amenities" -- confirm against the workbook. Missing values stay missing.
amenity_text = data2['amenities'].str.strip('{}')
amenity_count = amenity_text.str.count(',') + 1
data2['amenities'] = amenity_count.where(amenity_text != '', 0)
sns.histplot(data=data2, x='amenities')
plt.show()

### 1.5	“bed_type”은 ‘Real Bed’인 경우는 1, 그 외의 경우는 0으로 더미변수화 하시오.

In [None]:
# 1 when the listing has a 'Real Bed', 0 for every other (or missing) bed type.
data2['bed_type'] = data2['bed_type'].eq('Real Bed').astype(int)
# Frequency of each code, shown as the cell output.
data2['bed_type'].value_counts()

### 1.6 	“cancellation_policy”는 5개의 순서가 존재하는 범주형이므로, 이를 1,2,3,4,5의 정수형으로 변환하시오. 
* (flexible=1, moderate=2, strict=3, super_strict_30=4, super_strict_60=5)

In [None]:
# Encode the ordered cancellation policies as 1..5 (flexible is the most
# lenient, super_strict_60 the strictest).
# `pd.to_numeric` makes the integer conversion explicit instead of depending on
# `replace` downcasting an object column (deprecated in recent pandas), and it
# raises if a label outside the mapping is still present.
data2['cancellation_policy'] = pd.to_numeric(
    data2['cancellation_policy'].replace(
        {'flexible': 1, 'moderate': 2, 'strict': 3, 'super_strict_30': 4, 'super_strict_60': 5}
    )
)
print(data2['cancellation_policy'].value_counts())

### 1.7 	“cleaning fee” 는 더미변수화 하시오.

In [None]:
# 1 when cleaning_fee equals True, 0 otherwise (including missing values).
data2['cleaning_fee'] = data2['cleaning_fee'].eq(True).astype(int)
# Frequency of each code, shown as the cell output.
data2['cleaning_fee'].value_counts()

### 1.8 	“description” 변수는 문자열의 길이로 정의하시오. (더 긴 소개문을 제공한 곳은 더 비싼지 여부 확인해보기 위해)

In [None]:
# Replace the free-text description with its character count, then look at
# the distribution of those lengths.
data2['description'] = data2['description'].str.len()
sns.histplot(data2, x='description')
plt.show()

### 1.9 “host_identity_verified” 변수는 더미변수화 하시오.

In [None]:
# 1 when the flag is the string 't', 0 otherwise (including missing values).
data2['host_identity_verified'] = data2['host_identity_verified'].eq('t').astype(int)
# Frequency of each code, shown as the cell output.
data2['host_identity_verified'].value_counts()

### 1.10	“instant_bookable” 변수는 더미변수화 하시오.

In [None]:
# 1 when the flag is the string 't', 0 otherwise (including missing values).
data2['instant_bookable'] = data2['instant_bookable'].eq('t').astype(int)
# Frequency of each code, shown as the cell output.
data2['instant_bookable'].value_counts()

### 1.11	“latitude”와 “longitude”를 이용하여 “도심의 중심위치로부터의 거리” 라는 변수를 추가하시오.

|CITY|LAT|LONG|
|:-:|:-:|:-:|
|NYC|40.664167|-73.938611|
|SF|37.7793|-122.4192|
|DC|38.895|-77.036667|
|LA|34.05|-118.25|
|Chicago|41.881944|-87.627778|
|Boston|42.357778|-71.061667|
|||출처: GeoHack|

In [None]:
# Reference coordinates (decimal degrees) of each city's center, taken from
# the table above. One row per city; joined onto the listings by 'city'.
city_center = pd.DataFrame({
    'city': ['NYC', 'SF', 'DC', 'LA', 'Chicago', 'Boston'],
    'city_lat': [40.664167, 37.7793, 38.895, 34.05, 41.881944, 42.357778],
    'city_long': [-73.938611, -122.4192, -77.036667, -118.25, -87.627778, -71.061667],
})

In [None]:
# Attach each listing's city-center coordinates; a left join keeps every listing.
data2 = data2.merge(city_center, how='left', on='city')

In [None]:
# Great-circle (haversine) distance in kilometres from each listing to its
# city center.
# A plain Euclidean norm on raw degrees treats one degree of longitude as equal
# to one degree of latitude, which overstates east-west offsets at these
# latitudes; the haversine formula gives a direction-independent distance.
earth_radius_km = 6371.0088  # mean Earth radius
listing_lat = np.radians(data2['latitude'])
center_lat = np.radians(data2['city_lat'])
half_dlat = (center_lat - listing_lat) / 2
half_dlong = np.radians(data2['city_long'] - data2['longitude']) / 2
haversine_term = (np.sin(half_dlat) ** 2
                  + np.cos(listing_lat) * np.cos(center_lat) * np.sin(half_dlong) ** 2)
data2['distance_from_city_center'] = 2 * earth_radius_km * np.arcsin(np.sqrt(haversine_term))
data2

### 1.12 ‘로그가격비(log_price_ratio)’ 변수를 생성하시오. 여기서 로그가격비는 아래와 같다.
* $\text{로그가격비}=\log\left(\frac{\text{원가격}}{\text{도시별 평균가격}}\right)$
* 여기서 '원가격'$=e^{\text{log\_price}}$, '도시별 평균가격'은 같은 도시 내 '원가격'의 평균값을 의미한다.

In [None]:
# Undo the log transform: price = exp(log_price).
data2['price'] = np.exp(data2['log_price'])

In [None]:
# Mean listing price per city, as a two-column frame (city, avg_price_by_city).
# Only 'price' is aggregated: calling mean() on the whole grouped frame also
# tries to average the remaining text columns, which raises TypeError on
# pandas >= 2.0 (numeric_only defaults to False there).
avg_price_by_city = (
    data2.groupby('city', as_index=False)['price']
    .mean()
    .rename(columns={'price': 'avg_price_by_city'})
)
avg_price_by_city

In [None]:
# Attach the city-level average price to every listing (left join on 'city').
data2 = data2.merge(avg_price_by_city, how='left', on='city')

In [None]:
# Target variable: log of (listing price / average price in the same city).
data2['log_price_ratio'] = np.log(data2['price'].div(data2['avg_price_by_city']))
# Bare expression: rendered as the cell output in a notebook.
data2

### 1.13	'id', 'first_review', 'host_has_profile_pic', 'host_since','last_review',             'latitude', 'longitude', 'city_lat', 'city_long', 'price', 'avg_price_by_city', 'name', 'neighbourhood', 'thumbnail_url', 'zipcode', 'city', 'log_price' 변수를 삭제하시오.

In [None]:
# Columns that are identifiers, raw text/dates, or intermediates already folded
# into derived features; they are removed before modelling.
delete_var = [
    'id', 'first_review', 'host_has_profile_pic', 'host_since', 'last_review',
    'latitude', 'longitude', 'city_lat', 'city_long',
    'price', 'avg_price_by_city',
    'name', 'neighbourhood', 'thumbnail_url', 'zipcode', 'city', 'log_price',
]
data2.drop(columns=delete_var, inplace=True)

### 1.14	결측치가 있는 데이터는 삭제하시오.

In [None]:
# Number of missing values in each remaining column.
data2.isna().sum()

In [None]:
# Drop every row that has at least one missing value and renumber from 0.
data2 = data2.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# Inspect the dtype of each column after preprocessing.
data2.dtypes

In [None]:
# Preview the first rows of the cleaned frame.
data2.head()

In [None]:
# (rows, columns) of the cleaned frame.
data2.shape

# 2. 변수선택 및 데이터 분할
### 로그가격비를 종속변수로 하여 회귀분석을 수행한다.

In [None]:
# Response: log price ratio. Predictors: every other remaining column.
y = data2['log_price_ratio']
X = data2.drop(columns='log_price_ratio')

### 2.1 변수선택(variable selection)을 수행한다. 변수선택에는 랜덤포레스트의 변수중요도를 이용하며, 변수중요도가 거의 없는 변수들을 제거한 뒤 이후 분석을 진행한다.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest fitted on all predictors; its feature importances are used
# below to choose which variables to keep.
# NOTE(review): this fit uses the full data set, before the train/test split
# further down, so the rows that later form the test set also influence the
# variable selection.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X, y)

In [None]:
# Horizontal bars of the forest's feature importances, one per predictor,
# with a red reference line at 0.02.
sns.barplot(y=X.columns, x=model.feature_importances_)
plt.axvline(x=0.02, color='red')
plt.show()

In [None]:
# Predictors kept for modelling.
xname = [
    'room_type',
    'amenities',
    'accommodates',
    'bathrooms',
    'description',
    'number_of_reviews',
    'review_scores_rating',
    'bedrooms',
    'distance_from_city_center',
]
# Independent copy restricted to the selected columns, in the order above.
X = X.loc[:, xname].copy()

### 2.2 데이터를 train:test = 5:5의 비율로 분할한다.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# 3. 회귀분석 9개 모형 수행

### 3.1	선형회귀분석 (statsmodels OLS) 

In [None]:
import statsmodels.api as sm

In [None]:
# Ordinary least squares with an explicit intercept column added to the
# training predictors.
linear = sm.OLS(endog=y_train, exog=sm.add_constant(X_train))
model1 = linear.fit()
# The test predictors get the same intercept column before predicting.
y_pred1 = model1.predict(sm.add_constant(X_test))
# Coefficient table and fit statistics, shown as the cell output.
model1.summary()

**7개의 X변수가 유의함. room_type이 가장 유의함(넓은 면적을 사용할수록 시내평균가격보다 비쌈). 시내중심으로 멀어질수록 가격은 하락함**

### 3.2  DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# 3-fold grid search over the cost-complexity pruning strength of a
# regression tree, scored by (negated) mean absolute error.
np.random.seed(0)
grid = {'ccp_alpha': np.arange(0.000, 0.005, 0.001)}
g_cv = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train, y_train)

In [None]:
# Best pruning strength and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final regression tree with a fixed pruning strength, fitted on the training
# half and used to predict the test half.
model2 = DecisionTreeRegressor(random_state=0, ccp_alpha=0.001).fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree on a wide canvas, labelling splits with predictor names.
plt.figure(figsize=(20, 8))
plot_tree(
    model2,
    feature_names=xname,
    filled=True,
    fontsize=12,
)
plt.show()

**room_type이 가장 먼저 나타남. house인 경우에는 화장실의 개수가 가격에 영향을 줌. private room인 경우에는 수용인원이 가격에 영향을 줌.**

### 3.3 MLPRegressor

**표준화**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise predictors using statistics learned from the training half
# only, then apply the same transformation to both halves.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# 3-fold grid search over small MLP architectures: one hidden layer of 3-5
# units, then every two-layer combination of 3-5 units per layer.
np.random.seed(0)
grid = {
    'hidden_layer_sizes': [(3,), (4,), (5,)]
    + [(first, second) for first in (3, 4, 5) for second in (3, 4, 5)]
}
g_cv = GridSearchCV(
    estimator=MLPRegressor(random_state=0, max_iter=1000),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train_scaled, y_train)

In [None]:
# Best architecture and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final MLP with a single hidden layer of 4 units, trained on the scaled
# training half.
# max_iter=1000 matches the setting used during the grid search; with the
# library default the final fit could stop before converging and would not be
# the configuration that was cross-validated. `(4,)` is written as a real
# one-element tuple -- `(4)` is just the integer 4.
model3 = MLPRegressor(hidden_layer_sizes=(4,), random_state=0, max_iter=1000)
model3.fit(X_train_scaled, y_train)
y_pred3 = model3.predict(X_test_scaled)

### 3.4  SVR (linear)

In [None]:
from sklearn.svm import SVR

In [None]:
# 3-fold grid search over the regularisation constant C of a linear-kernel
# SVR; the solver is capped at 1000 iterations during the search.
np.random.seed(0)
grid = {'C': np.arange(0.3, 0.33, 0.01)}
g_cv = GridSearchCV(
    estimator=SVR(kernel='linear', max_iter=1000),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train_scaled, y_train)

In [None]:
# Best C and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final linear-kernel SVR with C fixed at 0.3, fitted on the scaled training
# half and used to predict the scaled test half.
model4 = SVR(C=0.3, kernel='linear').fit(X_train_scaled, y_train)
y_pred4 = model4.predict(X_test_scaled)

### 3.5 SVR (rbf)

In [None]:
# 3-fold grid search over the regularisation constant C of an RBF-kernel SVR.
np.random.seed(0)
grid = {'C': np.arange(0.4, 0.5, 0.05)}
g_cv = GridSearchCV(
    estimator=SVR(kernel='rbf'),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train_scaled, y_train)

In [None]:
# Best C and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final RBF-kernel SVR with C fixed at 0.45, fitted on the scaled training
# half and used to predict the scaled test half.
model5 = SVR(C=0.45, kernel='rbf').fit(X_train_scaled, y_train)
y_pred5 = model5.predict(X_test_scaled)

### 3.6 BaggingRegressor

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Bagging ensemble of 100 base learners (library default base estimator),
# fitted on the unscaled training half.
model6 = BaggingRegressor(random_state=0, n_estimators=100).fit(X_train, y_train)
y_pred6 = model6.predict(X_test)

### 3.7 RandomForestRegressor

In [None]:
# 3-fold grid search over how many predictors a random forest considers at
# each split.
np.random.seed(0)
grid = {'max_features': ["sqrt", 4, 5, 6]}
g_cv = GridSearchCV(
    estimator=RandomForestRegressor(n_estimators=100, random_state=0),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train, y_train)

In [None]:
# Best max_features and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final random forest: 100 trees, sqrt(n_features) candidates per split.
model7 = RandomForestRegressor(
    max_features="sqrt", n_estimators=100, random_state=0
).fit(X_train, y_train)
y_pred7 = model7.predict(X_test)

### 3.8 AdaBoostRegressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# AdaBoost over depth-10 regression trees, 100 boosting rounds.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional argument is the base learner in
# both old and new releases.
model8 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=100, random_state=0)
model8.fit(X_train, y_train)
y_pred8 = model8.predict(X_test)

### 3.9 GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# 3-fold grid search over the depth of the individual trees in a gradient
# boosting regressor.
np.random.seed(0)
grid = {'max_depth': [3, 4, 5, 6, 7]}
g_cv = GridSearchCV(
    estimator=GradientBoostingRegressor(n_estimators=100, random_state=0),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
g_cv.fit(X_train, y_train)

In [None]:
# Best max_depth and its cross-validated score (negated MAE).
print(f'selected: {g_cv.best_params_}')
print(f'score   : {g_cv.best_score_}')

In [None]:
# Final gradient boosting model: 100 trees of depth 5.
model9 = GradientBoostingRegressor(
    max_depth=5, n_estimators=100, random_state=0
).fit(X_train, y_train)
y_pred9 = model9.predict(X_test)

# 예측력 비교
**평가시 기준은 MAE와 예측 $R^2$를 사용한다.**

In [None]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape

**Lineplot을 그리는데, x축은 방법이름, y축은 MAE인 그래프로 그린다.**

In [None]:
# Test-set mean absolute error of every fitted model, one row per model in
# the order the models were built.
mae_table = pd.DataFrame(
    [
        [label, mae(y_test, predicted)]
        for label, predicted in [
            ('LinearRegression', y_pred1),
            ('DecisionTreeRegressor', y_pred2),
            ('MLPRegressor', y_pred3),
            ('SVR (linear)', y_pred4),
            ('SVR (rbf)', y_pred5),
            ('BaggingRegressor', y_pred6),
            ('RandomForestRegressor', y_pred7),
            ('AdaBoostRegressor', y_pred8),
            ('GradientBoostingRegressor', y_pred9),
        ]
    ],
    columns=['Model', 'MAE'],
)

In [None]:
# Show the MAE comparison table.
mae_table

In [None]:
# Line plot of test MAE per model.
plt.subplots(figsize=(8, 6))
g = sns.lineplot(data=mae_table, x="Model", y="MAE")
# Rotate the tick labels the axis already has instead of overwriting them with
# set_xticklabels: that call pins label text without pinning tick positions,
# which emits a FixedFormatter warning and can mislabel ticks if their
# positions change.
plt.setp(g.get_xticklabels(), rotation=30, horizontalalignment='right')
plt.show()

**Lineplot을 그리는데, x축은 방법이름, y축은 예측$R^2$인 그래프로 그린다.**

In [None]:
from sklearn.metrics import r2_score as r2
# Test-set R^2 of every fitted model, one row per model in the order the
# models were built.
r2_table = pd.DataFrame(
    [
        [label, r2(y_test, predicted)]
        for label, predicted in [
            ('LinearRegression', y_pred1),
            ('DecisionTreeRegressor', y_pred2),
            ('MLPRegressor', y_pred3),
            ('SVR (linear)', y_pred4),
            ('SVR (rbf)', y_pred5),
            ('BaggingRegressor', y_pred6),
            ('RandomForestRegressor', y_pred7),
            ('AdaBoostRegressor', y_pred8),
            ('GradientBoostingRegressor', y_pred9),
        ]
    ],
    columns=['Model', 'R2'],
)

In [None]:
# Show the R^2 comparison table.
r2_table

In [None]:
# Line plot of test R^2 per model.
plt.subplots(figsize=(8, 6))
g = sns.lineplot(data=r2_table, x="Model", y="R2")
# Rotate the tick labels the axis already has instead of overwriting them with
# set_xticklabels: that call pins label text without pinning tick positions,
# which emits a FixedFormatter warning and can mislabel ticks if their
# positions change.
plt.setp(g.get_xticklabels(), rotation=30, horizontalalignment='right')
plt.show()