In [1]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS

In [2]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# knn model
from sklearn.neighbors import KNeighborsRegressor

# decision tree model
from sklearn.tree import DecisionTreeRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


sns.set_style('whitegrid')

# Problem definition

Apply regression models to predict Airbnb prices

# Load the data

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print("df_train shape:"+str(df_train.shape) )
print("df_test shape:"+str(df_test.shape) )

df_train shape:(51000, 29)
df_test shape:(23111, 28)


In [4]:
print(df_train.columns)

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')


In [5]:
df_train.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,13662370,3.806662,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.5,Real Bed,strict,True,...,41.849684,-87.67627,Pilsen Arts Community Custom Home,Pilsen,17,97.0,https://a0.muscache.com/im/pictures/81318153/a...,60608,1.0,1.0
1,4765892,4.941642,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,2.0,Real Bed,strict,True,...,34.068613,-118.246455,Apartment 5 minutes from DTLA & Dodger Stadium,Echo Park,2,100.0,https://a0.muscache.com/im/pictures/aa00250e-0...,90012,1.0,1.0
2,21169968,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.701958,-73.917352,"Brand New Huge 2bdr apartment(L,M train 2 min)",Bushwick,25,88.0,https://a0.muscache.com/im/pictures/d9220535-c...,11237,2.0,3.0
3,7939196,4.867534,Apartment,Entire home/apt,"{""Cable TV"",Internet,""Wireless Internet"",""Air ...",6,1.0,Real Bed,strict,True,...,40.742959,-73.99082,Grande Super Large APT !!!,Flatiron District,12,82.0,,10010,1.0,3.0
4,18161036,3.663562,House,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",2,1.0,Real Bed,flexible,True,...,34.046473,-117.734095,Private Cozy and Clean Rooms in Pomona,,2,100.0,https://a0.muscache.com/im/pictures/e0c9b2f9-a...,91766,1.0,1.0


In [6]:
# print(df_train.isnull().sum())
# no blanks

# Feature Engineering 

#### Modify Columns names 

In [7]:
df_train[(df_train['zipcode'] == '95202\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n94158')]

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
5890,18915873,4.65396,Apartment,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",1,1.0,Real Bed,flexible,True,...,37.773742,-122.391503,San Francisco Luxury,Mission Bay,5,95.0,https://a0.muscache.com/im/pictures/13f8dcaa-e...,95202\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n\r\r\r\r\...,1.0,1.0


In [8]:
df_train[(df_train['zipcode'] == ' ')]

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
50482,19864701,4.317488,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",4,1.0,Real Bed,moderate,True,...,38.936838,-76.993548,Stylish Midrise Steps from Metro,,25,88.0,https://a0.muscache.com/im/pictures/2f7dbb0b-a...,,1.0,1.0


In [9]:
print(df_train[(df_train['zipcode'] == '95202\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n94158')]['zipcode'])
print(df_train[(df_train['zipcode'] == ' ')]['zipcode'])
# print(df_train[(df_train.id == 18915873)]['zipcode'])

#overwrite '95202\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n94158'
# Ref.: https://www.dataquest.io/blog/settingwithcopywarning/
df_train.loc[df_train.id == 18915873,'zipcode'] = '95202_94158'
df_train.loc[df_train.id == 19864701,'zipcode'] = 'blank_over'

print(df_train[(df_train.id == 18915873)]['zipcode'])
print(df_train[(df_train.id == 19864701)]['zipcode'])

5890    95202\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\n\r\r\r\r\...
Name: zipcode, dtype: object
50482     
Name: zipcode, dtype: object
5890    95202_94158
Name: zipcode, dtype: object
50482    blank_over
Name: zipcode, dtype: object


In [10]:
# df_train['zipcode'] = str(df_train['zipcode'])

In [11]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS

# zipcode to numerical columns
df_zipcode_1 = pd.get_dummies(df_train['zipcode'])
df = pd.concat([df_train, df_zipcode_1], axis=1).fillna(0.0)

print(list(df_zipcode_1.columns))
print('')
print(list(df.columns))

['02108', '02109', '02110', '02111', '02113', '02114', '02115', '02116', '02118', '02119', '02120', '02121', '02122', '02124', '02125', '02126', '02127', '02128', '02129', '02130', '02131', '02132', '02134', '02135', '02136', '02138', '02139', '02145', '02152', '02186', '02199', '02210', '02215', '02445', '02446', '02467', '07306', '10000', '10001', '10001.0', '10002', '10002.0', '10003', '10003-8623', '10003.0', '10004', '10004.0', '10005', '10006', '10007', '10009', '10009.0', '10010', '10010.0', '10011', '10011.0', '10012', '10012.0', '10013', '10013.0', '10014', '10014.0', '10016', '10017', '10018', '10018.0', '10019', '10019.0', '10021', '10022', '10023', '10024', '10025', '10026', '10026.0', '10027', '10027.0', '10028', '10029', '10029.0', '10030', '10031', '10032', '10033', '10034', '10035', '10035.0', '10036', '10036.0', '10037', '10037.0', '10038', '10038.0', '10039', '10040', '10044', '10048', '10065', '10069', '10075', '10106', '10118', '10119.0', '10128', '10128.0', '10129'

In [12]:
df_zipcode_1.dtypes

02108          uint8
02109          uint8
02110          uint8
02111          uint8
02113          uint8
02114          uint8
02115          uint8
02116          uint8
02118          uint8
02119          uint8
02120          uint8
02121          uint8
02122          uint8
02124          uint8
02125          uint8
02126          uint8
02127          uint8
02128          uint8
02129          uint8
02130          uint8
02131          uint8
02132          uint8
02134          uint8
02135          uint8
02136          uint8
02138          uint8
02139          uint8
02145          uint8
02152          uint8
02186          uint8
               ...  
94108          uint8
94109          uint8
94109.0        uint8
94110          uint8
94111          uint8
94112          uint8
94114          uint8
94114.0        uint8
94115          uint8
94115.0        uint8
94116          uint8
94117          uint8
94117.0        uint8
94118          uint8
94118.0        uint8
94121          uint8
94122        

In [13]:
df.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,94127,94129,94130,94131,94132,94133,94134,94158,95202_94158,blank_over
0,13662370,3.806662,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.5,Real Bed,strict,True,...,0,0,0,0,0,0,0,0,0,0
1,4765892,4.941642,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,2.0,Real Bed,strict,True,...,0,0,0,0,0,0,0,0,0,0
2,21169968,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,0,0,0,0,0,0,0,0,0,0
3,7939196,4.867534,Apartment,Entire home/apt,"{""Cable TV"",Internet,""Wireless Internet"",""Air ...",6,1.0,Real Bed,strict,True,...,0,0,0,0,0,0,0,0,0,0
4,18161036,3.663562,House,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",2,1.0,Real Bed,flexible,True,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS
# select the columns
X_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating'] + list(df_zipcode_1.columns)
y_column = ['log_price']
print(X_columns)

['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating', '02108', '02109', '02110', '02111', '02113', '02114', '02115', '02116', '02118', '02119', '02120', '02121', '02122', '02124', '02125', '02126', '02127', '02128', '02129', '02130', '02131', '02132', '02134', '02135', '02136', '02138', '02139', '02145', '02152', '02186', '02199', '02210', '02215', '02445', '02446', '02467', '07306', '10000', '10001', '10001.0', '10002', '10002.0', '10003', '10003-8623', '10003.0', '10004', '10004.0', '10005', '10006', '10007', '10009', '10009.0', '10010', '10010.0', '10011', '10011.0', '10012', '10012.0', '10013', '10013.0', '10014', '10014.0', '10016', '10017', '10018', '10018.0', '10019', '10019.0', '10021', '10022', '10023', '10024', '10025', '10026', '10026.0', '10027', '10027.0', '10028', '10029', '10029.0', '10030', '10031', '10032', '10033', '10034', '10035', '10035.0', '10036', '10036.0', '10037', '10037.0', '10038', '10038.0', '10039', '10040', '10044', '10048', '10065', '

# Model Training

In [15]:
# threshold = 0.8 
# 80% for train, 20% for test

threshold = 1

X = df[X_columns]
y = df[y_column]

X_train, Xu_test, y_train, yu_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape) #df_train, X_train
print('y_train', y_train.shape) #df_train, y_train
print('Xu_test', Xu_test.shape) # unreal X_test
print('yu_test', yu_test.shape) # unreal y_test

X_train (51000, 737)
y_train (51000, 1)
Xu_test (0, 737)
yu_test (0, 1)


In [16]:
df_zipcode_2 = pd.get_dummies(df_test['zipcode'])
df = pd.concat([df_train, df_zipcode_2], axis=1).fillna(0.0)
X_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating'] + list(df_zipcode_2.columns)
y_column = ['log_price']

threshold = 1

X = df[X_columns]
y = df[y_column]

X_test, Xu_train, y_test, yu_train = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_test', X_test.shape) #df_test, X_test
print('y_test', y_test.shape) #df_test, y_test
print('Xu_train', Xu_train.shape) # unreal X_train
print('yu_train', yu_train.shape) # unreal y_train

X_test (51000, 673)
y_test (51000, 1)
Xu_train (0, 673)
yu_train (0, 1)


# Missing values

In [17]:
# handle missing values

# probably not a good idea for 'review_scores_rating' 
# as it fill empty reviews with 0 when the actual review score is unknown
df_train = df_train.fillna(0.0) 

print(df_train.shape)

(51000, 29)


# Experiments

In [18]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS/blob/master/week5/regression-house-sklearn-ext-class-version.ipynb

def model_training(model_name, model, X_train, y_train):
    model.fit(X_train, y_train)
    return model
    
def model_prediction(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

def model_evaluation(model_name, y_test, y_pred):
    print(model_name)
    print('MAE', mean_absolute_error(y_test, y_pred))
    print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.plot(range(0,5000000, 100), range(0,5000000, 100), '--r', alpha=0.3, label='Line1')
    plt.title(model_name)
    plt.xlabel('True Value')
    plt.ylabel('Predict Value')
    plt.xlim([0, 5000000])
    plt.ylim([0, 5000000])
    plt.show()
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test):
    train_model = model_training(model_name, model, X_train, y_train)
    predictions = model_prediction(train_model, X_test)
    model_evaluation(model_name, y_test, predictions)
    
run_experiment('Linear Regression', LinearRegression(), X_train, y_train, X_test)
run_experiment('KNN 5', KNeighborsRegressor(5), X_train, y_train, X_test)
run_experiment('KNN 2', KNeighborsRegressor(2), X_train, y_train, X_test)
run_experiment('Decision Tree', DecisionTreeRegressor(), X_train, y_train, X_test)
run_experiment('Random Forest 10', RandomForestRegressor(10), X_train, y_train, X_test)
run_experiment('Random Forest 100', RandomForestRegressor(100), X_train, y_train, X_test)



ValueError: shapes (51000,673) and (737,1) not aligned: 673 (dim 1) != 737 (dim 0)

# Error Analysis

In [None]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS/blob/master/week5/regression-house-sklearn-ext-class-version.ipynb

model = RandomForestRegressor(100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
#for i in range(len(X_test.columns)):
fi = []
for i, col in enumerate(X_test.columns):
    fi.append([col, model.feature_importances_[i]])
pd.DataFrame(fi).sort_values(1, ascending=False)

In [None]:
df_test = pd.DataFrame(X_test).copy()
df_test['price'] = y_test
df_test['prediction'] = y_pred
df_test['abs_error'] = abs(df_test['price']-df_test['prediction'])
df_test.sort_values(by='abs_error', ascending=False).round()

In [None]:
plt.hist(df_test['abs_error'], bins=30)
plt.show()

In [None]:
df_error = df_test[df_test['abs_error']>500000]
df_error

In [None]:
df_error.describe()

In [None]:
df_error.corr()['abs_error'].dropna().sort_values()

# Cross Validation

In [None]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS

In [None]:
models = [
    ('LinearRegression', LinearRegression()),
    ('RandomForestRegressor10', RandomForestRegressor(n_estimators=10)),
    ('RandomForestRegressor100', RandomForestRegressor(n_estimators=100, n_jobs=4)),
    ('KNeighborsRegressor', KNeighborsRegressor()),
    ('DecisionTreeRegressor', DecisionTreeRegressor())
]

k = 10
results = {}
for m in models:
     print('MODEL', m[0])
     results[m[0]] = {'mae':[], 'rmse':[]}
     kf = KFold(n_splits=k)
     for train_index, test_index in kf.split(X):
         X_train_k, X_test_k = X.values[train_index], X.values[test_index]
         y_train_k, y_test_k = y.values[train_index], y.values[test_index]
        
         model = m[1]
         model.fit(X_train_k, y_train_k.ravel())
         y_pred = model.predict(X_test_k)
            
         mae = mean_absolute_error(y_test_k, y_pred)
         rmse = np.sqrt(mean_squared_error(y_test_k, y_pred))
            
         results[m[0]]['mae'].append(mae)
         results[m[0]]['rmse'].append(rmse)
            

In [None]:
for metric in ['mae', 'rmse']:
     values = []
     labels = []
     for model, result_values in results.items():
         for m, v in result_values.items():
             if m == metric:
                 labels.append(model)
                 values.append(v)
                    
     plt.figure(figsize=(12,6))
     plt.title(metric)
     plt.boxplot(values)
     plt.xticks(range(1, len(labels)+1), labels, rotation='horizontal')
     plt.show()
    

# Prepare submission

In [None]:
df_prediction = df_test[X_columns].fillna(0.0)
df_test['log_price'] = model.predict(df_prediction)
df_test[['id', 'log_price']]

In [None]:
df_test[['id', 'log_price']].to_csv('submission_linear_regression_modified.csv', index=False)