In [1]:
import ast
import sys

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

## I. Load related data

In [2]:
# Load the two gzip-compressed CSV exports used by this notebook.
# `listings` feeds the pre-processing and modelling cells below;
# `reviews` is only inspected in the last cells.
listings = pd.read_csv("data/listings.csv.gz")
reviews = pd.read_csv("data/reviews.csv.gz")

## II. Data inspections

In [3]:
# Long free-text / URL columns are hidden so the preview stays readable.
long_content_cols = [
    "description",
    "neighborhood_overview",
    "picture_url",
    "host_url",
    "host_about",
    "host_thumbnail_url",
    "host_picture_url",
]
listings.drop(columns=long_content_cols).head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,5570,https://www.airbnb.com/rooms/5570,20201223070513,2020-12-23,Ocean front condo on the sand,8435,Jef Karchin,2009-02-23,"San Diego, California, United States",within a few hours,100%,100%,f,Mission Beach,3.0,3.0,"['email', 'phone', 'kba']",t,t,"San Diego, California, United States",Mission Bay,,32.7843,-117.25258,Entire condominium,Entire home/apt,6,,1 bath,2.0,4.0,"[""Iron"", ""Elevator"", ""Stove"", ""Microwave"", ""Di...","$2,050.00",3,365,3.0,3.0,365.0,365.0,3.0,365.0,,t,6,21,44,207,2020-12-23,0,0,0,,,,,,,,,,,f,3,3,0,0,
1,29967,https://www.airbnb.com/rooms/29967,20201223070513,2020-12-23,"Great home, 10 min walk to Beach",129123,Michael,2010-05-21,"San Diego, California, United States",within an hour,100%,50%,t,Pacific Beach,6.0,6.0,"['email', 'phone', 'reviews', 'kba', 'work_ema...",t,t,,Pacific Beach,,32.80724,-117.2563,Entire bungalow,Entire home/apt,8,,2 baths,2.0,3.0,"[""Iron"", ""Hot water"", ""Heating"", ""Kitchen"", ""W...",$261.00,4,365,4.0,4.0,365.0,365.0,4.0,365.0,,t,6,6,6,213,2020-12-23,62,2,1,2010-07-09,2020-11-29,98.0,10.0,10.0,10.0,10.0,10.0,10.0,,f,5,5,0,0,0.49
2,38245,https://www.airbnb.com/rooms/38245,20201223070513,2020-12-23,Point Loma: Den downstairs,164137,Melinda,2010-07-12,"San Diego, California, United States",within a day,100%,86%,f,Loma Portal,3.0,3.0,"['email', 'phone', 'reviews', 'kba']",t,t,"San Diego, California, United States",Roseville,,32.74202,-117.2187,Private room in house,Private room,1,,1 shared bath,1.0,1.0,"[""Iron"", ""Hot water"", ""Dishes and silverware"",...",$74.00,1,21,1.0,1.0,21.0,21.0,1.0,21.0,,t,24,54,84,359,2020-12-23,143,0,0,2010-09-09,2019-10-20,86.0,9.0,9.0,10.0,10.0,9.0,9.0,,f,3,0,3,0,1.14
3,54001,https://www.airbnb.com/rooms/54001,20201223070513,2020-12-23,"La Jolla 2 Bdr Cottage: 1 Qn; 2Twns, Blks 2 Ocn",252692,Marsha,2010-10-04,"San Diego, California, United States",within an hour,100%,75%,t,La Jolla,5.0,5.0,"['email', 'phone', 'reviews', 'kba']",t,t,"La Jolla, California, United States",La Jolla,,32.81301,-117.26856,Entire guesthouse,Entire home/apt,3,,1 bath,2.0,5.0,"[""Luggage dropoff allowed"", ""Single level home...",$110.00,3,365,3.0,30.0,18.0,1125.0,23.0,1079.9,,t,0,0,18,99,2020-12-23,244,30,5,2011-01-04,2020-12-22,99.0,10.0,10.0,10.0,10.0,10.0,10.0,,f,2,1,1,0,2.01
4,62274,https://www.airbnb.com/rooms/62274,20201223070513,2020-12-23,"charming, colorful, close to beach",302986,Isabel,2010-11-28,"San Diego, California, United States",within an hour,92%,95%,t,Pacific Beach,2.0,2.0,"['email', 'phone', 'reviews', 'kba']",t,t,"San Diego, California, United States",Pacific Beach,,32.80734,-117.24243,Entire guesthouse,Entire home/apt,2,,1 bath,1.0,1.0,"[""Luggage dropoff allowed"", ""Iron"", ""Hot water...",$74.00,1,30,1.0,1.0,1125.0,1125.0,1.0,1125.0,,t,4,33,63,320,2020-12-23,620,77,0,2010-12-05,2020-11-14,96.0,10.0,10.0,10.0,10.0,10.0,10.0,,f,2,2,0,0,5.07


## III. Data pre-processing

### III.1 get rid of columns with no information

In [4]:
# Columns with fewer than two distinct values (NaN counts as a value) carry
# no information for modelling.
n_distinct = listings.nunique(dropna=False)
constant_cols = n_distinct[n_distinct < 2].index

# Columns where more than a quarter of the rows are missing.
is_null_cnt = listings.isnull().sum()
sparse_cols = is_null_cnt[is_null_cnt > listings.shape[0] / 4].index

# Sorted so the printed list is deterministic (set iteration order is not).
exclude_list = sorted(set(constant_cols) | set(sparse_cols))
print(exclude_list)

# Identifier columns that are not used for analysis; the set union avoids
# listing a column twice when it was already flagged above.
exclude_list = sorted(set(exclude_list) | {"scrape_id", "host_id"})

# errors="ignore" keeps the cell re-runnable: on a second run the hard-coded
# identifier columns are already gone and would otherwise raise KeyError.
listings = listings.drop(columns=exclude_list, errors="ignore")

['license', 'calendar_updated', 'neighborhood_overview', 'scrape_id', 'bathrooms', 'host_about', 'neighbourhood', 'neighbourhood_group_cleansed']


### III.2 Separate numeric and non-numeric data and process them with different methods

In [5]:
# Numeric features: every non-object column of the pruned listings frame.
df_numeric = listings.select_dtypes(exclude=['object'])

# Categorical columns chosen via data inspection.
useful_cat_varibles = ["neighbourhood_cleansed", "host_identity_verified", "host_neighbourhood"]
useful_cat_varibles += ["property_type", "room_type"]
useful_cat_varibles += ["host_response_time"]
# .copy() makes df_cat an independent frame, so the encoded columns assigned
# below do not trigger pandas' SettingWithCopyWarning.
df_cat = listings[useful_cat_varibles].copy()

# NOTE(review): not used in this cell; the III.1 output lists "host_about"
# among the dropped columns, so confirm before selecting these from `listings`.
description_cols = ["description", "host_about"]
ser_amenities = listings.amenities

# Parse the target, e.g. "$2,050.00" -> 2050.0.
# regex=False treats "$" as a literal character rather than the regex
# end-of-string anchor, independent of the pandas version's default.
Y = (
    listings.price
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# Encode each categorical column as integer codes; the fitted encoders are
# kept in le_dict so the codes can be mapped back to the original labels.
le_dict = {}
categorical_feature = []
for col in df_cat.columns:
    le = preprocessing.LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])
    le_dict[col] = le
    categorical_feature.append(col)

df_all = pd.concat([df_numeric, df_cat], axis=1)
# Each amenities entry is the text of a list literal; ast.literal_eval parses
# it without executing arbitrary code from the data file (unlike eval).
df_all["n_amenities"] = ser_amenities.apply(lambda x: len(ast.literal_eval(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### III.3 missing data imputation

In [6]:
# Count missing values per column and print only the columns that have any.
varibles_na_cnt = df_all.isna().sum()
print(varibles_na_cnt.loc[varibles_na_cnt > 0])

host_listings_count               3
host_total_listings_count         3
bedrooms                        997
beds                             50
minimum_minimum_nights            1
maximum_minimum_nights            1
minimum_maximum_nights            1
maximum_maximum_nights            1
minimum_nights_avg_ntm            1
maximum_nights_avg_ntm            1
review_scores_rating           1482
review_scores_accuracy         1525
review_scores_cleanliness      1524
review_scores_checkin          1527
review_scores_communication    1525
review_scores_location         1527
review_scores_value            1527
reviews_per_month              1441
dtype: int64


In [8]:
# Fill missing values column by column and remember the fill value used:
#   - columns whose name contains "review" are filled with 0,
#   - every other column is filled with its most frequent value.
imputer_dict = {}
for col in varibles_na_cnt[varibles_na_cnt > 0].index:
    if "review" in col:
        value_to_fill = 0
    else:
        # value_counts() sorts by frequency, so the first index entry is the mode.
        value_to_fill = df_all[col].value_counts().index[0]
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated in recent
    # pandas and does not update df_all when copy-on-write is enabled.
    df_all[col] = df_all[col].fillna(value_to_fill)
    imputer_dict[col] = value_to_fill

## IV. Split training and testing datasets

In [21]:
# Every column except the listing identifier is used as a model input.
X = df_all.drop(columns=["id"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=9527,
)

In [23]:
# Preview the first rows of the training features (categoricals are already integer-encoded).
X_train.head()

Unnamed: 0,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,neighbourhood_cleansed,host_identity_verified,host_neighbourhood,property_type,room_type,host_response_time,n_amenities
8358,11.0,11.0,32.74679,-117.1621,2,1.0,1.0,7,1125,3.0,7.0,1125.0,1125.0,6.9,1125.0,0,15,45,54,5,5,0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,9,9,0,0,2.03,51,1,90,6,0,3,19
8832,2.0,2.0,32.84143,-117.2724,5,2.0,3.0,2,1125,2.0,2.0,1125.0,1125.0,2.0,1125.0,6,7,7,274,1,1,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,3,3,0,0,1.0,41,1,166,10,0,2,41
3120,35.0,35.0,32.72371,-117.16659,4,1.0,2.0,1,365,2.0,2.0,1125.0,1125.0,2.0,1125.0,17,45,68,107,133,31,1,87.0,9.0,8.0,8.0,9.0,10.0,8.0,30,30,0,0,3.32,46,1,118,15,0,3,24
1081,1.0,1.0,32.80372,-117.24097,2,1.0,1.0,1,14,1.0,2.0,14.0,14.0,1.3,14.0,30,60,67,67,82,13,0,95.0,10.0,10.0,10.0,10.0,10.0,9.0,1,0,1,0,1.28,69,1,166,30,2,2,51
2566,10.0,10.0,32.73275,-117.1665,10,3.0,4.0,2,21,2.0,2.0,21.0,21.0,2.0,21.0,25,55,85,349,106,16,0,98.0,10.0,10.0,10.0,10.0,10.0,9.0,2,1,0,1,2.53,51,1,34,20,0,3,39


## V. Model training and evaluation

In [40]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the RMSE later computed on X_test is optimistic; consider carving a
# separate validation split out of the training data.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order the same on every run.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# Train for up to 1000 rounds, stopping once the validation scores have not
# improved for 100 rounds. Early stopping is passed as a callback because the
# `early_stopping_rounds` keyword was removed from lgb.train in LightGBM 4.x.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=100)])

Starting training...
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's l1: 241.722	valid_0's l2: 770116
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 235.902	valid_0's l2: 745134
[3]	valid_0's l1: 230.319	valid_0's l2: 723497
[4]	valid_0's l1: 225.372	valid_0's l2: 705008
[5]	valid_0's l1: 220.577	valid_0's l2: 689339
[6]	valid_0's l1: 215.814	valid_0's l2: 675576
[7]	valid_0's l1: 211.651	valid_0's l2: 664995
[8]	valid_0's l1: 207.985	valid_0's l2: 655955
[9]	valid_0's l1: 204.605	valid_0's l2: 648128
[10]	valid_0's l1: 201.081	valid_0's l2: 641127
[11]	valid_0's l1: 197.381	valid_0's l2: 631435
[12]	valid_0's l1: 194.048	valid_0's l2: 622668
[13]	valid_0's l1: 190.819	valid_0's l2: 614816
[14]	valid_0's l1: 186.9	valid_0's l2: 615057
[15]	valid_0's l1: 184.156	valid_0's l2: 607696
[16]	valid_0's l1: 181.609	valid_0's l2: 597784
[17]	valid_0's l1: 179.475	valid_0's l2: 588617
[18]	valid_0's l1: 177.358	valid_0's l2: 580309
[19]	v

[199]	valid_0's l1: 151.663	valid_0's l2: 212176
[200]	valid_0's l1: 151.615	valid_0's l2: 211503
[201]	valid_0's l1: 151.736	valid_0's l2: 210417
[202]	valid_0's l1: 151.807	valid_0's l2: 209243
[203]	valid_0's l1: 151.902	valid_0's l2: 208180
[204]	valid_0's l1: 151.946	valid_0's l2: 207319
[205]	valid_0's l1: 151.971	valid_0's l2: 206606
[206]	valid_0's l1: 152.135	valid_0's l2: 206855
[207]	valid_0's l1: 152.154	valid_0's l2: 207065
[208]	valid_0's l1: 152.226	valid_0's l2: 207296
[209]	valid_0's l1: 152.43	valid_0's l2: 207550
[210]	valid_0's l1: 152.581	valid_0's l2: 207890
[211]	valid_0's l1: 152.519	valid_0's l2: 206852
[212]	valid_0's l1: 152.454	valid_0's l2: 205824
[213]	valid_0's l1: 152.507	valid_0's l2: 205090
[214]	valid_0's l1: 152.473	valid_0's l2: 204311
[215]	valid_0's l1: 152.559	valid_0's l2: 203610
[216]	valid_0's l1: 152.555	valid_0's l2: 203198
[217]	valid_0's l1: 152.688	valid_0's l2: 202994
[218]	valid_0's l1: 152.691	valid_0's l2: 202733
[219]	valid_0's l1: 1

[389]	valid_0's l1: 148.549	valid_0's l2: 158197
[390]	valid_0's l1: 148.554	valid_0's l2: 158156
[391]	valid_0's l1: 148.536	valid_0's l2: 158166
[392]	valid_0's l1: 148.589	valid_0's l2: 158262
[393]	valid_0's l1: 148.595	valid_0's l2: 158224
[394]	valid_0's l1: 148.631	valid_0's l2: 158358
[395]	valid_0's l1: 148.559	valid_0's l2: 158224
[396]	valid_0's l1: 148.565	valid_0's l2: 157936
[397]	valid_0's l1: 148.611	valid_0's l2: 157638
[398]	valid_0's l1: 148.61	valid_0's l2: 157303
[399]	valid_0's l1: 148.765	valid_0's l2: 157103
[400]	valid_0's l1: 148.809	valid_0's l2: 156877
[401]	valid_0's l1: 148.855	valid_0's l2: 156794
[402]	valid_0's l1: 148.911	valid_0's l2: 156738
[403]	valid_0's l1: 148.889	valid_0's l2: 156602
[404]	valid_0's l1: 148.859	valid_0's l2: 156520
[405]	valid_0's l1: 148.834	valid_0's l2: 156403
[406]	valid_0's l1: 148.957	valid_0's l2: 156427
[407]	valid_0's l1: 149.064	valid_0's l2: 156453
[408]	valid_0's l1: 149.09	valid_0's l2: 156260
[409]	valid_0's l1: 14

[619]	valid_0's l1: 148.497	valid_0's l2: 143031
[620]	valid_0's l1: 148.539	valid_0's l2: 143011
[621]	valid_0's l1: 148.489	valid_0's l2: 142921
[622]	valid_0's l1: 148.423	valid_0's l2: 142767
[623]	valid_0's l1: 148.483	valid_0's l2: 142713
[624]	valid_0's l1: 148.55	valid_0's l2: 142701
[625]	valid_0's l1: 148.576	valid_0's l2: 142656
[626]	valid_0's l1: 148.583	valid_0's l2: 142669
[627]	valid_0's l1: 148.607	valid_0's l2: 142669
[628]	valid_0's l1: 148.631	valid_0's l2: 142674
[629]	valid_0's l1: 148.64	valid_0's l2: 142616
[630]	valid_0's l1: 148.639	valid_0's l2: 142595
[631]	valid_0's l1: 148.65	valid_0's l2: 142567
[632]	valid_0's l1: 148.625	valid_0's l2: 142555
[633]	valid_0's l1: 148.63	valid_0's l2: 142559
[634]	valid_0's l1: 148.642	valid_0's l2: 142578
[635]	valid_0's l1: 148.635	valid_0's l2: 142572
[636]	valid_0's l1: 148.641	valid_0's l2: 142492
[637]	valid_0's l1: 148.607	valid_0's l2: 142393
[638]	valid_0's l1: 148.608	valid_0's l2: 142309
[639]	valid_0's l1: 148.

In [42]:
# Persist the trained booster to disk.
print('Saving model...')
gbm.save_model('model.txt')

# Score the held-out split with the best iteration found during training.
print('Starting predicting...')
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
gbm_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', gbm_rmse)

Saving model...
Starting predicting...
The rmse of prediction is: 380.5928602298374


[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]

In [None]:
# II. hyper-parameter tuning:
## 9. CV / grid search (built-in k-fold)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# Fixed seed (same value as the train/test split) so the search is reproducible.
reg = RandomForestRegressor(random_state=9527)

# Candidate values for a wider search. Only max_features and min_samples_leaf
# are searched below; the full grid is kept as a comment for reference.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 1.0 (use all features) replaces the 'auto' option, which meant the same
# thing for regressors and is no longer accepted by current scikit-learn.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

random_grid = {'max_features': max_features,
               'min_samples_leaf': min_samples_leaf}

# GridSearchCV maximises the score, so MSE must be wrapped as a loss
# (greater_is_better=False); otherwise the worst parameters would be selected.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

grid_obj = GridSearchCV(reg, random_grid, scoring=mse_scorer)
grid_obj.fit(X_train, y_train)
# With the default refit=True, best_estimator_ has already been refitted on
# the whole training set, so no additional fit call is needed.
final_reg = grid_obj.best_estimator_
print(grid_obj.best_params_)


In [None]:
# Score the tuned random forest on the held-out split.
y_pred = final_reg.predict(X_test)
rf_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rf_rmse)

In [76]:
# Display the reviews table (the rendering is truncated to the first and last rows).
reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,29967,62788,2010-07-09,151260,Debbie,When I booked our stay in San Diego at Dennis ...
1,29967,64568,2010-07-14,141552,Eric,This was my first experience with using airbnb...
2,29967,67502,2010-07-22,141591,David,We found the house to be very accommodating--e...
3,29967,70466,2010-07-29,125982,Anders,As advertised and more. Dennis was very helpfu...
4,29967,74876,2010-08-07,29835,Miyoko,We had a great time in San Diego. Denis' house...
...,...,...,...,...,...,...
443411,46995597,715948454,2020-12-16,346790349,Mike,"This is a great place with modern design, extr..."
443412,46995597,716097352,2020-12-17,370055591,Hanna,Such a amazing place! It makes me feel like ho...
443413,46995597,716330508,2020-12-18,299951196,Trina,Booked this place last minute and was able to ...
443414,46995597,716579344,2020-12-19,74213915,Colby,We had a great time staying at this house. It ...


In [75]:
# All review comments written for listing 29967.
reviews.loc[reviews["listing_id"] == 29967, "comments"]

0     When I booked our stay in San Diego at Dennis ...
1     This was my first experience with using airbnb...
2     We found the house to be very accommodating--e...
3     As advertised and more. Dennis was very helpfu...
4     We had a great time in San Diego. Denis' house...
                            ...                        
57    Michael’s place is great, very good location c...
58    Michaels home is spacious and clean. Was the p...
59    A very good neighborhood and house. The landlo...
60    Michael/Dennies were great hosts! Responded ve...
61          Great Location and nice for a small family.
Name: comments, Length: 62, dtype: object

In [None]:
# TODO: planned features derived from `reviews` (not implemented yet).
# The two names below were bare placeholders and raised NameError when the
# notebook was run top to bottom:
#   - review_counts
#   - review_senmentic_counts  (sic; presumably sentiment-based counts, to be confirmed)
