In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

X = pd.read_csv('model_X.csv')
X.head()

Unnamed: 0,host_is_superhost,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,review_scores_rating,...,property_type_Townhouse,property_type_Treehouse,property_type_Villa,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,annual_booked
0,0.0,6.0,2,1.0,1.0,1.0,1,1,730,98.0,...,0,0,0,1,0,0,0,0,1,5.0
1,0.0,5.0,2,1.0,0.0,1.0,2,1,1125,95.0,...,0,0,0,0,0,0,0,0,1,81.0
2,0.0,1.0,2,1.0,1.0,1.0,2,3,7,93.939368,...,0,0,0,1,0,0,0,1,0,0.0
3,0.0,1.0,3,1.0,1.0,4.0,1,1,730,90.0,...,0,0,0,0,0,0,0,0,1,299.0
4,0.0,1.0,2,1.0,1.0,2.0,1,4,90,89.0,...,0,0,0,0,0,0,0,0,1,104.0


In [34]:
y = pd.read_csv('model_y.csv', header=None, squeeze=True)
y.head()

0     2.041096
1    49.931507
2     0.000000
3    72.906849
4    29.917808
Name: 0, dtype: float64

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [49]:
# Function to run Lasso model

def runLasso(alpha=1.0):
    # Instantiate & train
    lasso_reg = Lasso(alpha=alpha)
    lasso_reg.fit(X_train, y_train)

    # Predict testing data
    pred_train = lasso_reg.predict(X_train)
    pred_test = lasso_reg.predict(X_test)

    # Score
    train_scoreL = lasso_reg.score(X_train,y_train)
    test_scoreL = lasso_reg.score(X_test,y_test)
    coeff_used = np.sum(lasso_reg.coef_!=0)
    
    print("Lasso Score (" + str(alpha) + "):")
    print(train_scoreL)
    print(test_scoreL)
    print('-------')
    print("Coefficients Used:")
    print(coeff_used)

runLasso()

Lasso Score (1.0):
0.23223211976073035
0.18862779654313222
-------
Coefficients Used:
20


In [51]:
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]

for i in range(10):
    runLasso(alpha_ridge[i])



Lasso Score (1e-15):
0.2559366327386795
0.20189974847694037
-------
Coefficients Used:
279




Lasso Score (1e-10):
0.2559366327390432
0.20189974871655747
-------
Coefficients Used:
279




Lasso Score (1e-08):
0.2559366327750352
0.20189977241934653
-------
Coefficients Used:
279




Lasso Score (0.0001):
0.25593536548472673
0.2019226658696961
-------
Coefficients Used:
276




Lasso Score (0.001):
0.25587513878367596
0.20203614035738615
-------
Coefficients Used:
241
Lasso Score (0.01):
0.25495352910683877
0.20238889968783336
-------
Coefficients Used:
124
Lasso Score (1):
0.23223211976073035
0.18862779654313222
-------
Coefficients Used:
20
Lasso Score (5):
0.20547594338644537
0.16504046703963937
-------
Coefficients Used:
12
Lasso Score (10):
0.19236671314805076
0.15451447757549708
-------
Coefficients Used:
8
Lasso Score (20):
0.18615436751503778
0.14947038592763684
-------
Coefficients Used:
8


In [None]:
# The Lasso linear model does not seem to surpass the simple Linear Regression model trained (see "Airbnb NYC Data Exploration" notebook for details)
# Therefore, we will discount this as a superior modelling assumption

In [43]:
# Function to run SGD model

def runSGD():
    # Instantiate & train
    sgd_reg = SGDRegressor(loss="squared_loss", penalty=None)
    sgd_reg.fit(X_train, y_train)

    # Predict testing data
    pred_train = sgd_reg.predict(X_train)
    pred_test = sgd_reg.predict(X_test)

    # Score
    train_score = sgd_reg.score(X_train,y_train)
    test_score = sgd_reg.score(X_test,y_test)
    coeff_used = np.sum(sgd_reg.coef_!=0)
    
    print("SGD Score:")
    print(train_score)
    print(test_score)
    print('-------')
    print("Coefficients Used:")
    print(coeff_used)

runSGD()



SGD Score:
-1.4613078888376525e+45
-5.580117307357575e+36
-------
Coefficients Used:
279


In [47]:
# Function to run Ridge model

def runRidge(alpha=1.0):
    # Instantiate & train
    rid_reg = Ridge(alpha=alpha, normalize=True)
    rid_reg.fit(X_train, y_train)

    # Predict testing data
    pred_train = rid_reg.predict(X_train)
    pred_test = rid_reg.predict(X_test)

    # Score
    train_score = rid_reg.score(X_train,y_train)
    test_score = rid_reg.score(X_test,y_test)
    coeff_used = np.sum(rid_reg.coef_!=0)
    
    print("SGD Score (" + str(alpha) + "):")
    print(train_score)
    print(test_score)
    print('-------')
    print("Coefficients Used:")
    print(coeff_used)

runRidge()

SGD Score (1.0):
0.20902323915862486
0.16550505004900973
-------
Coefficients Used:
279


In [48]:
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]

for i in range(10):
    runRidge(alpha_ridge[i])

  overwrite_a=True).T


SGD Score (1e-15):
0.2559366870637817
0.20189484648856693
-------
Coefficients Used:
279
SGD Score (1e-10):
0.2559366870637817
0.20190767294147383
-------
Coefficients Used:
279
SGD Score (1e-08):
0.2559366870637807
0.20190767275041943
-------
Coefficients Used:
279
SGD Score (0.0001):
0.25593659625822296
0.20190549320829965
-------
Coefficients Used:
279
SGD Score (0.001):
0.2559322297145006
0.20188920584965975
-------
Coefficients Used:
279
SGD Score (0.01):
0.255889202023326
0.20184044553149974
-------
Coefficients Used:
279
SGD Score (1):
0.20902323915862486
0.16550505004900973
-------
Coefficients Used:
279
SGD Score (5):
0.11072430763295538
0.08812228834875402
-------
Coefficients Used:
279
SGD Score (10):
0.0704876562860427
0.05617677247752917
-------
Coefficients Used:
279
SGD Score (20):
0.040971903570563684
0.0326704448518893
-------
Coefficients Used:
279


In [54]:
# Function to run Decision Trees

from sklearn.tree import DecisionTreeRegressor

def runTree(max_depth=None, min_samples_leaf=1, min_samples_split=2):
    # Instantiate & train
    tree_reg = DecisionTreeRegressor(criterion='mse', max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    tree_reg.fit(X_train, y_train)

    # Predict testing data
    pred_train = tree_reg.predict(X_train)
    pred_test = tree_reg.predict(X_test)

    # Score
    train_score = tree_reg.score(X_train,y_train)
    test_score = tree_reg.score(X_test,y_test)
    
    print("Tree Score (" + str(max_depth) + ', ' + str(min_samples_leaf) + ', ' + str(min_samples_split) + "):")
    print(train_score)
    print(test_score)
    print('-------')

runTree()

Tree Score (None, 1, 2):
0.9878965346502062
-0.02006085203206731
-------


In [64]:
depths = [2, 5, 6, 7, 8]
for dep in depths:
    runTree(dep)

Tree Score (2, 1, 2):
0.15479934586567123
0.1218326320202835
-------
Tree Score (5, 1, 2):
0.47442948875736324
0.038182650899466486
-------
Tree Score (6, 1, 2):
0.525310295933926
0.15085905556420842
-------
Tree Score (7, 1, 2):
0.5638299452344031
0.17190299312596047
-------
Tree Score (8, 1, 2):
0.6307552187994618
0.009473675913360236
-------


In [66]:
min_leafs = [2, 4, 6, 8, 10, 12, 14, 16]
for lfs in min_leafs:
    runTree(7, lfs)

Tree Score (7, 2, 2):
0.44084600423305964
0.14797983522226277
-------
Tree Score (7, 4, 2):
0.3857547145581244
0.17717393832354023
-------
Tree Score (7, 6, 2):
0.3421063058328305
0.19366673593944272
-------
Tree Score (7, 8, 2):
0.33583295299107185
0.1936338179864182
-------
Tree Score (7, 10, 2):
0.3438241646559581
0.20280465059455755
-------
Tree Score (7, 12, 2):
0.32942818941899776
0.2207759709711551
-------
Tree Score (7, 14, 2):
0.328381537086362
0.22152969537300318
-------
Tree Score (7, 16, 2):
0.32610147391348787
0.22092665015994195
-------


In [62]:
min_splits = [2, 4, 6, 8, 10]
for splt in min_splits:
    runTree(7, 14, splt)

Tree Score (7, 14, 2):
0.328381537086362
0.22152969537300318
-------
Tree Score (7, 14, 4):
0.328381537086362
0.22152969537300318
-------
Tree Score (7, 14, 6):
0.328381537086362
0.22152969537300318
-------
Tree Score (7, 14, 8):
0.3283815370863622
0.22152969537300318
-------
Tree Score (7, 14, 10):
0.328381537086362
0.22152969537300318
-------


In [67]:
for dep in depths:
    for lfs in min_leafs:
        runTree(dep, lfs)

Tree Score (2, 2, 2):
0.15479934586567123
0.12183263202028384
-------
Tree Score (2, 4, 2):
0.15479934586567112
0.12183263202028359
-------
Tree Score (2, 6, 2):
0.15479934586567123
0.12183263202028384
-------
Tree Score (2, 8, 2):
0.15479934586567112
0.12183263202028359
-------
Tree Score (2, 10, 2):
0.15479934586567123
0.12183263202028384
-------
Tree Score (2, 12, 2):
0.15479934586567123
0.12183263202028384
-------
Tree Score (2, 14, 2):
0.15479934586567123
0.12183263202028359
-------
Tree Score (2, 16, 2):
0.15479934586567112
0.12183263202028359
-------
Tree Score (5, 2, 2):
0.36661942537925574
0.11296665882072023
-------
Tree Score (5, 4, 2):
0.31980895350015925
0.17135495073230245
-------
Tree Score (5, 6, 2):
0.2850110739746402
0.17355259300237758
-------
Tree Score (5, 8, 2):
0.2832835369825508
0.17283090183811523
-------
Tree Score (5, 10, 2):
0.2863249780699516
0.19189751655400722
-------
Tree Score (5, 12, 2):
0.276252339447694
0.20328371328514327
-------
Tree Score (5, 14, 

In [68]:
tree_reg = DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=16, min_samples_split=2)
tree_reg.fit(X_train, y_train)

# Predict testing data
pred_train = tree_reg.predict(X_train)
pred_test = tree_reg.predict(X_test)

In [69]:
# Import prediction input
df_nei_Manhattan_EV = pd.read_csv('./data/input/pred_input_Manhattan_EV.csv')
df_nei_Manhattan_HA = pd.read_csv('./data/input/pred_input_Manhattan_HA.csv')
df_nei_Manhattan_HK = pd.read_csv('./data/input/pred_input_Manhattan_HK.csv')
df_nei_Manhattan_UWS = pd.read_csv('./data/input/pred_input_Manhattan_UWS.csv')

df_nei_Brooklyn_BS = pd.read_csv('./data/input/pred_input_Brooklyn_BS.csv')
df_nei_Brooklyn_BU = pd.read_csv('./data/input/pred_input_Brooklyn_BU.csv')
df_nei_Brooklyn_WI = pd.read_csv('./data/input/pred_input_Brooklyn_WI.csv')

df_nei_Queens_AS = pd.read_csv('./data/input/pred_input_Queens_AS.csv')
df_nei_Queens_LI = pd.read_csv('./data/input/pred_input_Queens_LI.csv')

avgRev_Manhattan_EV = round(tree_reg.predict(df_nei_Manhattan_EV)[0],2)
avgRev_Manhattan_HA = round(tree_reg.predict(df_nei_Manhattan_HA)[0],2)
avgRev_Manhattan_HK = round(tree_reg.predict(df_nei_Manhattan_HK)[0],2)
avgRev_Manhattan_UWS = round(tree_reg.predict(df_nei_Manhattan_UWS)[0],2)

avgRev_Brooklyn_BS = round(tree_reg.predict(df_nei_Brooklyn_BS)[0],2)
avgRev_Brooklyn_BU = round(tree_reg.predict(df_nei_Brooklyn_BU)[0],2)
avgRev_Brooklyn_WI = round(tree_reg.predict(df_nei_Brooklyn_WI)[0],2)

avgRev_Queens_AS = round(tree_reg.predict(df_nei_Queens_AS)[0],2)
avgRev_Queens_LI = round(tree_reg.predict(df_nei_Queens_LI)[0],2)

print("--------Manhattan---------")
print(avgRev_Manhattan_EV)
print(avgRev_Manhattan_HA)
print(avgRev_Manhattan_HK)
print(avgRev_Manhattan_UWS)
print("")
print("--------Brooklyn---------")
print(avgRev_Brooklyn_BS)
print(avgRev_Brooklyn_BU)
print(avgRev_Brooklyn_WI)
print("")
print("--------Queens---------")
print(avgRev_Queens_AS)
print(avgRev_Queens_LI)

--------Manhattan---------
44.15
44.15
44.15
44.15

--------Brooklyn---------
30.37
30.37
30.37

--------Queens---------
30.37
30.37


In [70]:
# Import prediction input
df_nei_1_1 = pd.read_csv('./data/input/pred_input_Manhattan_EV.csv')
df_nei_2_2 = pd.read_csv('./data/input/pred_input_Manhattan_EV_2bed_2_bath.csv')
df_nei_2_1 = pd.read_csv('./data/input/pred_input_Manhattan_EV_2bed_1_bath.csv')

avgRev_1_1 = tree_reg.predict(df_nei_1_1)[0]
avgRev_2_2 = tree_reg.predict(df_nei_2_2)[0]
avgRev_2_1 = tree_reg.predict(df_nei_2_1)[0]

print(round(avgRev_1_1,2))
print(round(avgRev_2_2,2))
print(round(avgRev_2_1,2))

44.15
44.15
44.15
