## Importing libraries

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
import seaborn as sns
import re
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## Extracting Seatle's data

In [2]:
S_l_data = pd.read_csv('Seatle_listings.csv')

## Cleaning the listings data

In [3]:
# Removing url features
col_url = []
for col in S_l_data.columns.tolist():
    if '_url' in col:
        col_url.append(col)

S_l_dropped2 = S_l_data.drop(col_url, axis=1)

**I am removing some features that obviously lead to nothing, such as #id and 'name'.**

**I have to explain some of the removals, such as 'street', 'zipcode', 'host_listings_count', 'latitude', 'neighbourhood_cleansed', neighbourhood_group_cleansed**

In [4]:
S_l_dropped3 = S_l_dropped2.drop(['#id','name','host_id','host_name','host_id','smart_location','state', 
                                  'host_listings_count','street', 'host_location', 'neighbourhood',
                                  'host_neighbourhood', 'neighbourhood_group_cleansed','neighbourhood_cleansed','city'],axis=1)

**I also need to transform the dates that are in string format to date format**

In [5]:
r = re.compile('.{4}-.{2}-.{2}')

for col in S_l_dropped3.columns.tolist():
    if S_l_dropped3[col].dtypes == 'O':
        if r.match(S_l_dropped3[col].any()):
            S_l_dropped3[col] = pd.to_datetime(S_l_dropped3[col], errors='ignore')
        else:
            continue

**I will also drop features that I cannot really evaluate considering the knowledge I have. However I will create a variable to check if the feature is offered or not. For example, there are some listings that do not inform the "Neighborhood overview", So i will drop the overview and create a boolean variable called "neighborhood_overview_given" to indicate if it was informed. This I do manually, because I can't find a way to do it systematically for the different features.**

In [6]:
# Removing features which do no vary.
col_to_drop = []
for col in S_l_dropped3.columns.tolist():
    if len(S_l_dropped3[col].unique())==1:
        col_to_drop.append(col)

S_l_dropped4 = S_l_dropped3.drop(col_to_drop, axis=1)

**I'll remove the outliers here**

In [7]:
num_vars1 =  S_l_dropped4.select_dtypes(include=['float', 'int']).columns
for col in num_vars1:
    q = S_l_dropped4[col].quantile(0.99)
    S_l_dropped5 = S_l_dropped4[S_l_dropped4[col]<q]
S_l_amenver = S_l_dropped5.reset_index(drop=True)

In [8]:
S_l_amenver['host_since'] = (S_l_amenver['host_since']-S_l_amenver['host_since'].min()).dt.days
S_l_amenver['first_review'] = (S_l_amenver['first_review']-S_l_amenver['first_review'].min()).dt.days
S_l_amenver['last_review'] = (S_l_amenver['last_review']-S_l_amenver['last_review'].min()).dt.days

for i in range(S_l_amenver.shape[0]):
    if 'weeks ago' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace(' weeks ago', '')) * 7
    elif 'a week ago' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace('a week ago', '1')) * 7
    elif '1 week ago' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace('1 week ago', '1')) * 7
    elif 'yesterday' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace('yesterday', '1')) * 1
    elif 'days ago' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace(' days ago', '1')) * 1
    elif 'today' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace('today', '1')) * 1
    elif 'never' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace('never', '1')) * 2000
    elif 'months ago' in S_l_amenver['calendar_updated'][i]:
        S_l_amenver.loc[i,'calendar_updated'] = float(S_l_amenver['calendar_updated'][i].replace(' months ago', '')) * 30

**There are some numbers that have '$' or '\%' that are seen as strings, so I fix this.**

In [9]:
col_vars = S_l_amenver.select_dtypes(include=['object', 'O']).copy().columns
for col in col_vars:
    if '$' in S_l_amenver[col].any():
        S_l_amenver[col] = S_l_amenver[col].replace('[\$,]', '', regex=True).astype(float)

col_vars2 = S_l_amenver.select_dtypes(include=['object', 'O']).copy().columns
for col in col_vars2:
    if '%' in S_l_amenver[col].any():
        S_l_amenver[col] = S_l_amenver[col].replace('[,\%]','',regex=True).astype(float)

**I'll remove any categorical variables**

In [10]:
cat_vars0 = S_l_amenver.select_dtypes(include=['object', 'O']).copy().columns
S_l_amenver = S_l_amenver.drop(cat_vars0,axis=1)

**I define some functions here**

In [11]:
def drop_fill(df,y_data,drop_data):
    """
    df: the pandas dataframe from where the data comes from
    y_data: the string of the dependent variable name
    drop: A list of strings with the features you would like to drop
    
    Outputs:
    X: pandas data without NaN, the independent variables
    y: pandas data without Nan, the dependent variable
    num_vars: list of strings with numerical variables.
    
    This function fills missing values with means for numerical, and mode for categorical data.
    """
    drop_data.append(y_data)
    X = df.drop(drop_data, axis=1)
    y = df[y_data]
    y.fillna((y.mean()), inplace=True)
    
    num_vars = X.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        X[col].fillna((X[col].mean()), inplace=True)
    
    cat_vars = X.select_dtypes(include=['object', 'O']).copy().columns
    for var in cat_vars:
        X[var].fillna((X[var].mode()[0]), inplace=True)
    return X, y, num_vars

In [12]:
def make_dummies(df):
    cat_vars = df.select_dtypes(include=['object', 'O']).copy().columns
    for var in cat_vars:
        # for each cat add dummy var, drop original column
        df = pd.concat([df.drop(var, axis=1), pd.get_dummies(df[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
    df_2= df.copy()
    return df_2

In [13]:
def vif_dropping(X_data,VIF_lim):
    """
    X_data: pandas data, independent variables without num values.
    VIF_lim: the maximum VIF value allowed to exist
    
    X_out: pandas dataframe without features VIF values above VIF_lim.
    """
    num_vars2 = X_data.select_dtypes(include=['float', 'int']).columns
    variables = X_data[num_vars2]
    vif = pd.DataFrame()
    vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
    vif_val=50
    
    while len(vif[vif['VIF']>VIF_lim]) != 0:
        num_vars2 = X_data.select_dtypes(include=['float', 'int']).columns
        vif_val=vif_val-5
        variables = X_data[num_vars2]
        vif = pd.DataFrame()
        vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
        vif["Features"] = variables.columns
        VIF_clean=vif[vif["VIF"]>vif_val].Features
        X_data = X_data.drop(VIF_clean,axis=1)
        
    X_out = X_data.copy()
    return X_out

In [86]:
def LR_model(X_data,y_data,test_sizee, rdm_state):
    """
    X_data: pandas dataframe with the independent variables
    y_data: pandas dataframe with the dependent variable
    test_sizee: size of the test sample
    rdm_state: random state for randomizer.
    
    y_test: the test targets, as a numpy array.
    y_test_preds: the predicted targets, as a numpy array.
    """
    scaler = StandardScaler()
    X_scaled = X_data.copy()
    scaler.fit(X_scaled)
    scaler.transform(X_scaled)
    
    X_columns = X_data.columns.tolist()
    X_train, X_test, y_train, y_test = train_test_split(scaler.transform(X_scaled), y_data, test_size = test_sizee, random_state=rdm_state) 
    lm_model = LinearRegression() 
    lm_model.fit(X_train, y_train) 

    y_test_preds = lm_model.predict(X_test) 
    
    return X_columns, X_train, X_test, y_train, y_test, y_test_preds

In [28]:
# Removing features which do no vary.
col_to_drop = []
for col in S_l_amenver.columns.tolist():
    if len(S_l_amenver[col].unique())==1:
        col_to_drop.append(col)

S_l_amenver = S_l_amenver.drop(col_to_drop, axis=1)

## Trying the Linear Model with sklearn

In [16]:
X,y,num_vars = drop_fill(S_l_amenver,'price',['weekly_price','monthly_price'])
#X = make_dummies(X)
# X = vif_dropping(X,10)

In [87]:
X_columns, X_train, X_test, y_train, y_test, y_test_preds = LR_model(X,y,0.2,26) 
"The r-squared score for the model using all variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for the model using all variables was 0.6409450363698318 on 632 values.'

In [33]:
S_l_amenver.shape

(3159, 37)

In [76]:
clf = SVR(kernel='linear',C=1,epsilon=0.1)
clf.fit(X_train, y_train)
clf.score(X_test,y_test)

0.6257384634552523

In [81]:
regr = AdaBoostRegressor(random_state=0)
regr.fit(X_train, y_train)
y_ada_pred = regr.predict(X_test)
regr.score(y_test,y_ada_pred)

ValueError: Expected 2D array, got 1D array instead:
array=[110. 100. 175.  85. 127. 125. 129.  51. 245.  50.  69.  99. 149.  83.
  31. 105.  70.  47. 110. 142.  90.  90.  70.  75. 100.  62. 122.  55.
  85. 150.  65.  75. 120.  80. 150. 325.  69.  60. 159.  55. 155.  65.
 195. 115.  58. 110.  75. 100. 115. 135. 119. 179. 150.  98. 498. 149.
 110. 125. 115. 175. 100. 119. 135. 175.  75.  53. 150. 200. 105.  75.
  60. 175. 150. 120. 126. 120. 129.  99. 350.  45.  90. 465.  69.  99.
 125. 150.  78. 100. 149.  90. 197. 150.  58. 100. 104.  99. 399.  90.
 149. 110. 405. 175.  99. 199. 109.  79. 155. 700.  55.  95.  97.  65.
 115.  95. 175. 110.  70. 160. 126.  80. 320.  80. 450. 150.  68.  99.
 109. 100.  89. 105.  88. 250.  65.  99.  65.  40.  60.  67. 135.  75.
 175.  65. 130.  53.  65.  93.  75. 115.  85.  55. 139. 135. 375. 115.
  40.  90. 224. 130. 140.  65. 149. 195.  84.  99. 225.  70. 285. 200.
 250.  69.  85. 300. 125.  40. 129.  65. 115. 225. 129. 155. 100.  40.
  95. 110. 100. 150. 170.  40. 225.  92.  75.  40. 250. 158.  99.  95.
 115. 175.  78.  69.  45. 250.  95. 100. 159. 120.  81.  45.  39. 114.
  40. 200. 100. 130. 110. 245. 240.  99.  65. 174. 175. 135.  90. 175.
  65. 239.  63. 190.  52.  85. 190. 117.  65. 125.  75. 159. 125.  55.
 119.  68. 115. 100.  70. 175. 210.  29. 129. 160. 125. 130.  89.  50.
  95. 350.  75.  95.  35. 120. 350.  55.  91. 200. 275.  98.  90. 125.
  90. 229. 150.  65. 170. 110.  41. 100. 275. 145. 115.  80. 115. 105.
 195.  99.  64. 100. 239.  90. 108.  50.  68.  99. 175.  29. 126. 125.
  80. 150.  75. 195.  68. 120.  58. 168.  95. 300. 110.  37. 149.  80.
 100.  80.  42. 125.  70. 349.  43. 183.  39. 100. 119.  80.  88.  95.
 140. 375. 149. 137.  47. 145.  80. 139.  70. 145. 130. 109.  39. 157.
 110.  80.  85.  60. 130. 200.  49. 156. 150.  40.  50.  85.  59. 165.
 150. 170.  95. 115.  88. 300.  49.  50. 166. 125.  65. 139.  49.  85.
 110.  28. 125. 109. 150.  65. 399. 120. 100. 140. 120. 129.  75.  80.
  55. 140. 139.  85. 280. 115.  95. 225. 220.  90. 200.  90.  65.  75.
  75.  85.  90. 115. 257.  98. 165. 175.  90.  60. 165. 175. 175.  89.
  95. 215.  68.  75. 175. 150. 115. 170. 139.  95. 150.  45. 178.  85.
 119. 120.  69. 175. 115.  68.  75.  75. 105.  75.  45. 125. 115.  75.
 125.  52.  35. 250. 119. 100. 199.  85.  60. 110. 129.  87.  69. 250.
  65. 150.  99.  75. 130. 135.  66.  50. 162.  50.  85. 250.  50. 200.
  57. 145. 150.  80. 195.  70. 139. 240.  59.  95.  45. 209. 100. 108.
  56.  80. 225. 195. 100.  49. 219.  95.  59. 180. 445.  90.  45.  30.
  85. 120. 137. 141.  50. 250. 335. 159. 109. 350. 115. 110.  89. 118.
 149. 129. 151. 360. 170.  96.  90. 175. 200. 149. 125.  50. 129.  90.
 338.  97.  65. 330.  55. 100.  49.  41. 150. 550. 175. 175. 169. 175.
  65. 150.  86.  53. 500. 129. 325.  55.  45. 300. 155. 245. 439. 115.
 119.  40. 150. 200.  95. 215.  75.  75. 108. 136.  59.  75.  65.  90.
 120.  40. 175. 129.  55. 117.  95. 110.  65. 174. 150. 200.  99.  99.
  55.  90. 117. 105. 100. 200.  90.  80.  68. 144.  85.  85. 111.  95.
 150.  38. 110. 350. 150.  70. 105.  71.  49. 100.  37. 215.  40. 120.
 325. 139.  80.  70. 139. 119.  38. 132. 110.  85. 135. 129.  80. 300.
 120. 250.  69.  62. 120.  66.  69.  35. 119.  90.  39.  65. 100. 100.
  40.  65.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
for col in normed_train_data.columns.tolist():
    if normed_train_data[col].isna().any():
        print(col)

In [93]:
X_train2= pd.DataFrame(data=X_train,columns=X_columns)
X_train2

Unnamed: 0,host_since,host_response_rate,host_acceptance_rate,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month
0,1.327065,4.485599e-01,2.103086e-02,-0.230074,2.021838,-1.062650,0.303695,-0.438445,-0.353323,0.220012,...,0.497284,0.828255,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.333607,0.995673
1,1.262966,-1.881057e+00,2.103086e-02,-0.230074,-0.968795,-1.490504,1.305126,2.080944,0.785765,1.084461,...,-0.449784,0.374495,0.525314,-0.696443,0.360738,0.337869,0.623998,0.733511,-0.333607,-0.426506
2,1.223794,1.324234e-15,7.988703e-15,-0.116083,0.754214,-0.325927,-1.198452,-0.438445,-0.353323,-0.644437,...,-0.290464,0.828255,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.333607,-1.027511
3,0.958496,2.621906e-01,2.103086e-02,0.529862,0.879789,0.524697,-1.198452,-0.438445,-0.353323,-0.644437,...,0.576944,-0.835531,-0.909779,-0.696443,-1.320915,-1.421632,-0.969589,-0.602085,3.019978,1.781144
4,0.518706,1.324234e-15,7.988703e-15,-0.230074,-0.363930,0.295008,-0.697737,-0.438445,-1.492411,-0.644437,...,0.045878,-2.499317,-0.909779,-1.952605,-1.320915,-3.181132,-0.969589,-1.937680,-0.333607,-0.075424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522,1.473068,-4.832867e-01,2.103086e-02,-0.230074,-1.514035,-0.934744,-0.697737,-0.438445,-0.353323,-0.644437,...,0.497284,0.828255,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.333607,-0.004018
2523,1.341309,4.485599e-01,2.103086e-02,-0.230074,-1.283435,-2.598542,-0.697737,-0.438445,-0.353323,-0.644437,...,0.630051,0.828255,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.333607,-0.099227
2524,1.019034,4.485599e-01,2.103086e-02,-0.192077,-2.538499,-0.849491,-0.197021,-0.438445,-0.353323,-0.644437,...,0.231751,0.828255,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.165928,-0.670478
2525,0.053988,4.485599e-01,2.103086e-02,-0.230074,-0.514914,0.222120,-0.697737,-0.438445,-0.353323,-0.644437,...,0.523838,0.374495,0.525314,0.559720,0.360738,0.337869,0.623998,0.733511,-0.333607,2.471406


In [92]:
BATCH_SIZE = 100
hidden_layer_size = 200
NUM_EPOCHS = 50
output_size = 1

model = tf.keras.Sequential([    
    tf.keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=[len(X_train2.keys())]),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    
    tf.keras.layers.Dense(output_size, activation='relu') # output layer
])

""" 
Now we have to choose the optimizer and the loss function.

metrics will show us what we are interested in obtaining at each iteration.
"""
model.compile(optimizer='adam', loss='mse',metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

model.fit(X_train2, 
          y_train, 
          batch_size = BATCH_SIZE, 
          epochs = NUM_EPOCHS, 
          callbacks = [early_stopping],
          validation_split = 0.2,
          verbose =2)

Train on 2021 samples, validate on 506 samples
Epoch 1/50
2021/2021 - 0s - loss: 24252.8223 - accuracy: 0.0000e+00 - val_loss: 21093.4818 - val_accuracy: 0.0000e+00
Epoch 2/50
2021/2021 - 0s - loss: 19884.8513 - accuracy: 0.0000e+00 - val_loss: 15397.0534 - val_accuracy: 0.0000e+00
Epoch 3/50
2021/2021 - 0s - loss: 12151.2607 - accuracy: 0.0000e+00 - val_loss: 7610.4523 - val_accuracy: 0.0000e+00
Epoch 4/50
2021/2021 - 0s - loss: 5842.0853 - accuracy: 0.0000e+00 - val_loss: 5336.3702 - val_accuracy: 0.0000e+00
Epoch 5/50
2021/2021 - 0s - loss: 5004.6235 - accuracy: 0.0000e+00 - val_loss: 5326.7561 - val_accuracy: 0.0000e+00
Epoch 6/50
2021/2021 - 0s - loss: 4723.8133 - accuracy: 0.0000e+00 - val_loss: 5216.8753 - val_accuracy: 0.0000e+00
Epoch 7/50
2021/2021 - 0s - loss: 4534.7712 - accuracy: 0.0000e+00 - val_loss: 5268.5714 - val_accuracy: 0.0000e+00
Epoch 8/50
2021/2021 - 0s - loss: 4403.1729 - accuracy: 0.0000e+00 - val_loss: 5316.6331 - val_accuracy: 0.0000e+00
Epoch 9/50
2021/2021

<tensorflow.python.keras.callbacks.History at 0x18790f99508>