## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.svm import SVR
import seaborn as sns
import re
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

## Extracting Seattle's data

In [None]:
# Read the calendar CSV into a DataFrame (path is relative to the notebook's working directory).
# NOTE(review): the file name is spelled 'Seatle' — confirm it matches the file on disk.
S_calendar_data = pd.read_csv('Seatle_calendar.csv')

In [None]:
# Read the listings CSV; this frame is the input of the whole cleaning pipeline below.
S_l_data = pd.read_csv('Seatle_listings.csv')

In [None]:
# Read the reviews CSV into a DataFrame.
# NOTE(review): not referenced again in the visible part of the notebook.
S_reviews_data = pd.read_csv('Seatle_reviews.csv')

## Cleaning the listings data

In [None]:
# Collect every column whose name contains '_url' and drop them in one call.
col_url = [name for name in S_l_data.columns if '_url' in name]

S_l_dropped2 = S_l_data.drop(columns=col_url)

**I am removing some features that obviously lead to nothing, such as #id and 'name'.**

**I have to explain some of the removals, such as 'street', 'zipcode', 'host_listings_count', 'latitude', 'neighbourhood_cleansed', neighbourhood_group_cleansed**

In [None]:
# Identifier, host and location columns removed before modelling.
# ('host_id' was listed twice in the original list; naming it once drops the same column.)
S_l_dropped3 = S_l_dropped2.drop(
    columns=[
        '#id', 'name', 'host_id', 'host_name', 'smart_location', 'state',
        'host_listings_count', 'street', 'host_location',
        'host_neighbourhood', 'neighbourhood_group_cleansed', 'neighbourhood_cleansed', 'city',
    ]
)

**I also need to transform the dates that are in string format to date format**

In [None]:
# Convert string columns that hold YYYY-MM-DD dates into datetime64 columns.
# The pattern requires digits so that ordinary hyphenated text is not mistaken for a date.
r = re.compile(r'\d{4}-\d{2}-\d{2}')

for col in S_l_dropped3.columns.tolist():
    if S_l_dropped3[col].dtypes != 'O':
        continue

    # Probe the first non-missing value. `Series.any()` is not a reliable way to get a
    # sample: depending on the pandas version it returns a bool or the first truthy
    # element (which may be NaN), and `re.match` then raises TypeError.
    present = S_l_dropped3[col].dropna()
    if present.empty:
        continue
    sample = present.iloc[0]
    if not (isinstance(sample, str) and r.match(sample)):
        continue

    try:
        S_l_dropped3[col] = pd.to_datetime(S_l_dropped3[col])
    except (ValueError, TypeError):
        # Same outcome as the deprecated errors='ignore': leave the column as it is
        # when some value cannot be parsed as a date.
        continue

**I will also drop free-text features that I cannot really evaluate with the knowledge I have. However, I will create a variable that records whether each feature was provided. For example, some listings do not provide the "Neighborhood overview", so I will drop the overview and create an indicator variable called "isthere_neighborhood_overview" that records whether it was provided. I do this with a manual list of columns, because I can't find a way to select these features systematically.**

In [None]:
col_to_drop2 = ['summary', 'space', 'neighborhood_overview', 'description', 'transit', 'notes', 'host_about']

# For each free-text column, keep only an indicator: 1 when the listing provides the
# text, 0 when it is missing. `notna()` is used (not `isna()`) so the value agrees with
# the 'isthere_' name; the result keeps the frame's index, so rows stay aligned.
for col in col_to_drop2:
    S_l_dropped3['isthere_' + col] = S_l_dropped3[col].notna().astype(int)

S_l_dropped4 = S_l_dropped3.drop(col_to_drop2, axis=1)

In [None]:
# Drop constant columns: a feature with a single distinct value (NaN counts as a value)
# carries no information.
col_to_drop = [
    name for name in S_l_dropped4.columns
    if S_l_dropped4[name].nunique(dropna=False) == 1
]

S_l_dropped4 = S_l_dropped4.drop(columns=col_to_drop)

**I'll remove the outliers here**

In [None]:
# Remove rows that lie above the 99th percentile of any continuous numeric feature.
OUTLIER_QUANTILE = 0.99

num_vars1 = S_l_dropped4.select_dtypes(include=['float', 'int']).columns

# Accumulate one mask over all columns. Re-filtering S_l_dropped4 inside the loop would
# keep only the filter of the last column and silently discard all the others.
keep_row = pd.Series(True, index=S_l_dropped4.index)
for col in num_vars1:
    # Indicator-like columns (at most two distinct values) have no meaningful tail.
    if S_l_dropped4[col].nunique(dropna=True) <= 2:
        continue
    q = S_l_dropped4[col].quantile(OUTLIER_QUANTILE)
    # `<=` keeps rows sitting exactly on the percentile (otherwise a capped or discrete
    # column loses every row at its maximum); missing values are kept as well.
    keep_row &= (S_l_dropped4[col] <= q) | S_l_dropped4[col].isna()

S_l_dropped5 = S_l_dropped4[keep_row].reset_index(drop=True)

**Now I transform the amenities and host verifications into several columns, with 1 for present and 0 for absent.**

In [None]:
# Strip the '{', '}' and '"' wrappers so that every cell is a plain comma-separated list.
S_l_dropped5['amenities'] = S_l_dropped5['amenities'].str.replace(r'[{}"]', '', regex=True)

# Sorted vocabulary of all amenity names found in the data.
amenity_tokens = S_l_dropped5['amenities'].str.split(',').explode().dropna()
all_am = np.unique(amenity_tokens.to_numpy(dtype=str))
# Listings without amenities produce an empty token. Remove it by value: dropping the
# first element by position would delete a real amenity whenever no empty token exists.
all_am = all_am[all_am != '']

In [None]:
# One 0/1 indicator column per amenity, named 'amen_<amenity>'.
# The comma-separated list is split into exact tokens. A substring/regex test
# (`str.contains`) would flag 'Internet' for 'Wireless Internet' and would never match
# names containing regex characters such as 'Dog(s)'.
amenity_dummies = (
    S_l_dropped5['amenities']
    .str.get_dummies(sep=',')
    .reindex(columns=all_am, fill_value=0)  # exactly the vocabulary, in the same order
    .add_prefix('amen_')
)
S_l_amen = pd.concat([S_l_dropped5, amenity_dummies], axis=1)
    
# Strip list punctuation ('[', ']', double and single quotes) and spaces so that every
# cell of 'host_verifications' becomes a plain comma-separated list. One vectorised
# pass replaces the per-row loop, reads and writes the same frame, and leaves missing
# values untouched instead of raising on them.
S_l_amen['host_verifications'] = S_l_amen['host_verifications'].str.replace(
    r"[\[\]\"' ]", '', regex=True
)

# Sorted vocabulary of all verification methods found in the data.
verification_tokens = S_l_amen['host_verifications'].str.split(',').explode().dropna()
all_ver = np.unique(verification_tokens.to_numpy(dtype=str))
# An empty list produces an empty token; remove it by value rather than by position.
# NOTE(review): a literal 'None' entry, if present, is kept as its own indicator
# ('verif_None'); exclude it here as well if that is not wanted.
all_ver = all_ver[all_ver != '']

# One 0/1 indicator column per verification method, named 'verif_<method>'. Exact
# tokens are matched, and the values are integers like the 'amen_' indicators.
verification_dummies = (
    S_l_amen['host_verifications']
    .str.get_dummies(sep=',')
    .reindex(columns=all_ver, fill_value=0)
    .add_prefix('verif_')
)
S_l_amenver = pd.concat([S_l_amen, verification_dummies], axis=1)

In [None]:
# The two raw list columns are now represented by indicator columns;
# 'isthere_host_about' is removed together with them.
S_l_amenver = S_l_amenver.drop(
    columns=['host_verifications', 'amenities', 'isthere_host_about']
)

**I also drop any amenity or host verification that fewer than 1% of the listings have (i.e. very rare features).**

In [None]:
# Display all column names to inspect the generated 'amen_' / 'verif_' indicators.
S_l_amenver.columns.tolist()

In [None]:
# An indicator is kept only when at least int(1% of the rows) listings have it.
min_listings = int(0.01 * S_l_amenver.shape[0])

col_to_drop3 = [
    'amen_' + val1
    for val1 in all_am
    if S_l_amenver['amen_' + val1].sum() < min_listings
]
S_l_amenver = S_l_amenver.drop(columns=col_to_drop3)

col_to_drop4 = [
    'verif_' + val1
    for val1 in all_ver
    if S_l_amenver['verif_' + val1].sum() < min_listings
]
S_l_amenver = S_l_amenver.drop(columns=col_to_drop4)

**I make the features with 't' and 'f' values into 1 and 0's.**

In [None]:
# Map 't' / 'f' flag columns to 1 / 0.
TF_TO_INT = {'t': 1, 'f': 0}

# A flag column is an object column whose non-missing values are all 't' or 'f'.
# (`('f' or 't') in ...` evaluates to `'f' in ...`, which skips columns that only
# contain 't' and missing values.)
col_tf = []
for col in S_l_amenver.select_dtypes(include=['object']).columns:
    observed = set(S_l_amenver[col].dropna().unique())
    if observed and observed <= set(TF_TO_INT):
        col_tf.append(col)

# `map` produces a numeric column directly; missing flags stay NaN.
for col in col_tf:
    S_l_amenver[col] = S_l_amenver[col].map(TF_TO_INT)

**Here I handle the dates, making them into a difference between dates (ends up as an integer)**

In [None]:
# Date columns -> whole days elapsed since the earliest date in the same column.
# `pd.to_datetime` is a no-op for columns that are already datetime64.
for date_col in ['host_since', 'first_review', 'last_review']:
    as_dates = pd.to_datetime(S_l_amenver[date_col])
    S_l_amenver[date_col] = (as_dates - as_dates.min()).dt.days

# 'calendar_updated' holds phrases such as '3 days ago', 'a week ago' or 'never'.
NEVER_UPDATED_DAYS = 2000  # placeholder age for calendars that were never updated
DAYS_PER_UNIT = {'day': 1, 'week': 7, 'month': 30}
AGO_PATTERN = re.compile(r'^(\d+|a|an)\s+(day|week|month)s?\s+ago$')


def calendar_updated_to_days(text):
    """Translate a 'calendar_updated' phrase into an approximate number of days.

    text: one cell of the 'calendar_updated' column.

    Returns a float number of days: 'today' -> 0, 'yesterday' -> 1,
    'N days/weeks/months ago' -> N * 1/7/30 ('a' or 'an' counts as 1),
    'never' -> NEVER_UPDATED_DAYS. Unrecognised phrases give NaN. Values that are
    not strings (already converted, or missing) are returned unchanged, so the
    conversion can be applied more than once.
    """
    if not isinstance(text, str):
        return text
    phrase = text.strip().lower()
    if phrase == 'today':
        return 0.0
    if phrase == 'yesterday':
        return 1.0
    if phrase == 'never':
        return float(NEVER_UPDATED_DAYS)
    found = AGO_PATTERN.match(phrase)
    if found is None:
        return np.nan
    # The count is parsed on its own; splicing a digit into the phrase turned
    # '3 days ago' into 31 days.
    count = 1 if found.group(1) in ('a', 'an') else int(found.group(1))
    return float(count * DAYS_PER_UNIT[found.group(2)])


# The explicit float cast matters: an object column would later be treated as a
# categorical feature and exploded into dummies.
S_l_amenver['calendar_updated'] = (
    S_l_amenver['calendar_updated'].map(calendar_updated_to_days).astype(float)
)

**There are some numbers that have '$' or '\%' that are seen as strings, so I fix this.**

In [None]:
# Numbers written as '$1,250.00' or '95%' are read as strings; convert them to floats.
CURRENCY_PATTERN = r'^\$[\d,]+(?:\.\d+)?$'
PERCENT_PATTERN = r'^[\d,]+(?:\.\d+)?%$'


def all_values_match(values, pattern):
    """Return True when every non-missing value of `values` is a string matching `pattern`.

    values: a pandas Series.
    pattern: regular expression, applied from the start of each string.

    A column with no non-missing values, or with any non-string value, gives False.
    """
    present = values.dropna()
    if present.empty or not present.map(lambda item: isinstance(item, str)).all():
        return False
    return bool(present.str.match(pattern).all())


# The whole column is checked before converting. `'$' in Series.any()` does not look
# at the column contents and raises TypeError when `any()` returns a bool or a float.
col_vars = S_l_amenver.select_dtypes(include=['object']).columns
for col in col_vars:
    if all_values_match(S_l_amenver[col], CURRENCY_PATTERN):
        S_l_amenver[col] = S_l_amenver[col].replace(r'[\$,]', '', regex=True).astype(float)

col_vars2 = S_l_amenver.select_dtypes(include=['object']).columns
for col in col_vars2:
    if all_values_match(S_l_amenver[col], PERCENT_PATTERN):
        S_l_amenver[col] = S_l_amenver[col].replace(r'[,%]', '', regex=True).astype(float)

**I define some functions here**

In [None]:
def drop_fill(df,y_data,drop_data):
    """
    Split a frame into features and target and fill the missing values.

    df: the pandas dataframe where the data comes from (it is not modified)
    y_data: the string of the dependent variable name
    drop_data: a list of strings with the features you would like to drop
               (the list is not modified)

    Outputs:
    X: pandas dataframe, the independent variables with missing values filled
    y: pandas series, the dependent variable with missing values filled
    num_vars: pandas Index with the names of the numerical variables of X.

    Missing values are filled with the mean for numerical data and with the mode
    (most frequent value) for categorical data. A categorical column without any
    value is left as it is, because it has no mode.
    """
    # Build a new list: appending to `drop_data` would change the caller's list and
    # make a second call with the same list try to drop the target twice.
    columns_to_drop = list(drop_data) + [y_data]
    X = df.drop(columns_to_drop, axis=1)

    # Work on a copy and assign the result. In-place filling on a selected column
    # either writes through to `df` or (with Copy-on-Write) fills nothing at all.
    y = df[y_data].copy()
    y = y.fillna(y.mean())

    num_vars = X.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        X[col] = X[col].fillna(X[col].mean())

    cat_vars = X.select_dtypes(include=['object']).columns
    for var in cat_vars:
        modes = X[var].mode()
        if not modes.empty:
            X[var] = X[var].fillna(modes.iloc[0])
    return X, y, num_vars

In [None]:
def make_dummies(df):
    """
    One-hot encode the object columns of a dataframe.

    df: pandas dataframe; it is not modified.

    Returns a new dataframe in which every object column is replaced by its dummy
    columns, named '<column>_<value>'. The first level of each column is dropped.
    The other columns come first, in their original order, followed by the dummy
    columns in the order of the encoded columns.
    """
    cat_vars = list(df.select_dtypes(include=['object', 'O']).columns)
    if not cat_vars:
        return df.copy()
    return pd.get_dummies(df, columns=cat_vars, prefix_sep='_', drop_first=True)

In [None]:
def _numeric_vif(frame):
    """Return the VIF of every float/int column of `frame` as a Series indexed by column name."""
    numeric = frame.select_dtypes(include=['float', 'int'])
    values = numeric.values
    return pd.Series(
        [variance_inflation_factor(values, i) for i in range(values.shape[1])],
        index=numeric.columns,
        dtype=float,
    )


def vif_dropping(X_data,VIF_lim):
    """
    Drop numerical features until no variance inflation factor exceeds VIF_lim.

    X_data: pandas dataframe, independent variables without NaN values
            (it is not modified).
    VIF_lim: the maximum VIF value allowed to exist.

    X_out: pandas dataframe without the features whose VIF was above the limit.

    The features are removed in stages: the threshold starts at 45 and is lowered by
    5 per pass, but never below VIF_lim. Only float/int columns are evaluated; all
    other columns are returned unchanged.
    """
    X_out = X_data.copy()
    vif_val = 50
    vif = _numeric_vif(X_out)

    while (vif > VIF_lim).any():
        # Never go below the requested limit, otherwise features that already
        # satisfy VIF_lim would be dropped too.
        vif_val = max(vif_val - 5, VIF_lim)
        too_collinear = vif.index[vif > vif_val]
        if len(too_collinear) > 0:
            X_out = X_out.drop(too_collinear, axis=1)
            # Recompute after the drop so that the loop condition sees the VIFs of
            # the remaining features instead of the values from before the drop.
            vif = _numeric_vif(X_out)

    return X_out

In [None]:
def LR_model(X_data,y_data,test_sizee, rdm_state, degree=3):
    """
    Fit a linear regression on standardised polynomial features.

    X_data: pandas dataframe with the independent variables
    y_data: pandas dataframe with the dependent variable
    test_sizee: size of the test sample
    rdm_state: random state for randomizer.
    degree: degree of the polynomial expansion (default 3). The number of generated
            columns grows very quickly with the number of features and the degree.

    Outputs:
    X_train: the transformed (scaled + polynomial) training features.
    X_test: the transformed test features.
    y_train: the training targets.
    y_test: the test targets.
    y_test_preds: the predicted targets for X_test, as a numpy array.
    """
    # Split first: the same random state selects the same rows as splitting after the
    # transformation, but the transformers below never see the test rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_sizee, random_state=rdm_state
    )

    # Fit the scaler on the training rows only; fitting it on all rows leaks the test
    # set's mean and variance into the model and makes the test score optimistic.
    scaler = StandardScaler()
    scaler.fit(X_train_raw)

    polynomial_features = PolynomialFeatures(degree=degree)
    X_train = polynomial_features.fit_transform(scaler.transform(X_train_raw))
    X_test = polynomial_features.transform(scaler.transform(X_test_raw))

    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)

    y_test_preds = lm_model.predict(X_test)

    return X_train, X_test, y_train, y_test, y_test_preds

In [None]:
# Second pass after encoding: drop every column that ended up with a single distinct
# value (NaN counts as a value).
col_to_drop = [
    name for name in S_l_amenver.columns
    if S_l_amenver[name].nunique(dropna=False) == 1
]

S_l_amenver = S_l_amenver.drop(columns=col_to_drop)

## Trying the Linear Model with sklearn

In [None]:
# Target is 'price'; 'weekly_price' and 'monthly_price' are dropped from the features.
# drop_fill also fills the missing values (mean for numerical, mode for categorical).
X,y,num_vars = drop_fill(S_l_amenver,'price',['weekly_price','monthly_price'])
# Replace the remaining object columns by dummy columns.
X = make_dummies(X)
# Optional multicollinearity filter, currently disabled.
# X = vif_dropping(X,10)

In [None]:
# 80/20 split with random state 42. X_train / X_test come back as the transformed
# (scaled + polynomial) feature matrices, not as the original columns.
X_train, X_test, y_train, y_test, y_test_preds = LR_model(X,y,0.2,42) 
# Last expression of the cell: shows the test R² and the number of test rows.
"The r-squared score for the model using all variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

In [None]:
# Display the names of all model features (after dummy encoding).
X.columns.tolist()

## Scaling the data

In [None]:
# Support vector regression with a sigmoid kernel, trained on the feature matrices
# returned by LR_model (already scaled and polynomially expanded).
clf = SVR(kernel='sigmoid',C=20,epsilon=0.1)
clf.fit(X_train, y_train)
# Last expression of the cell: the score of the fitted SVR on the test split.
clf.score(X_test,y_test)

In [None]:
# `normed_train_data` was used without being defined anywhere above, so this cell failed
# with NameError on a fresh kernel. It is built here as the standardised training
# features, with the column names kept.
# NOTE(review): reconstructed definition — confirm that this is the intended scaling.
# The split uses the same test size and random state as the LR_model call, so y_train
# and y_test contain the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

feature_scaler = StandardScaler().fit(X_train_raw)  # statistics from the training rows only
normed_train_data = pd.DataFrame(
    feature_scaler.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index
)
normed_test_data = pd.DataFrame(
    feature_scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index
)

# Print the name of every feature that still contains missing values (none expected).
for col in normed_train_data.columns.tolist():
    if normed_train_data[col].isna().any():
        print(col)

In [None]:
# Preview the first ten rows of the normalised training features.
normed_train_data.head(10)

In [None]:
BATCH_SIZE = 100
hidden_layer_size = 200
NUM_EPOCHS = 50
output_size = 1
input_size = normed_train_data.shape[1]  # one input per feature column

# Fix the TensorFlow seed so that weight initialisation and shuffling are repeatable.
tf.random.set_seed(42)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=[input_size]),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),

    # Linear output layer for regression. With a ReLU output, a unit that starts in
    # the negative range receives no gradient and the model keeps predicting 0.
    tf.keras.layers.Dense(output_size)
])

# Optimizer and loss function. `metrics` is what is reported at each epoch: mean
# absolute error is in the units of the target, whereas 'accuracy' is a
# classification metric and carries no meaning for a continuous target.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Stop when the validation loss has not improved for 5 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

model.fit(normed_train_data,
          y_train,
          batch_size = BATCH_SIZE,
          epochs = NUM_EPOCHS,
          callbacks = [early_stopping],
          validation_split = 0.2,
          verbose =2)