# Baseline Modeling

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [2]:
# Load the grouped dataset; the path is relative to the notebook's working directory.
groupdf_csv = './data/groupdf.csv'
groupdf = pd.read_csv(groupdf_csv)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
# (`groupdf.ty` is not a DataFrame attribute and raises AttributeError on a fresh run.)
groupdf.head()

Unnamed: 0,location,totalvalue,latitude_x,longitude_x,logvalue,review_count,latitude_y,longitude_y,log_reviews,price_1.0,...,type_Venues,type_Vietnamese,type_Vitaminssupplements,type_Waffles,type_Whiskeybars,type_Wine_Bars,type_Winetastingroom,type_Womenscloth,type_Wraps,type_Yoga
0,90001,292490.6,33.968543,-118.261693,12.564232,8588,6353.401556,-22112.494475,502.254649,135.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,90002,287087.7,33.946024,-118.250578,12.551515,1081,1459.928001,-5084.555034,98.1702,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,90003,297284.7,33.961248,-118.273066,12.573247,4701,4755.757869,-16558.215243,383.669881,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,90004,1024344.0,34.077047,-118.313083,13.743391,69031,7939.550715,-27565.505661,1115.333722,98.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,90005,1113551.0,34.058708,-118.319786,13.858325,106745,7901.772113,-27445.860659,1236.250662,73.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Setting the primary explanatory variables for our baseline model: the price, rating, and log-review columns, plus just the top 6 restaurant types.

In [6]:
# Baseline feature columns, grouped by the kind of column being selected.
price_cols = ['price_1.0', 'price_2.0', 'price_3.0', 'price_4.0']
rating_cols = [
    'rating_1.0', 'rating_2.0', 'rating_2.5', 'rating_3.0',
    'rating_3.5', 'rating_4.0', 'rating_4.5', 'rating_5.0',
]
type_cols = [
    'type_Mexican', 'type_Coffee', 'type_Pizza',
    'type_Hotdogs', 'type_Burgers', 'type_Bakeries',
]

# Column order: prices, ratings, restaurant types, then log_reviews.
X = groupdf[price_cols + rating_cols + type_cols + ['log_reviews']]

Setting our outcome variable as the log-transformed home values

In [7]:
# Target: the `logvalue` column of the grouped frame.
y = groupdf['logvalue']

Instantiating a linear regression model, splitting the data into train and test sets, and scaling the features.

In [10]:
# Hold out a test set (scikit-learn's default split size) with a fixed seed
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ordinary least squares baseline; it is fitted in a later cell.
lr = LinearRegression()

In [11]:
# Scaler object; it is fit on the training features in the next cell.
sc = StandardScaler()

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [13]:
# Fit the baseline regression on the scaled training features.
lr.fit(X=X_train_sc, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
# R^2 on the training split.
lr.score(X=X_train_sc, y=y_train)

0.694967392695512

In [15]:
# R^2 on the held-out test split.
lr.score(X=X_test_sc, y=y_test)

-0.0974639276197198

In [None]:
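So it seems like our baseline linear regression model is very overfit on the training set: the training R² is about 0.69, but the test R² is negative (about -0.10). Likely due in part to multicollinearity among the features, the model fails to generalize and predicts the held-out test set worse than simply predicting the mean log home value.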