In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm


### Data Analysis Test Set 1

In [2]:
# Load the "test1" rents CSV straight from its URL; the file's first column
# is used as the DataFrame index.
test_set1_df = pd.read_csv(
    'https://grantmlong.com/data/SE_rents2018_test1.csv',
    index_col=0,
)

In [18]:
# Restrict the frame to numeric-dtype columns only; everything else
# (e.g. strings) is left out of the modeling frame.
numeric_df = test_set1_df.select_dtypes(include='number')

In [26]:
# Snapshot the numeric column labels as a plain Python list so it can be
# edited independently of the DataFrame.
columns = [*numeric_df.columns]

In [27]:
# Build the regressor list in a single expression rather than mutating
# `columns` in place. An in-place remove/append is not idempotent: running
# the cell a second time raises ValueError ('rent' is already gone), and any
# partial re-run appends 'const' more than once.
# Result: every numeric column except the target 'rent', in original order,
# followed by the name of the constant column, 'const'.
assert 'rent' in numeric_df.columns, "expected a numeric 'rent' target column"
columns = [col for col in numeric_df.columns if col != 'rent'] + ['const']

In [28]:
# Display the final regressor list (numeric columns without 'rent', plus 'const').
columns

['building_id',
 'bedrooms',
 'bathrooms',
 'size_sqft',
 'addr_zip',
 'addr_lat',
 'addr_lon',
 'bin',
 'bbl',
 'floor_count',
 'year_built',
 'min_to_subway',
 'has_doorman',
 'has_elevator',
 'has_fireplace',
 'has_dishwasher',
 'is_furnished',
 'has_gym',
 'allows_pets',
 'has_washer_dryer',
 'has_garage',
 'has_roofdeck',
 'has_concierge',
 'has_pool',
 'has_garden',
 'has_childrens_playroom',
 'no_fee',
 'floornumber',
 'const']

In [29]:
# Modeling frame for the test split: drop every row that has a missing value
# in any numeric column, then let statsmodels add the constant column that
# `columns` refers to as 'const'.
fixed_set = sm.add_constant(numeric_df.dropna(axis=0))

In [30]:
# Sanity check: number of complete-case rows available for the 'rent' target.
fixed_set['rent'].shape

(1696,)

In [31]:
# Sanity check: design-matrix dimensions (rows, regressors including 'const').
fixed_set[columns].shape

(1696, 29)

In [32]:
# Ordinary least squares of rent on every regressor in `columns`, fitted on
# the complete-case test-split frame. Regressors are cast to float so the
# design matrix has a uniform dtype.
# NOTE(review): `columns` contains identifier-like fields (building_id, bin,
# bbl, addr_zip) that enter here as continuous regressors — confirm intended.
est = sm.OLS(
    endog=fixed_set['rent'],
    exog=fixed_set[columns].astype(float),
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                   rent   R-squared:                       0.558
Model:                            OLS   Adj. R-squared:                  0.550
Method:                 Least Squares   F-statistic:                     75.02
Date:                Thu, 14 Nov 2019   Prob (F-statistic):          5.96e-271
Time:                        12:37:14   Log-Likelihood:                -15131.
No. Observations:                1696   AIC:                         3.032e+04
Df Residuals:                    1667   BIC:                         3.048e+04
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
building_id            -3.16

### Data Analysis Train Set 1

In [35]:
# Load the training rents CSV straight from its URL; the file's first column
# is used as the DataFrame index.
train_df = pd.read_csv(
    'https://grantmlong.com/data/SE_rents2018_train.csv',
    index_col=0,
)

In [38]:
# Restrict the training frame to numeric-dtype columns only.
train_numeric_df = train_df.select_dtypes(include='number')

In [39]:
# Modeling frame for the training split: drop every row that has a missing
# value in any numeric column, then add the constant ('const') column.
train_fixed_set = sm.add_constant(train_numeric_df.dropna(axis=0))

In [40]:
# Sanity check: number of complete-case training rows for the 'rent' target.
train_fixed_set['rent'].shape

(10221,)

In [41]:
# Sanity check: training design-matrix dimensions (rows, regressors incl. 'const').
# Indexing with `columns` also confirms the training frame has every regressor
# selected earlier; a missing label would raise KeyError here.
train_fixed_set[columns].shape

(10221, 29)

In [42]:
# Same OLS specification as the test-split fit, now on the complete-case
# training frame. `columns` is the list derived from the test-split frame.
# NOTE(review): this rebinds `est`, so the earlier test-split fit is no longer
# reachable by name after this cell runs.
est = sm.OLS(
    endog=train_fixed_set['rent'],
    exog=train_fixed_set[columns].astype(float),
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                   rent   R-squared:                       0.654
Model:                            OLS   Adj. R-squared:                  0.653
Method:                 Least Squares   F-statistic:                     687.4
Date:                Thu, 14 Nov 2019   Prob (F-statistic):               0.00
Time:                        12:46:56   Log-Likelihood:                -90314.
No. Observations:               10221   AIC:                         1.807e+05
Df Residuals:                   10192   BIC:                         1.809e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
building_id            -7.68