In [3]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm

# %matplotlib inline

## Data Description
1. Training Set
    - 12K house listings posted on StreetEasy in May, June, and July of 2018
2. Test Set 1
    - House listings posted in August 2018, with observed rent
    - 2,000 samples
3. Test Set 2 and Test Set 3
    - House listings posted in August 2018, without observed rent
    - 2,000 samples

In [4]:
# Fetch the StreetEasy 2018 rental CSVs; column 0 of each file becomes the row index.
train_df = pd.read_csv(
    'https://grantmlong.com/data/SE_rents2018_train.csv',
    index_col=0,
)
test_set1_df = pd.read_csv(
    'https://grantmlong.com/data/SE_rents2018_test1.csv',
    index_col=0,
)
test_set2_df = pd.read_csv(
    'https://grantmlong.com/data/SE_rents2018_test2.csv',
    index_col=0,
)
# Test set 3 is currently not loaded (line kept commented out):
# test_set3_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test3.csv', index_col=0)


In [6]:
# Preview the first two training rows, flipped so each column reads as a row.
train_df.head(2).T

rental_id,7236931,7331920
addr_unit,#22A,#406
building_id,551248,3373
bedrooms,4,0
bathrooms,3,1
size_sqft,1300,475
created_at,2018-06-04 16:31:06,2018-07-03 20:11:43
addr_street,645 OCEAN AVENUE,93 WORTH STREET
addr_city,Brooklyn,New York
addr_zip,11226,10013
addr_lat,40.647,40.7166


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns.
# NOTE(review): the displayed output shows size_sqft with a min of 0.0 and bathrooms with a
# max of 20.0 — possibly data-quality issues; confirm before modelling.
train_df.describe()

Unnamed: 0,building_id,bedrooms,bathrooms,size_sqft,addr_zip,addr_lat,addr_lon,bin,bbl,floor_count,...,has_washer_dryer,has_garage,has_roofdeck,has_concierge,has_pool,has_garden,has_childrens_playroom,rent,no_fee,floornumber
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,11999.0,12000.0,12000.0,...,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,10723.0
mean,1529133.0,1.657667,1.2395,850.46575,10682.779667,40.727507,-73.956837,2370093.0,2287120000.0,10.266608,...,0.271667,0.17,0.264083,0.208583,0.073167,0.132083,0.0905,3605.762917,0.49,6.182738
std,4301123.0,1.079771,0.560208,515.773101,593.370964,0.053241,0.043024,1242919.0,1202105000.0,12.019787,...,0.444837,0.375648,0.440862,0.406313,0.260421,0.338595,0.286909,2897.069446,0.499921,7.671545
min,73.0,0.0,0.0,0.0,10001.0,40.573898,-74.14157,1000000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1250.0,0.0,0.0
25%,62092.0,1.0,1.0,610.0,10023.0,40.693256,-73.986076,1052139.0,1011220000.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2250.0,0.0,2.0
50%,249581.5,2.0,1.0,800.0,11103.0,40.729028,-73.9617,3019358.0,3006580000.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2900.0,0.0,3.0
75%,822513.2,2.0,1.0,1000.0,11218.0,40.764599,-73.936663,3326098.0,3050580000.0,12.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3900.0,1.0,6.0
max,18772100.0,8.0,20.0,10000.0,11694.0,40.909842,-73.73055,5158986.0,5010640000.0,90.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50000.0,1.0,78.0


In [9]:
# Column dtypes and non-null counts for the training set; the output shows missing values
# in addr_unit, bin, year_built and min_to_subway.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 7236931 to 7422694
Data columns (total 38 columns):
addr_unit                 11909 non-null object
building_id               12000 non-null int64
bedrooms                  12000 non-null int64
bathrooms                 12000 non-null float64
size_sqft                 12000 non-null float64
created_at                12000 non-null object
addr_street               12000 non-null object
addr_city                 12000 non-null object
addr_zip                  12000 non-null int64
addr_lat                  12000 non-null float64
addr_lon                  12000 non-null float64
bin                       11999 non-null float64
bbl                       12000 non-null int64
floor_count               12000 non-null float64
year_built                11597 non-null float64
min_to_subway             11874 non-null float64
has_doorman               12000 non-null int64
has_elevator              12000 non-null int64
has_fireplace    

In [7]:
# Column dtypes and non-null counts for test set 1; the output shows missing values
# in addr_unit, year_built and min_to_subway.
test_set1_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 7499437 to 7463404
Data columns (total 38 columns):
addr_unit                 1981 non-null object
building_id               2000 non-null int64
bedrooms                  2000 non-null int64
bathrooms                 2000 non-null float64
size_sqft                 2000 non-null float64
created_at                2000 non-null object
addr_street               2000 non-null object
addr_city                 2000 non-null object
addr_zip                  2000 non-null int64
addr_lat                  2000 non-null float64
addr_lon                  2000 non-null float64
bin                       2000 non-null float64
bbl                       2000 non-null int64
floor_count               2000 non-null float64
year_built                1933 non-null float64
min_to_subway             1983 non-null float64
has_doorman               2000 non-null int64
has_elevator              2000 non-null int64
has_fireplace             2000 non-n

In [8]:
# Column dtypes and non-null counts for test set 2; the output shows missing values
# in addr_unit, year_built and min_to_subway.
test_set2_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 7428577 to 7443274
Data columns (total 38 columns):
addr_unit                 1971 non-null object
building_id               2000 non-null int64
bedrooms                  2000 non-null int64
bathrooms                 2000 non-null float64
size_sqft                 2000 non-null float64
created_at                2000 non-null object
addr_street               2000 non-null object
addr_city                 2000 non-null object
addr_zip                  2000 non-null int64
addr_lat                  2000 non-null float64
addr_lon                  2000 non-null float64
bin                       2000 non-null float64
bbl                       2000 non-null int64
floor_count               2000 non-null float64
year_built                1918 non-null float64
min_to_subway             1984 non-null float64
has_doorman               2000 non-null int64
has_elevator              2000 non-null int64
has_fireplace             2000 non-n