### Import the necessary libraries and load the data set

In [40]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set_theme(style="darkgrid")
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import *
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm


import warnings
warnings.filterwarnings("ignore")

# NOTE(review): `path` is not used by the read below, and its file name
# (kc_housing_data.csv) differs from the one in the URL (kc_house_data.csv).
path = '../../../data/kc_housing_data.csv'
# Load the raw data set straight from the GitHub raw URL (requires network access).
all_data = pd.read_csv('https://raw.githubusercontent.com/learn-co-students/bsc-phase-two-project/main/data/kc_house_data.csv')

In [12]:
# Preview the first five rows of the raw data (five is the default for head()).
all_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [3]:
# List every column name in the raw data before deciding which to drop.
all_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

### Dropping Irrelevant Columns

In [4]:
# Build the working frame by removing the columns that are not used in the
# analysis. drop() returns a new DataFrame, so `all_data` is left untouched.
df = all_data.drop(
    columns=[
        'view',
        'zipcode',
        'lat',
        'long',
        'sqft_above',
        'sqft_living15',
        'sqft_lot15',
    ]
)

In [5]:
# Inspect the renovation-year column.
df['yr_renovated']

0           0.0
1        1991.0
2           NaN
3           0.0
4           0.0
          ...  
21592       0.0
21593       0.0
21594       0.0
21595       0.0
21596       0.0
Name: yr_renovated, Length: 21597, dtype: float64

In [8]:
# Replace the sale-date string (M/D/YYYY, e.g. '10/13/2014') with the sale year.
#
# The dates are parsed from the untouched `all_data` frame rather than from
# `df['date']` itself, so re-running this cell gives the same result instead of
# raising a TypeError on the already-converted integer column.
# `.loc[df.index, ...]` keeps the rows aligned with `df`.
# An explicit format makes a malformed date fail loudly instead of being
# silently sliced into a wrong "year".
years = (
    pd.to_datetime(all_data.loc[df.index, 'date'], format='%m/%d/%Y')
    .dt.year
    .tolist()  # plain Python ints, as before
)
df['date'] = years

In [13]:
# Preview the first five rows of the working frame (five is the default for head()).
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_basement,yr_built,yr_renovated
0,7129300520,2014,221900.0,3,1.0,1180,5650,1.0,,3,7,0.0,1955,0.0
1,6414100192,2014,538000.0,3,2.25,2570,7242,2.0,0.0,3,7,400.0,1951,1991.0
2,5631500400,2015,180000.0,2,1.0,770,10000,1.0,0.0,3,6,0.0,1933,
3,2487200875,2014,604000.0,4,3.0,1960,5000,1.0,0.0,5,7,910.0,1965,0.0
4,1954400510,2015,510000.0,3,2.0,1680,8080,1.0,0.0,3,8,0.0,1987,0.0


In [24]:
# Rank the numeric features by the absolute value of their correlation with price.
#
# `df` still contains the object-dtype column `sqft_basement`. Recent pandas
# versions no longer drop non-numeric columns silently in corr() and raise
# instead, so the numeric columns are selected explicitly. This works on old
# and new pandas alike and yields the same ranking.
(
    df.select_dtypes(include='number')
    .corr()['price']
    .abs()
    .sort_values(ascending=False)
)


price           1.000000
sqft_living     0.701917
grade           0.667951
bathrooms       0.525906
bedrooms        0.308787
waterfront      0.276295
floors          0.256804
yr_renovated    0.129599
sqft_lot        0.089876
yr_built        0.053953
condition       0.036056
id              0.016772
date            0.003727
Name: price, dtype: float64

In [25]:
# Check dtypes and non-null counts. In the saved output below, `sqft_basement`
# is object dtype and `waterfront` / `yr_renovated` have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  int64  
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   condition      21597 non-null  int64  
 10  grade          21597 non-null  int64  
 11  sqft_basement  21597 non-null  object 
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17755 non-null  float64
dtypes: float64(5), int64(8), object(1)
memory usage: 2.3+ MB


In [65]:
# Baseline model: simple linear regression of sale price on living area.
# The target is set up first; the single predictor then gets an explicit
# intercept column, since sm.OLS does not add one by itself.
y = df['price']
X = sm.add_constant(df['sqft_living'])

model = sm.OLS(endog=y, exog=X)

In [66]:
# Fit the OLS model built from `y` and `X`.
results = model.fit()
# Last expression of the cell, so the regression summary table is displayed.
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.493
Model:,OLS,Adj. R-squared:,0.493
Method:,Least Squares,F-statistic:,20970.0
Date:,"Mon, 28 Jun 2021",Prob (F-statistic):,0.0
Time:,14:46:02,Log-Likelihood:,-300060.0
No. Observations:,21597,AIC:,600100.0
Df Residuals:,21595,BIC:,600100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.399e+04,4410.023,-9.975,0.000,-5.26e+04,-3.53e+04
sqft_living,280.8630,1.939,144.819,0.000,277.062,284.664

0,1,2,3
Omnibus:,14801.942,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,542662.604
Skew:,2.82,Prob(JB):,0.0
Kurtosis:,26.901,Cond. No.,5630.0
