In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' name, so prefer the new name
# and fall back to the legacy one only on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Load the raw housing data; the path is relative to the current working directory.
df = pd.read_csv("kc_house_data.csv")

## Ideas/Thoughts:
 - yr_renovated is currently a float - should be int, with NaN converted to 0
 - 2376 null waterfront values - should these be converted to 0? Also, waterfront is float, not int
 - sqft_basement is float, but other 3 sqft columns are int
 - sqft_basement uses ? as its null marker (the missing value can be derived from sqft_living = sqft_above + sqft_basement)
 - total sqft = sqft_living + sqft_lot ??
 - house id 2402100895 says it has 33 bedrooms (appears to be a dentist's office, via google maps?)
 - yr_renovated has both 0 and NaN as null markers (17011 rows have 0, 3842 have NaN)
 
## To Do:
 - plot prices vs category of view or condition (think factor plots, https://elitedatascience.com/python-seaborn-tutorial)
 - consider dropping yr_renovated and waterfront because of the sheer number of nulls
 - use some kind of iteration (or, better, a vectorized `.loc` assignment) to go through and replace the ? values in sqft_basement with (sqft_living - sqft_above)


In [None]:
# Overview of the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Total number of cells (rows x columns); df.shape gives the two counts separately.
df.size

In [None]:
# Preview the first 10 rows of the raw data.
df.head(10)

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum()

In [None]:
# Fill missing waterfront values with the column median.
# The dict form restricts the fill to the 'waterfront' column; passing the bare
# scalar to fillna() would write the waterfront median into the NaNs of every
# other column as well.
clean_df = df.fillna({'waterfront': df['waterfront'].median()})

In [None]:
# With the NaNs filled, store waterfront as an integer column instead of float.
clean_df['waterfront'] = clean_df['waterfront'].astype(np.int64)

In [None]:
# Tally the waterfront values in the raw frame.
# NOTE(review): this inspects `df`, not `clean_df`, and value_counts() leaves
# NaN out by default -- pass dropna=False to include the missing count.
df['waterfront'].value_counts()

In [None]:
# Fill missing yr_renovated values with the column median.
# The dict form restricts the fill to 'yr_renovated'; a bare scalar would also
# overwrite the NaNs of every other column with a renovation-year value.
clean_df = clean_df.fillna({'yr_renovated': clean_df['yr_renovated'].median()})

In [None]:
# With the NaNs filled, store yr_renovated as an integer column instead of float.
clean_df['yr_renovated'] = clean_df['yr_renovated'].astype(np.int64)

In [None]:
# Preview the first rows of the cleaned frame to check the filled/cast columns.
clean_df.head()

In [None]:
# Pull the four square-footage columns out of the raw frame for inspection.
sqft_df = df.filter(
    items=["sqft_living", "sqft_lot", "sqft_above", "sqft_basement"],
    axis="columns",
)

In [None]:
# Compare the dtypes of the square-footage columns.
sqft_df.dtypes

In [None]:
# Number of rows, counted through the id column.
print(len(df["id"]))

In [None]:
# Number of distinct ids; a value below the row count means some ids repeat.
print(len(set(df["id"])))

In [None]:
# List the column names available in the raw frame.
df.columns

In [None]:
# Check the column dtypes of the cleaned frame after the integer casts.
clean_df.dtypes

In [None]:
# Distinct zipcode values present in the raw data.
df['zipcode'].unique()

In [None]:
# Show only the rows with the most bedrooms: the outliers of interest sit at the
# top of the descending sort, so there is no need to render the whole frame.
df.sort_values(by='bedrooms', ascending=False).head(10)

In [None]:
# Looking at location-focused independent variables.
# zipcode is a nominal label rather than a quantity, so it is wrapped in C() to
# be dummy-encoded; entered as a plain number it would be fitted with a single
# linear slope across arbitrary postal codes.
# NOTE(review): the formula interface drops rows containing missing values by
# default, so rows with NaN in any listed column are excluded from the fit --
# confirm that is intended, or fit on a cleaned frame instead.
formula_loc = "price ~ waterfront+view+lat+long+C(zipcode)+sqft_living15+sqft_lot15"
model_loc = ols(formula=formula_loc, data=df).fit()
model_loc.summary()

In [None]:
# The code below doesn't work yet: date needs to be reformatted first. We should
# definitely add a Unix-time column, and we will also need to check the date format.

#Looking at time-focused variables
# formula_time = "price ~ date+yr_built+yr_renovated"
# model_time = ols(formula= formula_time, data=df).fit()
# model_time.summary()

In [None]:
# Looking at house quality-focused variables: fit an OLS of price on the
# bedroom/bathroom/floor counts and the view, waterfront, grade and condition columns.
formula_qual = "price ~ bedrooms+bathrooms+floors+view+waterfront+grade+condition"
model_qual = ols(formula_qual, df).fit()
model_qual.summary()