# Importing and Reading Files

In [19]:
import geopy 
from geopy.geocoders import Nominatim 
import time
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
#main housing data
df = pd.read_csv('data/kc_house_data.csv')

In [3]:
## Do an inner join and groupby on this
#first import zipcode csv from king co website
dfz = pd.read_csv('data/Zipcodes_for_King_County_and_Surrounding_Area_(Shorelines)___zipcode_shore_area.csv')
dfz.head()

Unnamed: 0,OBJECTID,ZIP,ZIPCODE,COUNTY,ZIP_TYPE,COUNTY_NAME,PREFERRED_CITY,Shape_Length,Shape_Area
0,1,98001,98001,33,Standard,King County,AUBURN,147643.355747,525313300.0
1,2,98002,98002,33,Standard,King County,AUBURN,104440.086791,205302800.0
2,3,98003,98003,33,Standard,King County,FEDERAL WAY,123619.720418,316939500.0
3,4,98004,98004,33,Standard,King County,BELLEVUE,109051.119162,201759800.0
4,5,98005,98005,33,Standard,King County,BELLEVUE,116554.737369,211433700.0


In [4]:
df['coords'] = list(zip(df['lat'],df['long']))

In [5]:
X = df.drop(columns = ['price'])
y = df['price']

In [6]:
df['zipcodes'] = df['address'].apply(lambda x: x[-20:-15])

In [7]:
# zipcodes in King Co imported table in integer format, but string in our dataframe
# need to convert zipcode column into integer
df['zipcodes'] = df['zipcodes'].apply(lambda x: int(x))

In [8]:
dfz = dfz[['ZIPCODE', 'PREFERRED_CITY']]

In [9]:
# mergeeeee
df = df.merge(dfz, how = 'left', left_on = 'zipcodes', right_on = 'ZIPCODE')

In [10]:
# WE REDUCED OUR CATEGORICALS TO 40 FROM 400

In [11]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31761 entries, 0 to 32722
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              31761 non-null  int64  
 1   date            31761 non-null  object 
 2   price           31761 non-null  float64
 3   bedrooms        31761 non-null  int64  
 4   bathrooms       31761 non-null  float64
 5   sqft_living     31761 non-null  int64  
 6   sqft_lot        31761 non-null  int64  
 7   floors          31761 non-null  float64
 8   waterfront      31761 non-null  object 
 9   greenbelt       31761 non-null  object 
 10  nuisance        31761 non-null  object 
 11  view            31761 non-null  object 
 12  condition       31761 non-null  object 
 13  grade           31761 non-null  object 
 14  heat_source     31761 non-null  object 
 15  sewer_system    31761 non-null  object 
 16  sqft_above      31761 non-null  int64  
 17  sqft_basement   31761 non-null 

# Test Train Split

In [66]:
df['grade'].unique()

array(['7 Average', '9 Better', '8 Good', '6 Low Average', '10 Very Good',
       '5 Fair', '11 Excellent', '12 Luxury', '4 Low', '13 Mansion',
       '3 Poor', '2 Substandard'], dtype=object)

In [12]:
df['condition'].unique()

array(['Good', 'Average', 'Very Good', 'Fair', 'Poor'], dtype=object)

In [13]:
df['condition'].head()

0       Good
1    Average
2    Average
3    Average
4    Average
Name: condition, dtype: object

In [70]:
df['condition'] = df['condition'].astype('category')
df['grade'] = df['grade'].astype('category')

In [71]:
df['condition'] = df['condition'].cat.reorder_categories(['Poor', 'Fair', 'Average','Good', 'Very Good'])
df['grade'] = df['grade'].cat.reorder_categories(['2 Substandard', '3 Poor', '4 Low', '5 Fair', 
                                                  '6 Low Average', '7 Average', '8 Good', '9 Better',
                                                  '10 Very Good', '11 Excellent', '12 Luxury', '13 Mansion'])

In [72]:
df['grade'].cat.codes

0        5
1        5
2        5
3        7
4        5
        ..
32718    6
32719    5
32720    5
32721    6
32722    5
Length: 31761, dtype: int8

In [18]:
df['condition'].cat.codes

0        3
1        2
2        2
3        2
4        2
        ..
32718    3
32719    2
32720    2
32721    2
32722    2
Length: 31761, dtype: int8

In [73]:
ord_cat_selector = ['condition', 'grade']
cat_subset = df[ord_cat_selector]
grade_ls = ['2 Substandard', '3 Poor', '4 Low', '5 Fair', 
            '6 Low Average', '7 Average', '8 Good', '9 Better',
            '10 Very Good', '11 Excellent', '12 Luxury', '13 Mansion']
cond_ls = ['Poor', 'Fair', 'Average','Good', 'Very Good']
o_enc = OrdinalEncoder(categories = [cond_ls, grade_ls])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['Poor', 'Fair', 'Average', 'Good', 'Very Good'],
                           ['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion']])

In [74]:
X_subset = pd.DataFrame(o_enc.transform(cat_subset),
                        columns = cat_subset.columns)

In [75]:
o_enc.inverse_transform(X_subset)

array([['Good', '7 Average'],
       ['Average', '7 Average'],
       ['Average', '7 Average'],
       ...,
       ['Average', '7 Average'],
       ['Average', '8 Good'],
       ['Average', '7 Average']], dtype=object)

In [77]:
X_subset = X_subset.rename(columns = {'condition': 'house_cond','grade':'grade2'}) 

In [79]:
df1 = pd.concat([df.reset_index(), X_subset], axis =1)

In [87]:
df1.columns

Index(['index', 'id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'greenbelt', 'nuisance', 'view',
       'condition', 'grade', 'heat_source', 'sewer_system', 'sqft_above',
       'sqft_basement', 'sqft_garage', 'sqft_patio', 'yr_built',
       'yr_renovated', 'address', 'lat', 'long', 'coords', 'zipcodes',
       'ZIPCODE', 'PREFERRED_CITY', 'house_cond', 'grade2'],
      dtype='object')

In [81]:
lr = LinearRegression()
lr

LinearRegression()

In [83]:
col_selector

['house_cond', 'grade2']

In [192]:
# removed id, price, ZIPCODE, address, lat, long
col_selector = ['house_cond', 'grade2', 'sqft_living', 'sqft_above', 'sqft_basement', 'bedrooms', 'yr_built', 'yr_renovated']
X = df1[col_selector]
y = df1['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [193]:
lr.fit(X_train, y_train)

LinearRegression()

In [194]:
lr.score(X_train,y_train)

0.4572658615014117

In [None]:
results.

In [11]:
df['grade'].unique()

array(['7 Average', '9 Better', '8 Good', '6 Low Average', '10 Very Good',
       '5 Fair', '11 Excellent', '12 Luxury', '4 Low', '13 Mansion',
       '3 Poor', '1 Cabin', '2 Substandard'], dtype=object)

In [91]:
df['date'] = pd.to_datetime(df['date'])

In [95]:
# removed id, price, ZIPCODE, address, lat, long
col_selector = ['date','bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'greenbelt', 'nuisance', 'view',
       'condition', 'grade', 'heat_source', 'sewer_system', 'sqft_above',
       'sqft_basement', 'sqft_garage', 'sqft_patio', 'yr_built',
       'yr_renovated', 'zipcodes', 'PREFERRED_CITY']
X = df[col_selector]
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [195]:
print(len(X_train))
print(len(X_test))

22232
9529


In [97]:
lr.fit(X_train, y_train)

ValueError: could not convert string to float: 'NO'