# Overview

# Business Understanding

# Data Understanding

# Data Cleaning

In [128]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#SKlearn
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [98]:
# Load processed data
with open('./data/Data_frame_geoloc.pickle', 'rb') as df_geo_data:
    df_geo = pickle.load(df_geo_data)


In [99]:
# Lets check size of dataframe
df_geo.shape

(21597, 14)

In [144]:
df_geo

Unnamed: 0,id,lat,price,yr_built,sqft_living,sqft_lot,lon,To_drop_place_ID,To_drop_road,Type_place,city,suburb
0,7129300520,47.5112,221900.0,1955,1180,5650,-122.257,159583259.0,61st Avenue South,city,Seattle,
1,6414100192,47.7210,538000.0,1951,2570,7242,-122.319,159668720.0,Northeast 127th Street,city,Seattle,Northgate
2,5631500400,47.7379,180000.0,1933,770,10000,-122.233,74808506.0,81st Avenue Northeast,town,Kenmore,Moorlands
3,2487200875,47.5208,604000.0,1965,1960,5000,-122.393,156392831.0,Fauntleroy Way Southwest,city,Seattle,Fauntleroy
4,1954400510,47.6168,510000.0,1987,1680,8080,-122.045,293729110.0,221st Avenue Northeast,town,Sammamish,
...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,47.6993,360000.0,2009,1530,1131,-122.346,20331941.0,North 97th Street,city,Seattle,Greenwood
21593,6600060120,47.5107,400000.0,2014,2310,5813,-122.362,231509570.0,Southwest 103rd Place,city,Seattle,White Center
21594,1523300141,47.5944,402101.0,2009,1020,1350,-122.299,157689516.0,26th Avenue South,city,Seattle,Leschi
21595,291310100,47.5345,400000.0,2004,1600,2388,-122.069,293824524.0,Northwest Boulder Way Drive,town,Issaquah,


In [100]:
# Investigate states and counties for noise.
print(df_geo["state"].value_counts()) #all recods from Washington state
print(df_geo["county"].value_counts()) # 4 records from different counties. 


Washington    21597
Name: state, dtype: int64
King County         21593
Pierce County           2
Snohomish County        2
Name: county, dtype: int64


In [101]:
#Remove records from other counties
df_geo.drop(df_geo[df_geo.county != "King County"].index, inplace = True)
df_geo.shape
#Drop county and state columns to reduce the number of features
df_geo.drop("state", axis = 1, inplace = True)
df_geo.drop("county", axis = 1, inplace = True)


In [102]:
# Investigate "city" column for missing values
print(f"The number of records with missing cities {sum(df_geo.city.isna())}") #779 records have no cities in it. 
print(f"Missing values are in {round(sum(df_geo.city.isna())/df_geo.shape[0]*100,2)} % of data")

The number of records with missing cities 779
Missing values are in 3.61 % of data


In [103]:
#Remove records with missing cities
df_geo.drop(df_geo[df_geo.city.isna() == True].index, axis = 0, inplace = True)
df_geo.shape

(20814, 12)

In [104]:
# Investigate "Type_place" column for missing values
print(f"The number of records with missing values {sum(df_geo.Type_place.isna())}")

The number of records with missing values 0


In [89]:
# Investigate "suburb" column for missing values
print(f"The number of records with missing values {sum(df_geo.suburb.isna())}")
print(f"Missing \"suburb\"values are in {round(sum(df_geo.suburb.isna())/df_geo.shape[0]*100,2)} % of data")
df_geo.suburb.value_counts() #There is no strong patterns in this data
print("Missing \"suburb\" values for cities are in ", round(sum(df_geo[df_geo.Type_place == "city"].suburb.value_counts())/df_geo[df_geo.Type_place == "city"].shape[0]*100,2), "% of records")
# We won't proceed with this data


The number of records with missing values 11942
Missing "suburb"values are in 57.37 % of data
Missing "suburb" values for cities are in  70.54 % of records


In [149]:
df_geo

Unnamed: 0,id,lat,price,yr_built,sqft_living,sqft_lot,lon,To_drop_place_ID,To_drop_road,Type_place,city,suburb
0,7129300520,47.5112,221900.0,1955,1180,5650,-122.257,159583259.0,61st Avenue South,city,Seattle,
1,6414100192,47.7210,538000.0,1951,2570,7242,-122.319,159668720.0,Northeast 127th Street,city,Seattle,Northgate
2,5631500400,47.7379,180000.0,1933,770,10000,-122.233,74808506.0,81st Avenue Northeast,town,Kenmore,Moorlands
3,2487200875,47.5208,604000.0,1965,1960,5000,-122.393,156392831.0,Fauntleroy Way Southwest,city,Seattle,Fauntleroy
4,1954400510,47.6168,510000.0,1987,1680,8080,-122.045,293729110.0,221st Avenue Northeast,town,Sammamish,
...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,47.6993,360000.0,2009,1530,1131,-122.346,20331941.0,North 97th Street,city,Seattle,Greenwood
21593,6600060120,47.5107,400000.0,2014,2310,5813,-122.362,231509570.0,Southwest 103rd Place,city,Seattle,White Center
21594,1523300141,47.5944,402101.0,2009,1020,1350,-122.299,157689516.0,26th Avenue South,city,Seattle,Leschi
21595,291310100,47.5345,400000.0,2004,1600,2388,-122.069,293824524.0,Northwest Boulder Way Drive,town,Issaquah,


# Data Modeling

In [None]:
# Different type of columns
Geo_columns_basic_all = ["Type_place", "city"]
Geo_columns_basic_type = ["Type_place"]
Geo_columns_basic_city_names = ["city"]
Geo_columns_advanced = ["suburb"]
Geo_columns_drop = ["To_drop_place_ID", "To_drop_road"]

In [139]:
#!!! Setup the type of columns that will be used for modelling
modeling_columns = Geo_columns_basic_all
Geo_columns_drop = [column for column in list(df_geo.columns) if ((column not in modeling_columns)  and (column != "id"))]

#seting up data categorical data for encoding !!!!!! (need to change later on)
X_cat_geo = df_geo.drop(Geo_columns_drop, axis = 1)

Unnamed: 0,id,Type_place,city
0,7129300520,city,Seattle
1,6414100192,city,Seattle
2,5631500400,town,Kenmore
3,2487200875,city,Seattle
4,1954400510,town,Sammamish
...,...,...,...
21592,263000018,city,Seattle
21593,6600060120,city,Seattle
21594,1523300141,city,Seattle
21595,291310100,town,Issaquah


In [151]:
# setup One Hot Encoder 
encoder_geo_basic = OneHotEncoder(handle_unknown = "ignore")
fit_df = X_cat_geo.drop("id", axis = 1)
encoder_geo_basic.fit(fit_df)

OneHotEncoder(handle_unknown='ignore')

In [159]:
# Prepare transformed dataset
X_cat_transf=encoder_geo_basic.transform(fit_df)
X_geo_df = pd.DataFrame(X_cat_transf.todense(), columns = encoder_geo_basic.get_feature_names())
X_geo_df["id"] = X_cat_geo["id"]



In [160]:
X_geo_df

Unnamed: 0,x0_city,x0_town,x0_village,x1_Algona,x1_Auburn,x1_Beaux Arts Village,x1_Bellevue,x1_Black Diamond,x1_Bothell,x1_Burien,...,x1_Seattle,x1_Shoreline,x1_Skykomish,x1_Snoqualmie,x1_Tukwila,x1_Union Hill-Novelty Hill,x1_Vashon,x1_Woodinville,x1_Yarrow Point,id
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.129301e+09
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.414100e+09
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.631500e+09
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.487201e+09
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.954401e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20809,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.554500e+08
20810,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.476201e+09
20811,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.329300e+09
20812,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.570001e+08


# Regression Results

# Conclusion