In [66]:
import pandas as pd
import string as str
from google.colab import drive
drive.mount('/content/drive')
shapefile = '/content/drive/MyDrive/Colab Notebooks/2012_Metropolitan_King_County_Council_Districts___kccdst_area_2012.shp'

housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
housing.columns #70 zip codes

Index(['id', 'date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'price'],
      dtype='object')

In [68]:
import geopandas as gpd
from scipy.spatial import distance

regions = [
  {
    "name": "Bellevue",
    "center": (47.4419, -122.1577),
  },
  {
    "name": "Redmond",
    "center": (47.6062, -122.0837),
  },
  {
    "name": "Kirkland",
    "center": (47.6427, -122.2072),
  },
  {
    "name": "Issaquah",
    "center": (47.5133, -122.1250),
  },
  {
    "name": "Bothell",
    "center": (47.7000, -122.2250),
  },
  {
    "name": "North Seattle",
    "center": (47.6250, -122.3500),
  },
  {
    "name": "Woodinville",
    "center": (47.7500, -122.1750),
  },
  {
    "name": "Sammamish",
    "center": (47.6875, -122.0000),
  },
  {
    "name": "Mercer Island",
    "center": (47.5833, -122.2500),
  },
  {
    "name": "Shoreline",
    "center": (47.5625, -122.3750),
  },
  {
    "name": "Auburn",
    "center": (47.3750, -122.2500),
  },
  {
    "name": "Kent",
    "center": (47.2500, -122.3250),
  },
  {
    "name": "Federal Way",
    "center": (47.2167, -122.3000),
  },
  {
    "name": "Des Moines",
    "center": (47.3750, -122.2000),
  },
  {
    "name": "SeaTac",
    "center": (47.4167, -122.3167),
  },
]

def get_closest_region(x, y):
    centers = [(region["center"][0], region["center"][1]) for region in regions]
    distances = distance.cdist([(x, y)], centers, 'euclidean')[0]
    closest_index = distances.argmin()
    return regions[closest_index]["name"]

# Assuming your dataset is stored in a DataFrame called 'df' with columns 'lat' and 'long'
housing["region"] = housing.apply(lambda row: get_closest_region(row["lat"], row["long"]), axis=1)

housing["region"].value_counts()

North Seattle    3416
Bothell          1917
Bellevue         1750
Redmond          1697
Shoreline        1579
Mercer Island    1552
Sammamish        1476
Des Moines       1335
Issaquah         1326
SeaTac           1059
Woodinville       953
Kirkland          736
Kent              688
Auburn            402
Federal Way       114
Name: region, dtype: int64

In [69]:
adjustment_percentages = {
    "Bellevue": 18.2,
    "Redmond": 17.8,
    "Kirkland": 17.5,
    "Issaquah": 17.2,
    "Bothell": 16.9,
    "North Seattle": 16.7,
    "Woodinville": 16.6,
    "Sammamish": 16.5,
    "Mercer Island": 16.4,
    "Shoreline": 16.3,
    "Auburn": 12.3,
    "Kent": 12.2,
    "Federal Way": 12.1,
    "Des Moines": 12.4,
    "SeaTac": 12.5
}

def calculate_adjusted_price(row):
    region = row['region'] 
    price = row['price'] 
    date = pd.to_datetime(row['date'], format='%Y%m%dT%H%M%S')  # Convert the date to datetime format
    
    # Get the adjustment percentage for the region
    adjustment_percentage = adjustment_percentages.get(region)

    # Calculate the number of days between the row's date and May 1st, 2014
    start_date = pd.Timestamp('2014-05-01')
    days_diff = (date - start_date).days

    # Calculate the adjustment based on the percentage and the number of days
    adjustment = (adjustment_percentage / 365) * days_diff

    # Calculate the adjusted price
    adjusted_price = price - (price * (adjustment / 100))

    return adjusted_price

def reverse_adjusted_price(adjusted_price, row):
    region = row['region']  # Assuming you have a 'region' column in your dataset
    date = pd.to_datetime(row['date'], format='%Y%m%dT%H%M%S')  # Convert the date to datetime format
    
    # Get the adjustment percentage for the region
    adjustment_percentage = adjustment_percentages.get(region)

    # Calculate the number of days between the row's date and May 1st, 2014
    start_date = pd.Timestamp('2014-05-01')
    days_diff = (date - start_date).days

    # Calculate the adjustment based on the percentage and the number of days
    adjustment = (adjustment_percentage / 365) * days_diff

    # Reverse the adjustment by adding it back to the adjusted price
    price = adjusted_price + (adjusted_price * (adjustment / 100))

    return price

# Apply the calculate_adjusted_price function to create the 'price_adjusted' column
housing['price_adjusted'] = housing.apply(calculate_adjusted_price, axis=1)
housing.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,region,price_adjusted
0,1565930130,20141104T000000,4,3.25,3760,4675,2.0,0,0,3,...,2007,0,98038,47.3862,-122.048,3280,4033,429900.0,Bellevue,389814.475068
1,3279000420,20150115T000000,3,1.75,1460,7800,1.0,0,0,2,...,1979,0,98023,47.3035,-122.382,1310,7865,233000.0,Kent,212829.221918
2,194000575,20141014T000000,4,1.0,1340,5800,1.5,0,2,3,...,1914,0,98116,47.5658,-122.389,1900,5800,455000.0,Shoreline,421270.164384
3,2115510160,20141208T000000,3,1.75,1440,8050,1.0,0,0,3,...,1985,0,98023,47.3187,-122.39,1790,7488,258950.0,Kent,239821.753699
4,7522500005,20140815T000000,2,1.5,1780,4750,1.0,0,0,4,...,1947,0,98117,47.6859,-122.395,1690,5962,555000.0,North Seattle,528083.260274


In [72]:
X = housing.drop('price', axis=1)
y = housing['price']
X_set, X_validation, y_set, y_validation = train_test_split(X, y, test_size=0.1, random_state=42)
X_set['price'] = y_set
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler()
X_set = housing.drop('region', axis=1)
y_set = housing['region']
X_resampled, y_resampled = oversampler.fit_resample(X_set, y_set)
X_resampled['region'] = y_resampled

y_set_resampled = X_resampled['price_adjusted']
X_set_resampled = X_resampled.drop('price',axis=1)
features =  ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'sqft_living15', 'region', 'sqft_lot15']

X_train, X_test, Y_train, Y_test = train_test_split(X_set_resampled, y_set_resampled, test_size=0.11, random_state=42)

# Perform one-hot encoding
X_train = pd.get_dummies(X_train[features], drop_first=True)
X_test_no_encoding = X_test.copy()
X_test = pd.get_dummies(X_test[features], drop_first=True)
X_val_no_encoding = X_validation.copy()
X_validation = pd.get_dummies(X_validation[features], drop_first=True)

#Fix date
model = XGBRegressor()
model.fit(X_train, Y_train)
test_predictions = model.predict(X_test)
validation_predictions = model.predict(X_validation)

# Create a new column 'predicted_price' in the 'X_test' dataframe
X_test_no_encoding['predicted_price'] = test_predictions
X_val_no_encoding['predicted_price'] = validation_predictions

# Apply the reverse adjustment to the predicted prices
y_test_adjusted = X_test_no_encoding.apply(lambda row: reverse_adjusted_price(row['predicted_price'], row), axis=1)
y_val_adjusted = X_val_no_encoding.apply(lambda row: reverse_adjusted_price(row['predicted_price'], row), axis=1)

test_scoring = y_test_adjusted - test_predictions
val_scoring = y_val_adjusted - validation_predictions

In [74]:
from sklearn.metrics import mean_squared_error

result = mean_squared_error(Y_test, test_predictions, squared=False)
print("Test:",result)
result = mean_squared_error(y_validation, validation_predictions, squared=False)
print("Validation:",result)

Test: 65924.20326951097
Validation: 97250.4999744432


In [77]:
from sklearn.metrics import r2_score
print("Test R2:",r2_score(Y_test,test_predictions))
print("Validation R2:",r2_score(y_validation,validation_predictions))

Test R2: 0.9635369653967351
Validation R2: 0.9418650651552944
