# Regression Models

### Import needed modules and methods

In [428]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import pickle
import random

### Define Final Results Dataframe

In [429]:
df_models_scores = pd.DataFrame(columns=['Model', 'Score'])

### read dataset

In [430]:
df = pd.read_csv("kc_house_data.csv")
df.head(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.511,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.738,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.521,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.617,-122.045,1800,7503
5,7237550310,20140512T000000,1230000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.656,-122.005,4760,101930
6,1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.31,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.41,-122.315,1650,9711
8,2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,...,7,1050,730,1960,0,98146,47.512,-122.337,1780,8113
9,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.368,-122.031,2390,7570


### take a look on the data

In [431]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [432]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580301520.865,540182.159,3.371,2.115,2079.9,15106.968,1.494,0.008,0.234,3.409,7.657,1788.391,291.509,1971.005,84.402,98077.94,47.56,-122.214,1986.552,12768.456
std,2876565571.312,367362.232,0.93,0.77,918.441,41420.512,0.54,0.087,0.766,0.651,1.175,828.091,442.575,29.373,401.679,53.505,0.139,0.141,685.391,27304.18
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.156,-122.519,399.0,651.0
25%,2123049194.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930410.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.572,-122.23,1840.0,7620.0
75%,7308900445.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000190.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.778,-121.315,6210.0,871200.0


In [433]:
df = df[(df['bedrooms'] < 10) & (df['bathrooms'] < 10)]

### Splitting Data into train, validate, and test

create X & y

In [434]:
X = df[['sqft_above', 'sqft_living', 'bedrooms', 'bathrooms', 'yr_built', 'zipcode', 'view', 'condition']]
y = df['price']

split into train / others

In [435]:
X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.8, random_state=42, shuffle=True)

In [436]:
X_validate, X_test, y_validate, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=42, shuffle=True)

- train set will be used to train all models
- validate set will be used to evaludate each model, and choose the best one
- test set will be used only once at the end, for the final model chosen

### Create Models

In [437]:
models = {
    'Multiple Linear Regression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(n_neighbors=20),
    'XGBRegressor': XGBRegressor(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=20, random_state=random.randint(10, 100)),
    'DecisionTreeRegressor': DecisionTreeRegressor()
}

### Train, Validate, Save scores

In [438]:
for model_name, model in models.items():
    # train
    model.fit(X_train, y_train)
    # predict
    y_pred = model.predict(X_validate)
    # calculate coefficient of determination
    curr_score = r2_score(y_true=y_validate, y_pred=y_pred)
    # create a temp dataframe holding curr model result
    curr_model_result = pd.DataFrame([{'Model':model_name, 'Score':curr_score}])
    # add curr result to models_results dataframe
    df_models_scores = pd.concat([df_models_scores, curr_model_result], ignore_index=True)
        

  df_models_scores = pd.concat([df_models_scores, curr_model_result], ignore_index=True)


### Final Results

#### All models results

In [439]:
# avoid showing scientific notation or more than 3 decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df_models_scores

Unnamed: 0,Model,Score
0,Multiple Linear Regression,0.595
1,KNeighborsRegressor,0.598
2,XGBRegressor,0.8
3,RandomForestRegressor,0.776
4,DecisionTreeRegressor,0.612


save results to text file

In [440]:
with open("models_scores.txt", "w") as file:
    file.write(df_models_scores.to_string())

#### Best Performing Model

In [441]:
# get the index of the best model using argmax
best_model_result = df_models_scores.iloc[df_models_scores['Score'].argmax()]
best_model_result

Model    XGBRegressor
Score           0.800
Name: 2, dtype: object

### Testing

In [442]:
# get best model & its name
best_model_name = best_model_result['Model']
best_model = models[best_model_name]

In [443]:
# test best model on unseen testing set
y_pred = best_model.predict(X_test)
final_score = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Best Model:\n{best_model_name} with a coefficient of determination {final_score:.3f}")

Best Model:
XGBRegressor with a coefficient of determination 0.787


### Save Best Model

In [444]:
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)