<a href="https://colab.research.google.com/github/levicristiano/housing_price_colab/blob/main/Price_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Imports
import numpy as np
import os
import tarfile
import urllib.request
from __future__ import division, print_function, unicode_literals
import pandas as pd
import os
import statistics
from dataclasses import replace
import seaborn as sns
np.random.seed(141)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
# Figures are written below <project root>/images/<chapter id>.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)  # no error when the folder already exists


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure as ``<IMAGES_PATH>/<fig_id>.<fig_extension>``.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also handed to ``savefig`` as ``format``.
        resolution: Handed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
### Getting the data
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and its extracted
            contents; it is created when missing.

    Raises:
        urllib.error.URLError: If the download fails.
        tarfile.TarError: If the archive cannot be read, or a member is
            rejected by the extraction filter.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The archive comes from the network, so use the "data" extraction filter
    # where this Python version provides it; it refuses members that would be
    # written outside housing_path.
    extract_options = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_options)

In [None]:
# Download and unpack the dataset using the default URL and target folder.
fetch_housing_data()

In [None]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
df = load_housing_data()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Column dtypes and non-null counts; total_bedrooms has fewer non-null rows than the rest.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [None]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [None]:
# Number of missing values in each column.
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [None]:
# Fill the missing total_bedrooms entries with the column median.
# NOTE(review): the median is computed on the whole frame, i.e. before the
# train/test split done further down, so held-out rows contribute to it —
# consider deriving it from the training rows only.
total_bedrooms_median = df['total_bedrooms'].median()
missing_bedrooms = df['total_bedrooms'].isnull()
df.loc[missing_bedrooms, 'total_bedrooms'] = total_bedrooms_median

# Re-check the per-column count of missing values.
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [None]:
# One-hot encode ocean_proximity. drop_first=True leaves out the indicator for
# the first category, which is then represented by all remaining indicators
# being zero.
# The guard keeps the cell re-runnable: once the text column has been replaced
# by its indicators there is nothing left to encode, and reading it again
# would raise a KeyError.
if 'ocean_proximity' in df.columns:
    df_dummies = pd.get_dummies(df['ocean_proximity'], drop_first=True)
    df = pd.concat([df.drop(columns=['ocean_proximity']), df_dummies], axis=1)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from timeit import default_timer as timer
from datetime import timedelta

In [None]:
# The target is the median house value; every other column is a feature.
y = df['median_house_value']
X = df.drop(columns=[y.name])

# No split size is passed, so scikit-learn's default proportion applies;
# random_state fixes which rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=141)

# Fitted estimators, keyed by estimator name; filled in by the cells below.
models = {}

### LinearRegression

In [None]:
# Fit an ordinary least-squares baseline (fit() returns the estimator itself)
# and register it for the comparison below.
model = LinearRegression().fit(X_train, y_train)
models['LinearRegression'] = model

### RandomForestRegressor

In [None]:
# Give the forest its own seed. Without one it draws from NumPy's global random
# state, so the fitted model changes whenever this cell is re-run or whenever
# earlier cells consume a different amount of randomness.
model = RandomForestRegressor(random_state=141)
model.fit(X_train, y_train)
models['RandomForestRegressor'] = model

### LGBMRegressor

In [None]:
# Fit LightGBM with its default hyper-parameters (fit() returns the estimator
# itself) and register it for the comparison below.
model = LGBMRegressor().fit(X_train, y_train)
models['LGBMRegressor'] = model

### XGBRegressor

In [None]:
# Fit XGBoost with its default hyper-parameters (fit() returns the estimator
# itself) and register it for the comparison below.
model = XGBRegressor().fit(X_train, y_train)
models['XGBRegressor'] = model



### Comparison

In [None]:
# Score every registered model on the held-out test set and store the results
# next to the estimator as {'model': ..., 'metrics': {...}}.
for name, entry in models.items():
    # After a first run each entry is already the {'model', 'metrics'} dict
    # written below; unwrap it so the cell can be re-run without failing on
    # dict.predict.
    model = entry['model'] if isinstance(entry, dict) else entry

    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred))  # root of the MSE
    mape = mean_absolute_percentage_error(y_test, pred)
    r2 = r2_score(y_test, pred)

    # Only existing keys are reassigned, so the dict's size does not change
    # while it is being iterated.
    models[name] = {'model': model, 'metrics': {'mae': mae, 'rmse': rmse, 'mape': mape, 'r2': r2}}

print(models)

{'LinearRegression': {'model': LinearRegression(), 'metrics': {'mae': 50126.73104021344, 'rmse': 69541.92619355358, 'mape': 0.2847373582275983, 'r2': 0.6374175180515187}}, 'RandomForestRegressor': {'model': RandomForestRegressor(), 'metrics': {'mae': 31230.568224806204, 'rmse': 48902.41508407522, 'mape': 0.16989262614158368, 'r2': 0.8207025961305614}}, 'LGBMRegressor': {'model': LGBMRegressor(), 'metrics': {'mae': 31619.52484056393, 'rmse': 47757.875722248966, 'mape': 0.1726326244343133, 'r2': 0.8289971344236395}}, 'XGBRegressor': {'model': XGBRegressor(), 'metrics': {'mae': 38061.36655235586, 'rmse': 54797.93676097733, 'mape': 0.20953342393336366, 'r2': 0.7748656308717691}}}


In [None]:
# Build the comparison table in one step: one row per model, one column per
# metric. Passing the metric dicts as records matches values to columns by
# key rather than by position, and avoids growing the frame row by row.
df_metrics = pd.DataFrame(
    [entry['metrics'] for entry in models.values()],
    index=list(models),
    columns=['mae', 'rmse', 'mape', 'r2'],
)

df_metrics

Unnamed: 0,mae,rmse,mape,r2
LinearRegression,50126.73104,69541.926194,0.284737,0.637418
RandomForestRegressor,31230.568225,48902.415084,0.169893,0.820703
LGBMRegressor,31619.524841,47757.875722,0.172633,0.828997
XGBRegressor,38061.366552,54797.936761,0.209533,0.774866
