In [44]:
import pandas as pd
import numpy as np
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor

<b> Previous knowledge: </b>
- Price has strong correlation with num_bath, size_house, year_built, avg_size_neighbor_houses
- num_bed, num_bath, num_floors, is_waterfront, condition are categorical variables
- The df doesn't have null values.
- The number of zeros is low in the df.
- Outliers can't be discarded without business knowledge (most of outliers looks like real information).
- Inliers can't be detected as the data looks reasonable (data don't brake any logic or physical rule).
- Colinearity hasn't been checked
- The data don't have a time dimension (temporal analysis can't be performed)

In [6]:
df = pd.read_csv(".././data/house_sales.csv")

In [24]:
# Group price in 10 quantiles
index_group = 1
df["price_quantile"] = np.nan
for i_quantile in np.linspace(0.1,1,10):
    filter_quantile = (df["price"] >= np.quantile(df["price"], i_quantile-0.1))& (df["price"] <= np.quantile(df["price"], i_quantile))
    df.loc[filter_quantile, "price_quantile"] = index_group
    index_group = index_group+1

## Relationship between main correlated variables and price

In [39]:
col_names = ["num_bath", "size_house", "year_built", "avg_size_neighbor_houses"]
for i_col in col_names:
    fig = px.scatter(df, x=i_col, y="price")
    fig.show()

In [41]:
df = df.sort_values("price_quantile")
for i_col in col_names:
    fig = px.box(df, y=i_col, color = "price_quantile")
    fig.show()

## Relationship between categorical variables and price

In [43]:
col_names = ['num_bed', 'num_bath', 'num_floors', 'is_waterfront', 'condition']
for i_col in col_names:
    df = df.sort_values(i_col)
    fig = px.box(df, y="price", color = i_col)
    fig.show()

## Colinearity analysis

In [50]:
# variance inflation factor
df_temp = df.drop(["price", "price_quantile"], axis =1)
vif_data = pd.DataFrame()
vif_data["feature"] = df_temp.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(df_temp.values, i)
                          for i in range(len(df_temp.columns))]
print(vif_data)

                     feature           VIF
0                    num_bed  2.245863e+01
1                   num_bath  2.839346e+01
2                 size_house  3.139842e+01
3                   size_lot  2.260831e+00
4                 num_floors  1.661399e+01
5              is_waterfront  1.044470e+00
6                  condition  3.458419e+01
7              size_basement  2.677797e+00
8                 year_built  9.046268e+03
9            renovation_date  1.194766e+00
10                       zip  1.593359e+06
11                  latitude  1.345629e+05
12                 longitude  1.350472e+06
13  avg_size_neighbor_houses  2.417710e+01
14     avg_size_neighbor_lot  2.505680e+00


In [51]:
normalized_df=(df_temp-df_temp.mean())/df_temp.std()
vif_data = pd.DataFrame()
vif_data["feature"] = normalized_df.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(normalized_df.values, i)
                          for i in range(len(normalized_df.columns))]
print(vif_data)

                     feature       VIF
0                    num_bed  1.609913
1                   num_bath  3.332022
2                 size_house  5.136044
3                   size_lot  2.004992
4                 num_floors  1.991210
5              is_waterfront  1.037671
6                  condition  1.250221
7              size_basement  1.906437
8                 year_built  2.289709
9            renovation_date  1.148484
10                       zip  1.645631
11                  latitude  1.143178
12                 longitude  1.785075
13  avg_size_neighbor_houses  2.629664
14     avg_size_neighbor_lot  2.044504


## Conclusions

- "num_bath", "size_house", "avg_size_neighbor_houses" exhibit a strong positive linear correlation in the scatter plot. In the box plot is identified for these features that when the quantile number increase the feature distribution also increase.

- When the number of bedrooms increase from 0 to 4, the price distribution increase (shifted up). After 4 bedrooms the distribution looks similar for all the remainder cases.

- The price increase when the number of bathrooms increase.

- The price has the higher distributions when the number of floors are 2 or 2.5

- Price distribution is higher if the house is waterfront.

- Price distribution is lower if the condition is 1 or 2. When the condition is 3, 4 or 5 the price distribution is similar.

- The dataset presents a high collinearity relationship, once the data is normalized the colinearity is mitigated.
