## Preparing notebook

In [111]:
# Imports

import pandas as pd
from scipy import stats
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [56]:
# Importing the DF from local machine
df = 'df_trt.csv'
df = pd.read_csv(df, sep=';')

In [68]:
# Deleting the Unnammed: 0 column
df.drop(columns = 'Unnamed: 0', inplace=True)

In [149]:
# checking the DF
df.shape

(2147, 8)

## Feature Engineering

In [59]:
# Label enconding

## Variables
le = LabelEncoder()

## Fit transform | columns properties and zone
for c in ['property', 'zone']:
  df[c] = le.fit_transform(df[c])


In [70]:
df.head()

Unnamed: 0,area,room,bath,garage,price,ext_area,property,zone
0,30,0,1,1,1400,30,0,2
1,30,3,4,1,16000,230,1,1
2,230,0,1,0,6950,80,0,3
3,230,3,5,2,9950,300,1,2
4,80,0,5,1,100,350,0,2


## Algorithm  | Regression Problem

In [62]:
# Defining function for scenarios

def create_scenarios(features, outlier, scalling, algorithm):
  ''' It combines de list of strings from variables to create 
  differents scenarios to run de model.
  Input: variables separated by coma. Each variable contains a list of strings.
        if there are no situation in some variable, insert 'none'. 
  Output: a variable called 'scenarios' which contains a list of dictionary. '''

  # Creating scenarios/combinations
  global scenarios 
  scenarios = []
  for f in features:
    for o in outlier:
      for s in scalling:
          for a in algorithm:
              scenario = {'features': f,
                          'outlier': o, 
                          'scalling': s,
                          'algorithm': a,
                          }
              scenarios.append(scenario)
  print(f'There are {len(scenarios)} possible scenarios.')

In [150]:
# Defining function for feature engineering and algorithms

def run_model(df, scenario, target):
  '''
  Function to run the scenarios.
  Input: dataframe,
         variable 'scenarios'
  Output: return the scenario and it's metrics.
  '''

  # Running each scenario
  results = []
  dft = df
    
  # Outliers
  if scenario['outlier'] == 'none':
      pass
    
  # Scalling
  if scenario['scalling'] == 'none':
      pass
  elif scenario['scalling'] == 'standard':
      model = StandardScaler()
      model.fit_transform(df[continuous])
  elif scenario['scalling'] == 'robust':
      model = RobustScaler()
      model.fit_transform(df[continuous])

  ## Split train & test
  X = dft.drop(columns = target)
  y = dft[target]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
    
  ## algorithm
  if scenario['algorithm'] == 'linear':
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
  elif scenario['algorithm'] == 'decision_tree':
    model = DecisionTreeRegressor(min_samples_leaf=100) 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
  elif scenario['algorithm'] == 'svr':
    model = SVR()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
  elif scenario['algorithm'] == 'lasso':
    model = linear_model.Lasso()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
  results.append({'R2:': r2_score(y_test, y_pred),
                  'MSE:': mean_squared_error(y_test, y_pred),
                  'MSRE:': mean_squared_error(y_test, y_pred, squared=False),
                  'MAE:': mean_absolute_error(y_test, y_pred),
                  'MAPE': mean_absolute_percentage_error(y_test, y_pred)})
  return results

In [122]:
# Defining variables to run scenario function
features = ['all']
outlier = ['none']
scalling = ['none', 'standard', 'robust']
algorithm = ['linear', 'decision_tree', 'svr', 'lasso'] 

In [123]:
# Running scenario function
create_scenarios(features, outlier, scalling, algorithm)

There are 12 possible scenarios.


In [90]:
# Defining variables to run feature engineering and algorithms function
continuous = ['area', 'ext_area']
categorical = ['room', 'bath', 'garage', 'zone', 'property']
target = ['price'] 

## Testing scenarios

In [124]:
# SCENARIO 0 

# defining scenario
scenario = scenarios[0]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_0 = run_model(df, scenario, target)
scen_0

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'linear'}


[{'R2:': 0.35614203042969284,
  'MSE:': 1593662919.0016327,
  'MSRE:': 39920.70789705053,
  'MAE:': 15657.942185034006,
  'MAPE': 2.3086412890363572}]

In [152]:
# SCENARIO 1

# defining scenario
scenario = scenarios[1]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_1 = run_model(df, scenario, target)
scen_1

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'decision_tree'}


[{'R2:': 0.2531910461547797,
  'MSE:': 1848484904.3894105,
  'MSRE:': 42994.010098959254,
  'MAE:': 13887.447834636541,
  'MAPE': 1.0927100772248888}]

In [126]:
# SCENARIO 2 

# defining scenario
scenario = scenarios[2]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_2 = run_model(df, scenario, target)
scen_2

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'svr'}


[{'R2:': -0.08134390873664432,
  'MSE:': 2676518380.5862594,
  'MSRE:': 51735.078820721435,
  'MAE:': 16251.61048948741,
  'MAPE': 1.0236373662571732}]

In [127]:
# SCENARIO 3

# defining scenario
scenario = scenarios[3]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_3 = run_model(df, scenario, target)
scen_3

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'lasso'}


[{'R2:': 0.35612276078945904,
  'MSE:': 1593710614.786347,
  'MSRE:': 39921.30527408074,
  'MAE:': 15657.483825525995,
  'MAPE': 2.3082998421390033}]

In [128]:
# SCENARIO 4

# defining scenario
scenario = scenarios[4]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_4 = run_model(df, scenario, target)
scen_4

{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'linear'}


[{'R2:': 0.35614203042969284,
  'MSE:': 1593662919.0016327,
  'MSRE:': 39920.70789705053,
  'MAE:': 15657.942185034006,
  'MAPE': 2.3086412890363572}]

In [153]:
# SCENARIO 5

# defining scenario
scenario = scenarios[5]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_5 = run_model(df, scenario, target)
scen_5

{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'decision_tree'}


[{'R2:': 0.2531910461547797,
  'MSE:': 1848484904.3894105,
  'MSRE:': 42994.010098959254,
  'MAE:': 13887.447834636541,
  'MAPE': 1.0927100772248888}]

In [130]:
# SCENARIO 6 

# defining scenario
scenario = scenarios[6]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_6 = run_model(df, scenario, target)
scen_6

{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'svr'}


[{'R2:': -0.08134390873664432,
  'MSE:': 2676518380.5862594,
  'MSRE:': 51735.078820721435,
  'MAE:': 16251.61048948741,
  'MAPE': 1.0236373662571732}]

In [131]:
# SCENARIO 7

# defining scenario
scenario = scenarios[7]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_7 = run_model(df, scenario, target)
scen_7

{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'lasso'}


[{'R2:': 0.35612276078945904,
  'MSE:': 1593710614.786347,
  'MSRE:': 39921.30527408074,
  'MAE:': 15657.483825525995,
  'MAPE': 2.3082998421390033}]

In [134]:
# SCENARIO 8

# defining scenario
scenario = scenarios[8]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_8 = run_model(df, scenario, target)
scen_8

{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'linear'}


[{'R2:': 0.35614203042969284,
  'MSE:': 1593662919.0016327,
  'MSRE:': 39920.70789705053,
  'MAE:': 15657.942185034006,
  'MAPE': 2.3086412890363572}]

In [155]:
# SCENARIO 9

# defining scenario
scenario = scenarios[9]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_9 = run_model(df, scenario, target)
scen_9

{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'decision_tree'}


[{'R2:': 0.2531910461547797,
  'MSE:': 1848484904.3894105,
  'MSRE:': 42994.010098959254,
  'MAE:': 13887.447834636541,
  'MAPE': 1.0927100772248888}]

In [154]:
# SCENARIO 10

# defining scenario
scenario = scenarios[10]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_10 = run_model(df, scenario, target)
scen_10

{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'svr'}


[{'R2:': -0.08134390873664432,
  'MSE:': 2676518380.5862594,
  'MSRE:': 51735.078820721435,
  'MAE:': 16251.61048948741,
  'MAPE': 1.0236373662571732}]

In [137]:
# SCENARIO 11

# defining scenario
scenario = scenarios[11]
print(scenario)

# Runnig model function
warnings.filterwarnings("ignore")
scen_11 = run_model(df, scenario, target)
scen_11

{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'lasso'}


[{'R2:': 0.35612276078945904,
  'MSE:': 1593710614.786347,
  'MSRE:': 39921.30527408074,
  'MAE:': 15657.483825525995,
  'MAPE': 2.3082998421390033}]

## Conclusion

The better score in this dataset was a R squared of 0.356 with linear regression, independent of scalling or not. It's a really bad prediction result, onde it's correct only about 36% os the sample. 

For improvement of the model:
* Outliers treatment, although decision tree doesn't have a good performance
* Test hyperparameters looking for better fits
* Collect more data to get a bigger sample which can represent better the population
* Get data from other seasons, which will represent better the year data behavior

In [146]:
print("{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'linear'}")
print(scen_0)
print('-'*30)
print("{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'linear'}")
print(scen_4)
print('-'*30)
print("{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'linear'}")
print(scen_8)

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'algorithm': 'linear'}
[{'R2:': 0.35614203042969284, 'MSE:': 1593662919.0016327, 'MSRE:': 39920.70789705053, 'MAE:': 15657.942185034006, 'MAPE': 2.3086412890363572}]
------------------------------
{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'algorithm': 'linear'}
[{'R2:': 0.35614203042969284, 'MSE:': 1593662919.0016327, 'MSRE:': 39920.70789705053, 'MAE:': 15657.942185034006, 'MAPE': 2.3086412890363572}]
------------------------------
{'features': 'all', 'outlier': 'none', 'scalling': 'robust', 'algorithm': 'linear'}
[{'R2:': 0.35614203042969284, 'MSE:': 1593662919.0016327, 'MSRE:': 39920.70789705053, 'MAE:': 15657.942185034006, 'MAPE': 2.3086412890363572}]
