## Preparing notebook

In [10]:
# Imports

import pandas as pd
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [5]:
# Importing the DF from local machine
df = 'df_trt.csv'
df = pd.read_csv(df, sep=';')

In [6]:
# checking the DF
df.head()

Unnamed: 0.1,Unnamed: 0,area,room,bath,garage,price,properties,zone
0,0,30,0,1,1,1400,comercial,centro
1,1,30,3,4,1,16000,residencial,leste
2,2,230,0,1,0,6950,comercial,norte
3,3,230,3,5,2,9950,residencial,centro
4,4,80,0,5,1,100,comercial,centro


## Feature Engineering

In [7]:
# Deleted column Unnamed
df.drop(columns='Unnamed: 0', inplace=True)

In [8]:
# Transforming the properties and zone into label columns


Unnamed: 0,area,room,bath,garage,price,properties,zone
0,30,0,1,1,1400,comercial,centro
1,30,3,4,1,16000,residencial,leste
2,230,0,1,0,6950,comercial,norte
3,230,3,5,2,9950,residencial,centro
4,80,0,5,1,100,comercial,centro


In [11]:
# Label enconding

## Variables
le = LabelEncoder()

## Fit transform | columns properties and zone
for c in ['properties', 'zone']:
  df[c] = le.fit_transform(df[c])


In [42]:
df.head()

Unnamed: 0,area,room,bath,garage,price,properties,zone
0,30,0,1,1,1400,0,0
1,30,3,4,1,16000,1,2
2,230,0,1,0,6950,0,3
3,230,3,5,2,9950,1,0
4,80,0,5,1,100,0,0


## Algorithm  | Regression Problem

In [129]:
from sklearn.preprocessing import StandardScaler

In [98]:
# Defining function for scenarios

def create_scenarios(features, outlier, scalling, rebalance, algorithm):
  ''' It combines de list of strings from variables to create 
  differents scenarios to run de model.
  Input: variables separated by coma. Each variable contains a list of strings.
        if there are no situation in some variable, insert 'none'. 
  Output: a variable called 'scenarios' which contains a list of dictionary. '''

  # Creating scenarios/combinations
  global scenarios 
  scenarios = []
  for f in features:
    for o in outlier:
      for s in scalling:
        for r in rebalance:
          for a in algorithm:
              scenario = {'features': f,
                          'outlier': o, 
                          'scalling': s,
                          'rebalance': r,
                          'algorithm': a,
                          }
              scenarios.append(scenario)
  print(f'There are {len(scenarios)} possible scenarios.')

In [136]:
# Defining function for feature engineering and algorithms

def run_model(df, scenarios, target):
  '''
  Function to run the scenarios.
  Input: dataframe,
         variable 'scenarios'
  Output: return the scenario and it's metrics.
  '''

  # Running each scenario
  for scenario in scenarios: 
    print(scenario)
    dft = df.copy()
    ## Split train & test0
    X = dft.drop(columns = target)
    y = dft[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    ## outliers
    if scenario['outlier'] == 'none':
      pass
    #  print('Data without Outliers treatment')
    else:
        print('Passed the outliers')
    ## scalling  
    if scenario['scalling'] == 'none':
    #  print('Data without scalling treatment')
        pass
    elif scenario['scalling'] == 'standard':
        model = StandardScaler()
        model.fit_transform(X_train, y_train)
        model.fit_transform(X_test, y_test)
        
    else:
      print('Passed the scalling')
    ## rebalance
    if scenario['rebalance'] == 'none':
    #  print('Data without rebalance')
      pass
    else:
      print('Passed the rebalance')
    ## algorithm
    if scenario['algorithm'] == 'linear':
      model = LinearRegression()
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
    #  print('LN')
    elif scenario['algorithm'] == 'decision_tree':
      model = DecisionTreeRegressor(k_neigh) 
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
    #  print('DTR')
    elif scenario['algorithm'] == 'svr':
      model = SVR()
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
    #  print('SVR')
    elif scenario['algorithm'] == 'lasso':
      model = linear_model.Lasso()
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
    #  print('Lasso')
    else:
      print('Passed the algorithm')
    global results
    results = [{'R2:': r2_score(y_test, y_pred),
                'MSE:': mean_squared_error(y_test, y_pred),
                'MSRE:': mean_squared_error(y_test, y_pred, squared=False),
                'MAE:': mean_absolute_error(y_test, y_pred)}]
    print(results)
    print('='*40)

In [131]:
# Defining variables to run scenario function
features = ['all']
outlier = ['none']
scalling = ['none', 'standard']
rebalance = ['none']
algorithm = ['linear', 'svr', 'lasso'] 

In [132]:
# Running scenario function
create_scenarios(features, outlier, scalling, rebalance, algorithm)

There are 6 possible scenarios.


In [121]:
# Defining variables to run feature engineering and algorithms function
continuous = ['area', 'price']
categorical = ['room', 'bath', 'garage', 'zone', 'price']
target = ['price'] 

In [137]:
# Runnig model function
run_model(df, scenarios, target)

{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'rebalance': 'none', 'algorithm': 'linear'}
[{'R2:': 0.20456427992492632, 'MSE:': 1537453657.8601656, 'MSRE:': 39210.37691555853, 'MAE:': 15354.286312133152}]
{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'rebalance': 'none', 'algorithm': 'svr'}


  y = column_or_1d(y, warn=True)


[{'R2:': -0.15973242934673126, 'MSE:': 485628557.9447565, 'MSRE:': 22036.981597867627, 'MAE:': 10048.813323716715}]
{'features': 'all', 'outlier': 'none', 'scalling': 'none', 'rebalance': 'none', 'algorithm': 'lasso'}
[{'R2:': 0.19836031515738495, 'MSE:': 1102470408.1681056, 'MSRE:': 33203.46982121154, 'MAE:': 15083.029540206478}]
{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'rebalance': 'none', 'algorithm': 'linear'}
[{'R2:': 0.19667113305871942, 'MSE:': 913198775.3540604, 'MSRE:': 30219.178932493523, 'MAE:': 14268.726256164753}]
{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'rebalance': 'none', 'algorithm': 'svr'}


  y = column_or_1d(y, warn=True)


[{'R2:': -0.0905897632004895, 'MSE:': 1192223934.5510821, 'MSRE:': 34528.59589602627, 'MAE:': 12326.712762210094}]
{'features': 'all', 'outlier': 'none', 'scalling': 'standard', 'rebalance': 'none', 'algorithm': 'lasso'}
[{'R2:': 0.3122007832968866, 'MSE:': 944582549.2309554, 'MSRE:': 30734.061710599777, 'MAE:': 14345.373087986623}]
