## Preparing the notebook

In [1]:
!pip install category-encoders



In [2]:
# imports
import pandas as pd
from scipy import stats

from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import pickle as pk
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

## Scalers in pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

## ML code

In [45]:
# Load the dataset from a local CSV file; its fields are separated by ';'
df = pd.read_csv(
    'df_trt.csv',
    sep=';',
)

In [46]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,area,room,bath,garage,price,ext_area,property,zone
0,0,30,0,1,1,1400,30,commercial,midtown
1,1,30,3,4,1,16000,230,residential,east
2,2,230,0,1,0,6950,80,commercial,north
3,3,230,3,5,2,9950,300,residential,midtown
4,4,80,0,5,1,100,350,commercial,midtown


In [73]:
# Drop every index-artifact column left behind by earlier CSV round-trips.
# df.head() shows two of them ('Unnamed: 0.1' and 'Unnamed: 0'); removing only one
# leaves a row-id column in the features, which the ColumnTransformer's
# remainder='passthrough' would hand to the regressor as if it were a predictor.
# Selecting by prefix also makes the cell safe to re-run (no KeyError the second time).
unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_cols)

In [None]:
# Remove the original 'area' column so that 'ext_area' can take over its name afterwards.
# The drop is guarded on 'ext_area' still being present: once the rename has run,
# 'area' holds the former ext_area values, and re-running an unguarded drop would
# silently delete that feature. errors='ignore' covers a re-run before the rename.
if 'ext_area' in df.columns:
    df = df.drop(columns='area', errors='ignore')

In [102]:
# Expose 'ext_area' under the feature name 'area'
df.rename({'ext_area': 'area'}, axis='columns', inplace=True)

## Creating the pipeline

In [103]:
# Hold out 20% of the complete (NaN-free) rows as the test partition.
# The fixed random_state makes the shuffle reproducible.
train, test = train_test_split(
    df.dropna(),
    test_size=0.2,
    random_state=42,
)

# Separate the "price" target from the predictor columns in each partition
y_train = train["price"]
X_train = train.drop(columns=["price"])
y_test = test["price"]
X_test = test.drop(columns=["price"])

In [104]:
# Inspect the training partition (target column "price" still included)
train

Unnamed: 0,room,bath,garage,price,area,property,zone
1433,3,2,2,5500,120,residential,midtown
630,0,1,0,1150,52,commercial,midtown
78,1,3,0,7000,166,residential,midtown
366,4,4,2,7000,140,residential,midtown
1994,2,2,1,2800,69,residential,midtown
...,...,...,...,...,...,...,...
1638,0,1,1,2000,32,commercial,midtown
1095,0,1,3,2300,82,commercial,continental
1130,4,5,6,120000,437,residential,north
1294,0,1,0,1630,126,commercial,continental


In [105]:
# Preprocessing building blocks: a standard scaler and an ordinal encoder
standard, label = StandardScaler(), OrdinalEncoder()

In [106]:
# Base estimator: an unfitted linear regression with default settings
linear = LinearRegression()

In [107]:
# Column-wise preprocessing:
#   'area'              -> standard scaler
#   'property', 'zone'  -> ordinal encoder
#   all other columns   -> forwarded unchanged (remainder='passthrough')
preprocess = ColumnTransformer(
    transformers=[
        ('scalling', standard, ['area']),
        ('encoder', label, ['property', 'zone']),
    ],
    remainder='passthrough',
)


In [108]:
# Sanity check: fit the preprocessing step on its own and keep the transformed
# training features in lr_ft for inspection
lr_ft = preprocess.fit_transform(X_train)

In [109]:
# Wrap the regressor so the target is standardized for fitting and predictions
# are mapped back to the original price scale.
# Reuses the `linear` estimator defined earlier instead of building a second,
# anonymous LinearRegression; the wrapper fits a clone, so `linear` itself stays unfitted.
model = TransformedTargetRegressor(
    regressor=linear,
    transformer=standard,
)

In [110]:
# Chain the two stages: column preprocessing first, then the target-transformed regressor
pipe = Pipeline(
    steps=[
        ('pre', preprocess),
        ('line', model),
    ]
)

In [111]:
# Fit the whole pipeline (preprocessing + regressor) on the training split;
# the fitted pipeline is the cell's displayed value
pipe.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scalling', StandardScaler(),
                                                  ['area']),
                                                 ('encoder', OrdinalEncoder(),
                                                  ['property', 'zone'])])),
                ('line',
                 TransformedTargetRegressor(regressor=LinearRegression(),
                                            transformer=StandardScaler()))])

In [112]:
# In-sample predictions: run the fitted pipeline on the same rows it was trained on
y_pred_train = pipe.predict(X_train)

In [113]:
# Display the predicted prices for the training rows
y_pred_train

array([15201.49606091,  1958.00323176, 12406.78409954, ...,
       43373.03293458,  5442.42237398, 24355.33281683])

## Creating pickles

In [114]:
# Persist the fitted pipeline to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; a bare open() inside the dump call leaves the handle dangling.
with open('model_pipe.pkl', 'wb') as model_file:
    pk.dump(pipe, model_file)