In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'data/2022-11-20_DataCleaned.csv',
    index_col=0,
)

## Graphs

In [None]:
# Box plot of price grouped (and coloured) by the `rooms` column.
graph1 = px.box(
    df,
    x='rooms',
    y='price',
    color='rooms',
    title=' Sales Price per different rooms',
    template='plotly_dark',
)
graph1

In [None]:
# Box plot of price grouped (and coloured) by the `floor` column.
graph2 = px.box(
    df,
    x='floor',
    y='price',
    color='floor',
    title=' Sales Price per different floor',
    template='plotly_dark',
)
graph2

In [None]:
# Box plot of price grouped (and coloured) by the `bedrooms` column.
graph3 = px.box(
    df,
    x='bedrooms',
    y='price',
    color='bedrooms',
    title=' Sales Price per different bedrooms',
    template='plotly_dark',
)
graph3

In [None]:
# Scatter of price against area; points are coloured by city and sized by area.
# A title is set so the figure can be read on its own, like the box plots.
graph4 = px.scatter(
    df,
    x='area',
    y='price',
    color='city',
    size='area',
    template='plotly_dark',
    title='Sales Price by area and city',
)
graph4

# Model

In [76]:
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_predict
from sklearn.pipeline import make_pipeline
import numpy as np

In [77]:
# Columns left out of the feature set, and the column the models predict.
DROP_COLS = ['address']
target_column = 'price'

In [78]:
# Candidate feature columns: every column except the ones listed in DROP_COLS.
feature_candidates = df.columns.drop(DROP_COLS)

# Object-dtype columns are treated as categorical features.
categorical_cols = [
    c for c in feature_candidates
    if pd.api.types.is_object_dtype(df[c].dtype)
]

# Use pandas' dtype predicates instead of comparing against the builtins
# `float`/`int`: that comparison only matches the platform-default widths, so
# int32/float32 columns (or int64 where the default integer is 32-bit) would
# silently be left out of the feature set. Booleans are still excluded.
numerical_cols = [
    c for c in feature_candidates
    if c != target_column
    and (
        pd.api.types.is_integer_dtype(df[c].dtype)
        or pd.api.types.is_float_dtype(df[c].dtype)
    )
]

print("Categorical columns are: " + ", ".join(categorical_cols))
print("Numerical columns are: " + ", ".join(numerical_cols))

Categorical columns are: energy_label, maakond, city, district
Numerical columns are: rooms, bedrooms, area, floor, total_floors, year, flag_balcony, flag_lift, flag_shower, flag_bath, flag_sauna, flag_generalheat, flag_floorheat, flag_gasheat, flag_electricalheat, flag_brick, flag_panel, flag_beton, flag_stone, flag_wood, flag_newbuilding, flag_indevelopment, flag_renovated, flag_needsrenovation, zipcode, lat, lng


In [79]:
# Feature matrix: numeric columns first, then the categorical ones.
X = df.loc[:, [*numerical_cols, *categorical_cols]]
# Regression target.
y = df.loc[:, target_column]

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [81]:
def cv(model, params, X=None, y=None):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    Runs a 5-fold ``GridSearchCV`` scored by negative root-mean-squared error,
    prints the winning parameter combination, and returns
    ``grid.best_estimator_``. With GridSearchCV's default ``refit=True`` the
    returned estimator has already been refit on all of the data passed in.

    Parameters
    ----------
    model : estimator
        Estimator (here: a pipeline) to tune.
    params : dict
        Parameter grid, keyed by ``<step name>__<parameter>``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which is the original behaviour.

    Returns
    -------
    The best estimator found by the search.
    """
    # Fall back to the notebook globals so existing `cv(model, params)` calls
    # keep working unchanged.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid = GridSearchCV(model, params, scoring='neg_root_mean_squared_error', cv=5)
    grid.fit(X, y)
    print(grid.best_params_)
    return grid.best_estimator_

In [82]:
# Hyper-parameter grid for the LightGBM step; each key is
# '<pipeline step name>__<parameter>'.
params = {
    'lgbmregressor__num_leaves': [7, 14, 21, 28, 31, 50],
    'lgbmregressor__max_depth': [-1, 3, 5, 8],
    'lgbmregressor__learning_rate': [0.15, 0.1, 0.01, 0.001],
    'lgbmregressor__boosting_type': ['gbdt', 'dart', 'goss'],
}

# Preprocessing + regressor: standardise the numeric features and one-hot
# encode the categorical ones (handle_unknown='ignore' keeps categories that
# were not seen during fit from raising at predict time).
lgbm = LGBMRegressor()
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ]
    ),
    lgbm,
)

In [83]:
# cv() returns GridSearchCV's best_estimator_, which (default refit=True) has
# already been refit on all of X_train, so no second fit() call is needed.
best_model = cv(model, params)
preds = best_model.predict(X_test)

# Hold-out metrics: root of the mean squared error, and R^2.
rmse = mean_squared_error(y_test, preds) ** 0.5
print("The RMSFE error is:", round(rmse, 2))
print("The R2 is:", round(r2_score(y_test, preds), 2))

{'lgbmregressor__boosting_type': 'goss', 'lgbmregressor__learning_rate': 0.15, 'lgbmregressor__max_depth': 8, 'lgbmregressor__num_leaves': 31}
The RMSFE error is: 36649.88
The R2 is: 0.91


In [84]:
# Seed the forest so the grid search and the reported hold-out metrics are
# reproducible from run to run (same seed as the train/test split).
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid for the random-forest step; each key is
# '<pipeline step name>__<parameter>'.
params = {
    'randomforestregressor__max_depth': [5, 15, 30],
    'randomforestregressor__n_estimators': [10, 50, 100, 150],
    'randomforestregressor__min_samples_split': [2, 4, 8],
    'randomforestregressor__max_features': ["sqrt", "log2"],
    'randomforestregressor__bootstrap': [True, False],
}

# Same preprocessing as the other models: standardise the numeric features,
# one-hot encode the categorical ones.
model = make_pipeline(
    ColumnTransformer([
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]),
    rf,
)

In [85]:
# cv() returns GridSearchCV's best_estimator_, which (default refit=True) has
# already been refit on all of X_train. Fitting again would only repeat the
# work and, for an unseeded forest, replace the selected model with a
# different random one.
best_model = cv(model, params)
preds = best_model.predict(X_test)


{'randomforestregressor__bootstrap': False, 'randomforestregressor__max_depth': 30, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 150}


In [86]:
# Score the current `preds` against the held-out targets.
rmse = mean_squared_error(y_test, preds) ** 0.5
r2 = r2_score(y_test, preds)
print("The RMSFE error is:", round(rmse, 2))
print("The R2 is:", round(r2, 2))

The RMSFE error is: 35208.27
The R2 is: 0.92


In [87]:
# Columns left out of the linear model's inputs.
lm_excluded_cols = ['lat', 'lng']

X_train_lm = X_train.drop(columns=lm_excluded_cols)
X_test_lm = X_test.drop(columns=lm_excluded_cols)

# Rebind to a filtered copy instead of calling list.remove(): remove() mutates
# the existing list in place and raises ValueError when this cell is run a
# second time, whereas this form is idempotent and leaves the old list alone.
numerical_cols = [c for c in numerical_cols if c not in lm_excluded_cols]

In [88]:
# Display the current list of categorical feature columns.
categorical_cols

['energy_label', 'maakond', 'city', 'district']

In [89]:
# Linear regression on standardised numeric + one-hot encoded categorical
# features, trained on the frames that have lat/lng removed.
lm = LinearRegression()
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ]
    ),
    lm,
)
model.fit(X_train_lm, y_train)
preds = model.predict(X_test_lm)

# Hold-out metrics: root of the mean squared error, and R^2.
rmse = mean_squared_error(y_test, preds) ** 0.5
r2 = r2_score(y_test, preds)
print("The RMSFE error is:", round(rmse, 2))
print("The R2 is:", round(r2, 2))

The RMSFE error is: 50382.48
The R2 is: 0.83
