In [2]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
import joblib

In [16]:
input_file = "../scraping/realestate.csv"
df = pd.read_csv(input_file)

In [17]:
df.head()

Unnamed: 0,description,price,rooms,area,location,floor
0,BATTALGAZİDE 4+1 GÜNEYBATI CEPHE GİRİŞ FİYA...,1.850.000 TL,weekend4+1,texture155 m2,Ankara - Altındağ - Battalgazi Mh.,layers3. Kat
1,Yapracık Atayurt Mahallesi Satılık 2+1 Bağıms...,2.790.000 TL,weekend2+1,texture95 m2,Ankara - Etimesgut - Atayurt Mh.,layers1. Kat
2,EMEKTE 2+1 YENİ LÜKS ÖN BAHÇE KATI,3.750.000 TL,weekend2+1,texture90 m2,Ankara - Çankaya - Emek Mh.,layersBahçe katı
3,ETİMESGUT'TA BAĞLICA 2 DE YENİ BİNADA PARK MA...,2.579.000 TL,weekend3+1,texture125 m2,Ankara - Etimesgut - Alsancak Mh.,layers1. Kat
4,FATİH MEVLANA MAH. YILDIZ EMLAKTAN 3+1 ASANSÖ...,1.589.000 TL,weekend3+1,texture125 m2,Ankara - Sincan - Mevlana Mh.,layersYüksek giriş


In [18]:
#preprocessing
df['area'] = df['area'].str.extract(r'(\d+)')
df['price'] = df['price'].astype(str).replace(r'[^\d.]', '', regex=True).str.replace('.', '').astype(float)
df['rooms'] = df['rooms'].str.extract(r'(\d+)')
df = df[df['floor'].str.contains('layers', na=False)].assign(floor=lambda x: x['floor'].str.extract(r'(\d+)').fillna(0).astype(int))
df['location'] = df['location'].apply(lambda x: x.split(' - ')[1])
df = df.dropna()

output_file = 'realestate_preprocessed.csv'

df.to_csv(output_file, index=False)

In [19]:
df.head()

Unnamed: 0,description,price,rooms,area,location,floor
0,BATTALGAZİDE 4+1 GÜNEYBATI CEPHE GİRİŞ FİYA...,1850000.0,4,155,Altındağ,3
1,Yapracık Atayurt Mahallesi Satılık 2+1 Bağıms...,2790000.0,2,95,Etimesgut,1
2,EMEKTE 2+1 YENİ LÜKS ÖN BAHÇE KATI,3750000.0,2,90,Çankaya,0
3,ETİMESGUT'TA BAĞLICA 2 DE YENİ BİNADA PARK MA...,2579000.0,3,125,Etimesgut,1
4,FATİH MEVLANA MAH. YILDIZ EMLAKTAN 3+1 ASANSÖ...,1589000.0,3,125,Sincan,0


In [20]:
#encoding
label_encoder = LabelEncoder()
df['location'] = label_encoder.fit_transform(df['location'])

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_mapping_df = pd.DataFrame(list(label_mapping.items()), columns=['location', 'Encoded_Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)

df[['location', 'location']]
df.to_csv('realestate_encoded.csv', index=False)

In [21]:
df.head()

Unnamed: 0,description,price,rooms,area,location,floor
0,BATTALGAZİDE 4+1 GÜNEYBATI CEPHE GİRİŞ FİYA...,1850000.0,4,155,0,3
1,Yapracık Atayurt Mahallesi Satılık 2+1 Bağıms...,2790000.0,2,95,2,1
2,EMEKTE 2+1 YENİ LÜKS ÖN BAHÇE KATI,3750000.0,2,90,10,0
3,ETİMESGUT'TA BAĞLICA 2 DE YENİ BİNADA PARK MA...,2579000.0,3,125,2,1
4,FATİH MEVLANA MAH. YILDIZ EMLAKTAN 3+1 ASANSÖ...,1589000.0,3,125,8,0


In [4]:
#training models
dataset = pd.read_csv('realestate_encoded.csv')
X = dataset[['rooms', 'area', 'floor', 'location']]
y = dataset['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.19, random_state=9)

In [5]:
#gradient boosting regression
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, random_state=0) 
gb_regressor.fit(X_train, y_train)
y_pred_gb = gb_regressor.predict(X_test)
gb_r2_score = r2_score(y_test, y_pred_gb)
print(f'Gradient Boosting Regression R-squared Score: {gb_r2_score}')

joblib.dump(gb_regressor, 'model.pkl')

Gradient Boosting Regression R-squared Score: 0.5709159943997233


['model.pkl']

In [6]:
#decision tree regression
dt_regressor = DecisionTreeRegressor(random_state = 0)
dt_regressor.fit(X_train, y_train)
y_predDT = dt_regressor.predict(X_test)
dt_r2_score = r2_score(y_test, y_predDT)
print(f'Decision Tree Regression R-squared Score: {dt_r2_score}')

Decision Tree Regression R-squared Score: 0.5339916365803159


In [7]:
#lasso regression
lasso_regressor = Lasso(alpha=1.0, random_state=0)
lasso_regressor.fit(X_train, y_train)
y_pred_lasso = lasso_regressor.predict(X_test)
lasso_r2_score = r2_score(y_test, y_pred_lasso)
print(f'Lasso Regression R-squared Score: {lasso_r2_score}')

Lasso Regression R-squared Score: 0.4806034112297699


In [8]:
#multiple linear regression
ml_regressor = LinearRegression()
ml_regressor.fit(X_train, y_train)
y_pred = ml_regressor.predict(X_test)
ml_r2_score = r2_score(y_test, y_pred) 
print(f'Multiple Linear Regression R-squared Score: {ml_r2_score}')

Multiple Linear Regression R-squared Score: 0.4806034648026333


In [9]:
#random forest regression
rf_regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
rf_regressor.fit(X_train, y_train)
y_predRF = rf_regressor.predict(X_test)
rf_r2_score = r2_score(y_test, y_predRF)
print(f'Random Forest Regression R-squared Score: {rf_r2_score}')

Random Forest Regression R-squared Score: 0.5756006870132004


In [10]:
#ridge regression
ridge_regressor = Ridge(alpha=1.0)
ridge_regressor.fit(X_train, y_train)
y_pred_ridge = ridge_regressor.predict(X_test)
ridge_r2_score = r2_score(y_test, y_pred_ridge)
print(f'Ridge Regression R-squared Score: {ridge_r2_score}')

Ridge Regression R-squared Score: 0.4805861929311609


In [11]:
#support vector regression
svr_regressor = SVR(kernel = 'rbf',degree=3)
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
svr_r2_score = r2_score(y_test, y_pred)
print(f'SVR R-squared Score: {svr_r2_score}')

SVR R-squared Score: -0.0638975850334762
