In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the prepared dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='ultimate_df.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,floor_category,price
0,0.0,80.0,3.0,3.0,3.0,3.0,1527.0,1.0,0.0,0.0,2.0,1.55
1,0.0,89.0,2.0,2.0,4.0,1.0,1280.0,0.0,0.0,0.0,0.0,0.71
2,0.0,103.0,3.0,4.0,4.0,2.0,1608.0,1.0,0.0,2.0,1.0,1.25
3,0.0,82.0,2.0,2.0,2.0,0.0,1084.0,0.0,0.0,1.0,2.0,0.95
4,0.0,74.0,3.0,3.0,3.0,3.0,1800.0,1.0,1.0,0.0,0.0,3.2


In [5]:
# The target is the `price` column; every remaining column is used as a feature.
y = df['price']
X = df.drop('price', axis='columns')

In [6]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [7]:
# One-hot encode: sector, balcony, agePossession, furnishing_type, floor_category

In [8]:
# Columns that are handed to the one-hot encoder.
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'floor_category',
]

In [9]:
# Model log1p(price) instead of the raw price. Predictions made against this
# target are therefore in log1p space and must be mapped back with np.expm1
# before they are compared with actual prices.
y_transformed = np.log1p(y)

In [15]:
# Preprocessing step shared by the models below:
#   * 'num' - standardise the listed numeric columns with StandardScaler
#   * 'cat' - one-hot encode `columns_to_encode`
# drop='first' removes one dummy column per encoded feature, and
# handle_unknown='ignore' stops `transform` from raising on a category that was
# not present during `fit`.
# Any column not listed in either transformer is passed through unchanged.
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        (
            'num',
            StandardScaler(),
            [
                'property_type',
                'bedRoom',
                'bathroom',
                'built_up_area',
                'servant room',
                'store room',
            ],
        ),
        ('cat', encoder, columns_to_encode),
    ],
)

In [16]:
# Linear-regression pipeline: the preprocessing step followed by the regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [17]:
# 10-fold cross-validation (shuffled, fixed seed), scored by R^2 on the
# log1p-transformed target.
kfold = KFold(random_state=42, shuffle=True, n_splits=10)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)



In [18]:
# Average R^2 across the cross-validation folds (linear regression).
np.mean(scores)

0.7917974111521545

In [19]:
# Spread of the per-fold R^2 values (linear regression).
np.std(scores)

0.02006933855318245

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the target stays in log1p space.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Fit the linear-regression pipeline on the training split.
pipeline.fit(X=X_train, y=y_train)

In [22]:
# Predictions are in log1p(price) space because the pipeline was fit on the
# transformed target.
y_pred = pipeline.predict(X=X_test)

In [23]:
# Map the linear-regression predictions from log1p space back to the original
# price scale. The predictions are recomputed from the model instead of
# transforming `y_pred` in place, so this cell is idempotent: re-running it can
# no longer apply expm1 a second time to already back-transformed values.
y_pred = np.expm1(pipeline.predict(X_test))

In [24]:
from sklearn.metrics import mean_absolute_error
# MAE on the original price scale: the held-out targets are stored in log1p
# space, so they are back-transformed before the comparison.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.7578523496589293

In [25]:
from sklearn.svm import SVR

In [26]:
# SVR pipeline (RBF kernel) using the same preprocessing configuration as the
# linear-regression pipeline.
# The preprocessor is cloned so this pipeline owns its own unfitted copy:
# Pipeline does not copy its steps, so passing the same `preprocessor` object
# would make `pipeline1.fit` refit the transformer that `pipeline` also holds.
from sklearn.base import clone

pipeline1 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', SVR(kernel='rbf'))
])

In [27]:
# K-fold cross-validation for the SVR pipeline, scored by R^2 on the
# log1p-transformed target.
# `kfold1` uses the same settings as the splitter for the linear-regression run
# (10 folds, shuffled, seed 42), so both models are scored on identical folds.
# It is passed explicitly here so this cell does not depend on `kfold` from an
# earlier cell and `kfold1` is no longer left unused.
kfold1 = KFold(n_splits=10, shuffle=True, random_state=42)
scores1 = cross_val_score(pipeline1, X, y_transformed, cv=kfold1, scoring='r2')



In [28]:
# Average R^2 across the cross-validation folds (SVR).
np.mean(scores1)

0.8851610005006053

In [29]:
# Spread of the per-fold R^2 values (SVR).
np.std(scores1)

0.012518121273929058

In [30]:
# Fit the SVR pipeline on the training split.
pipeline1.fit(X=X_train, y=y_train)

In [31]:
# SVR predictions, still in log1p(price) space because the pipeline was fit on
# the transformed target.
y_pred = pipeline1.predict(X=X_test)

In [32]:
# Map the SVR predictions from log1p space back to the original price scale.
# The predictions are recomputed from the model instead of transforming
# `y_pred` in place, so this cell is idempotent: re-running it can no longer
# apply expm1 a second time to already back-transformed values.
y_pred = np.expm1(pipeline1.predict(X_test))

In [33]:
# MAE of the SVR model on the original price scale: the held-out targets are
# stored in log1p space, so they are back-transformed before the comparison.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.5277460157458749