In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,r2_score


In [2]:
# Load the MagicBricks listings CSV into a DataFrame.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload
# location; confirm before running in another environment.
df=pd.read_csv('/content/MagicBricks.csv')

In [3]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,15500000,Ready_to_move,New_Property,Builder_Floor,6667.0
6,1350.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,10000000,Ready_to_move,Resale,Builder_Floor,6667.0
7,650.0,2,2.0,Semi-Furnished,"Delhi Homes, Rohini Sector 24",1.0,4000000,Ready_to_move,New_Property,Apartment,6154.0
8,985.0,3,3.0,Unfurnished,Rohini Sector 21,1.0,6800000,Almost_ready,New_Property,Builder_Floor,6154.0
9,1300.0,4,4.0,Semi-Furnished,Rohini Sector 22,1.0,15000000,Ready_to_move,New_Property,Builder_Floor,6154.0


In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Per_Sqft
count,1259.0,1259.0,1257.0,1226.0,1259.0,1018.0
mean,1466.452724,2.796664,2.556086,1.935563,21306700.0,15690.136542
std,1568.05504,0.954425,1.04222,6.279212,25601150.0,21134.738568
min,28.0,1.0,1.0,1.0,1000000.0,1259.0
25%,800.0,2.0,2.0,1.0,5700000.0,6364.0
50%,1200.0,3.0,2.0,1.0,14200000.0,11291.5
75%,1700.0,3.0,3.0,2.0,25500000.0,18000.0
max,24300.0,10.0,7.0,114.0,240000000.0,183333.0


In [6]:
# (rows, columns) of the raw frame.
df.shape

(1259, 11)

In [7]:
# Count fully duplicated rows.
df.duplicated().sum()

83

In [8]:
# Drop exact duplicate rows (first occurrence is kept; the index is not reset).
df = df.drop_duplicates()

In [9]:
# Missing values per column after de-duplication.
df.isna().sum()

Area             0
BHK              0
Bathroom         1
Furnishing       5
Locality         0
Parking         31
Price            0
Status           0
Transaction      0
Type             5
Per_Sqft       227
dtype: int64

In [10]:
# Fill missing values column by column. Each result is assigned back to the
# column instead of calling fillna(inplace=True) on the df[...] selection:
# that chained in-place form emits a FutureWarning in pandas 2.x and no longer
# updates df once Copy-on-Write is the default.
# NOTE(review): Per_Sqft gaps are filled with Price / Area, so for those rows
# the column is computed directly from the Price column.
df['Per_Sqft'] = df['Per_Sqft'].fillna(df['Price'] / df['Area'])

# The remaining columns are filled with their most frequent value (mode).
for col in ['Bathroom', 'Furnishing', 'Parking', 'Type']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [11]:
# Re-check missing values per column after imputation.
df.isna().sum()

Area           0
BHK            0
Bathroom       0
Furnishing     0
Locality       0
Parking        0
Price          0
Status         0
Transaction    0
Type           0
Per_Sqft       0
dtype: int64

In [12]:
# Parking and Bathroom are float64 columns. The original cell only displayed
# an int64-converted copy and left df unchanged; persist the conversion in df,
# then display the two converted columns.
df = df.astype({'Parking': 'int64', 'Bathroom': 'int64'})
df[['Parking', 'Bathroom']]

Unnamed: 0,Parking,Bathroom
0,1,2
1,1,2
2,1,2
3,1,2
4,1,2
...,...,...
1254,3,5
1255,3,2
1256,3,3
1257,1,2


In [24]:
# Summary statistics after cleaning, transposed so each column is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area,1176.0,1447.543,1487.659,28.0,800.0,1172.5,1700.0,24300.0
BHK,1176.0,2.789966,0.9609931,1.0,2.0,3.0,3.0,10.0
Bathroom,1176.0,2.551871,1.052994,1.0,2.0,2.0,3.0,7.0
Parking,1176.0,1.953231,6.409197,1.0,1.0,1.0,2.0,114.0
Price,1176.0,21091730.0,25231740.0,1000000.0,5800000.0,14000000.0,26000000.0,240000000.0
Per_Sqft,1176.0,15108.51,19767.26,1250.0,6584.0,11111.0,17231.08,183333.0


**FEATURE SCALING AND LABEL ENCODING**

In [25]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

def preprocess_data(df):
    """Min-max scale the numeric columns and label-encode the object columns.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. All int64/float64 columns are rescaled with
        ``MinMaxScaler``; all object columns are replaced by the codes
        produced by ``LabelEncoder``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after transformation.

    Notes
    -----
    NOTE(review): every int64/float64 column is scaled, which includes any
    target column present in ``df``, and the scaler is fitted on all rows
    passed in (i.e. before any train/test split done by the caller).
    NOTE(review): calling this twice on the same frame is presumably not
    idempotent — the label codes written by the first call are integers and
    would be picked up by the numeric selection on a second call; confirm.
    """
    numeric_names = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_names] = MinMaxScaler().fit_transform(df[numeric_names])

    # Selected after scaling; scaling only touched the numeric columns.
    object_names = df.select_dtypes(include=['object']).columns
    for name in object_names:
        # fit_transform refits on every call, so one encoder per column is
        # equivalent to reusing a single encoder instance.
        df[name] = LabelEncoder().fit_transform(df[name])

    return df

# Scale and encode df; the function mutates df in place, and the returned
# frame is displayed as the cell output.
preprocess_data(df)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,0.031806,0.222222,0.166667,1,283,0.000000,0.023013,1,0,1,0.037758
1,0.029746,0.111111,0.166667,1,139,0.000000,0.016736,1,0,0,0.029750
2,0.037986,0.111111,0.166667,0,49,0.000000,0.060669,1,1,0,0.029750
3,0.023566,0.111111,0.166667,1,281,0.000000,0.013389,1,1,1,0.029750
4,0.025626,0.111111,0.166667,1,282,0.000000,0.021757,1,0,1,0.029750
...,...,...,...,...,...,...,...,...,...,...,...
1254,0.168507,0.333333,0.666667,2,44,0.017699,0.225941,1,0,1,0.064070
1255,0.042106,0.222222,0.166667,1,44,0.017699,0.048117,1,1,1,0.064070
1256,0.034896,0.222222,0.333333,1,44,0.017699,0.069038,1,0,1,0.064070
1257,0.039634,0.111111,0.166667,2,45,0.000000,0.043933,1,1,1,0.064070


In [45]:
# Feature matrix: every column except the target.
# NOTE(review): Per_Sqft stays in X, and its missing values were filled with
# Price / Area earlier in the notebook, so it carries target information —
# consider excluding it from the features.
X=df.drop(columns='Price')

In [46]:
# Target vector: the Price column (already min-max scaled by preprocess_data).
Y=df['Price']

In [47]:
# Display the feature matrix.
X

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Per_Sqft
0,0.031806,0.222222,0.166667,1,283,0.000000,1,0,1,0.037758
1,0.029746,0.111111,0.166667,1,139,0.000000,1,0,0,0.029750
2,0.037986,0.111111,0.166667,0,49,0.000000,1,1,0,0.029750
3,0.023566,0.111111,0.166667,1,281,0.000000,1,1,1,0.029750
4,0.025626,0.111111,0.166667,1,282,0.000000,1,0,1,0.029750
...,...,...,...,...,...,...,...,...,...,...
1254,0.168507,0.333333,0.666667,2,44,0.017699,1,0,1,0.064070
1255,0.042106,0.222222,0.166667,1,44,0.017699,1,1,1,0.064070
1256,0.034896,0.222222,0.333333,1,44,0.017699,1,0,1,0.064070
1257,0.039634,0.111111,0.166667,2,45,0.000000,1,1,1,0.064070


In [48]:
# Display the target series.
Y


0       0.023013
1       0.016736
2       0.060669
3       0.013389
4       0.021757
          ...   
1254    0.225941
1255    0.048117
1256    0.069038
1257    0.043933
1258    0.073222
Name: Price, Length: 1176, dtype: float64

In [49]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

**LINEAR REGRESSION**

In [50]:
# Ordinary least-squares baseline fitted on the training split.
model1 = LinearRegression()
model1.fit(X_train, Y_train)


In [51]:
# Predict on the held-out rows (the same call is repeated in a later cell
# just before the metrics are printed).
predictions = model1.predict(X_test)

Result

In [53]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list (which triggers scikit-learn's
# "X does not have valid feature names" warning).
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model1.predict(sample)
print(result)

[0.04133126]




In [54]:
from sklearn import metrics

In [55]:
# Test-set predictions for model1, used by the metrics cell that follows
# (repeats the earlier prediction cell).
predictions = model1.predict(X_test)

**ERRORS**

In [60]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.04379146318105323
MSE: 0.0072135374556155955
RMSE: 0.08493254650377319
R-2 score: 0.553173418264103


**Decision Tree**

In [61]:
from sklearn.tree import DecisionTreeRegressor
# Decision-tree regressor with default settings; random_state fixes the seed
# so the fitted tree is reproducible.
model2=DecisionTreeRegressor(random_state=0)
model2.fit(X_train, Y_train)

In [66]:
# Test-set predictions for the decision tree (overwrites `predictions`).
predictions= model2.predict(X_test)

Result

In [67]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list (which triggers scikit-learn's
# "X does not have valid feature names" warning).
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model2.predict(sample)
print(result)

[0.02301255]




**ERRORS**

In [72]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.02623803276363379
MSE: 0.0021707844172005964
RMSE: 0.04659167755297717
R-2 score: 0.8655355729707627


**RIDGE**

In [86]:
from sklearn.linear_model import Ridge, Lasso

In [97]:
# Ridge regression (L2-penalised linear model) with penalty strength alpha=2.
# NOTE(review): alpha is a hand-picked constant; no tuning is shown.
model3= Ridge(alpha=2)

model3.fit(X_train,Y_train)

In [101]:
# Test-set predictions for the Ridge model (overwrites `predictions`).
predictions = model3.predict(X_test)

In [102]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list (which triggers scikit-learn's
# "X does not have valid feature names" warning).
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model3.predict(sample)
print(result)

[0.04955438]




**ERRORS**

In [100]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
# NOTE(review): this cell's execution count (In [100]) precedes that of the
# Ridge prediction cell (In [101]), so the recorded output may have been
# computed from an earlier `predictions` value — re-run in order to confirm.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.04437760422479739
MSE: 0.007712318865470661
RMSE: 0.08781980907216015
R-2 score: 0.522277509873768


**RANDOM FOREST**

In [81]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 300 trees; random_state fixes the seed so the fitted
# ensemble is reproducible.
model4 = RandomForestRegressor(n_estimators=300, random_state=0)
model4.fit(X_train, Y_train)

In [82]:
# Test-set predictions for the random forest (overwrites `predictions`).
predictions= model4.predict(X_test)

In [83]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list (which triggers scikit-learn's
# "X does not have valid feature names" warning).
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model4.predict(sample)
print(result)

[0.02379819]




**ERRORS**

In [85]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.02370174514809827
MSE: 0.002124278524643202
RMSE: 0.046089896123154825
R-2 score: 0.8684162773588463


**XGBOOST**

In [117]:
import xgboost as xg

In [119]:
# Gradient-boosted trees with squared-error loss, 10 boosting rounds.
# 'reg:squarederror' is the current name of the objective that used to be
# spelled 'reg:linear' (a deprecated alias); the loss itself is unchanged.
model5 = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    seed=123,
)

In [120]:
# Fit the XGBoost regressor on the training split.
model5.fit(X_train, Y_train)



In [121]:
# Test-set predictions for the XGBoost model (overwrites `predictions`).
predictions = model5.predict(X_test)

Result

In [122]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list.
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model5.predict(sample)
print(result)

[0.03053005]


**ERRORS**

In [123]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.023381477365494552
MSE: 0.001891936061486038
RMSE: 0.043496391361652495
R-2 score: 0.8828082160218659


**LASSO**

In [113]:
# Lasso regression (L1-penalised linear model) with penalty strength alpha=5.
# NOTE(review): the R-2 score recorded for this model is about -0.005, i.e. no
# better than predicting a constant. With Price min-max scaled to [0, 1],
# alpha=5 is presumably strong enough to shrink every coefficient to zero —
# confirm via model6.coef_ and consider a much smaller alpha.
model6= Lasso(alpha=5)

model6.fit(X_train,Y_train)

In [114]:
# Test-set predictions for the Lasso model (overwrites `predictions`).
predictions = model6.predict(X_test)

In [115]:
# Predict for one hand-entered listing; the values match row 0 of X as
# displayed earlier, rounded to six decimals. Wrapping them in a DataFrame
# with X's column names gives the model the same feature names it was fitted
# with, instead of an unnamed nested list (which triggers scikit-learn's
# "X does not have valid feature names" warning).
sample = pd.DataFrame(
    [[0.031806, 0.222222, 0.166667, 1, 283, 0.000000, 1, 0, 1, 0.037758]],
    columns=X.columns,
)
result = model6.predict(sample)
print(result)

[0.08232471]




In [116]:
# Evaluation of the current `predictions` against Y_test. Values are in the
# min-max-scaled Price units, not in the original currency.
mae = metrics.mean_absolute_error(Y_test, predictions)
mse = metrics.mean_squared_error(Y_test, predictions)
r2 = metrics.r2_score(Y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # root of the MSE computed above
print('R-2 score:', r2)

MAE: 0.0723210210683715
MSE: 0.01621920257130649
RMSE: 0.12735463309713743
R-2 score: -0.004662537348741447
