In [19]:
# importing the libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [25]:
# getting dataset
data=pd.read_csv(r"C:\Users\Lenovo\Downloads\archive (15)\house_prices_dataset.csv")

In [26]:
data

Unnamed: 0,square_feet,num_rooms,age,distance_to_city(km),price
0,2248.357077,3,92,22.997972,200374.090410
1,1930.867849,2,22,13.984254,268784.847337
2,2323.844269,6,33,21.500945,315020.857676
3,2761.514928,3,63,10.343638,355111.468459
4,1882.923313,7,54,25.485200,234197.123903
...,...,...,...,...,...
9995,2650.551032,7,9,18.126034,431344.267823
9996,1000.827516,2,53,24.479692,6650.271134
9997,1647.341638,5,80,2.189312,233698.384301
9998,2247.882787,6,73,27.931014,280766.827379


In [27]:
# checking for the missing values
data.isnull().sum()

square_feet             0
num_rooms               0
age                     0
distance_to_city(km)    0
price                   0
dtype: int64

In [28]:
# basic data analysis
data.mean

<bound method DataFrame.mean of       square_feet  num_rooms  age  distance_to_city(km)          price
0     2248.357077          3   92             22.997972  200374.090410
1     1930.867849          2   22             13.984254  268784.847337
2     2323.844269          6   33             21.500945  315020.857676
3     2761.514928          3   63             10.343638  355111.468459
4     1882.923313          7   54             25.485200  234197.123903
...           ...        ...  ...                   ...            ...
9995  2650.551032          7    9             18.126034  431344.267823
9996  1000.827516          2   53             24.479692    6650.271134
9997  1647.341638          5   80              2.189312  233698.384301
9998  2247.882787          6   73             27.931014  280766.827379
9999  2322.194227          7   73             10.595721  342895.309114

[10000 rows x 5 columns]>

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   square_feet           10000 non-null  float64
 1   num_rooms             10000 non-null  int64  
 2   age                   10000 non-null  int64  
 3   distance_to_city(km)  10000 non-null  float64
 4   price                 10000 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 390.8 KB


In [31]:
# getting data ready
x=data.drop(columns=['price'])
y=data['price']

In [32]:
# splitting the data into 80 20
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

In [33]:
# linear reggression 
lr=LinearRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 15596.120539083644
MSE: 386443817.4602804
R² Score: 0.960063695761787


In [34]:
# Random Forest Reggression
rf=RandomForestRegressor()
rf.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 15596.120539083644
MSE: 386443817.4602804
R² Score: 0.960063695761787


In [35]:
data.corr()

Unnamed: 0,square_feet,num_rooms,age,distance_to_city(km),price
square_feet,1.0,-0.006982,0.00976,0.004638,0.756545
num_rooms,-0.006982,1.0,0.005636,0.016747,0.33523
age,0.00976,0.005636,1.0,0.011175,-0.290799
distance_to_city(km),0.004638,0.016747,0.011175,1.0,-0.418217
price,0.756545,0.33523,-0.290799,-0.418217,1.0


In [36]:
# scaling the features
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.fit_transform(x_test)

In [None]:
gr=RandomForestRegressor()
gr.fit(x_train,y_train)
y_pred=gr.predict(x_test)
print("R² Score:", r2_score(y_test, y_pred))

MAE: 18088.896501281233
MSE: 510223815.81621534
R² Score: 0.9472718863198977


In [38]:
param_grid={
    'n_estimators':[100,200],
    'max_depth':[10,20,None],
    'min_samples_split':[2,5]
}
grid_search=GridSearchCV(RandomForestRegressor(random_state=42),param_grid,cv=5,scoring='r2')
grid_search.fit(x_train,y_train)

print(f"Best R2 score after tuning : {grid_search.best_score_:.4f}")
print(f"Best parameters : {grid_search.best_params_}")

Best R2 score after tuning : 0.9448
Best parameters : {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
