In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
data=pd.read_csv(r"D:\Users\DELL\Downloads\USA_Housing.csv")

In [4]:
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [11]:
data.isnull().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [12]:
data.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [13]:
print(data,5)

      Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0         79545.458574             5.682861                   7.009188   
1         79248.642455             6.002900                   6.730821   
2         61287.067179             5.865890                   8.512727   
3         63345.240046             7.188236                   5.586729   
4         59982.197226             5.040555                   7.839388   
...                ...                  ...                        ...   
4995      60567.944140             7.830362                   6.137356   
4996      78491.275435             6.999135                   6.576763   
4997      63390.686886             7.250591                   4.805081   
4998      68001.331235             5.534388                   7.130144   
4999      65510.581804             5.992305                   6.792336   

      Avg. Area Number of Bedrooms  Area Population         Price  \
0                             4.09     230

In [18]:
data = data.drop("Address", axis=1)

In [19]:
X = data.drop("Price", axis=1)   
y = data["Price"] 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
rf = RandomForestRegressor(random_state=42)

In [22]:
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5]
}

grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="r2", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("\n Best Random Forest Parameters ")
print(grid_search.best_params_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits

 Best Random Forest Parameters 
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


In [23]:
best_rf = grid_search.best_estimator_
rf_pred = best_rf.predict(X_test)

In [24]:
lr = LinearRegression()
ridge = Ridge(alpha=1.0, random_state=42)
lasso = Lasso(alpha=0.001, random_state=42, max_iter=10000)

In [25]:
lr.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)

In [26]:
lr_pred = lr.predict(X_test)
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)

In [27]:
def evaluate_model(name, y_true, y_pred):
    print(f"\n{name} Performance:")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"MSE: {mean_squared_error(y_true, y_pred):.4f}")
    print(f"R2 : {r2_score(y_true, y_pred):.4f}")

In [28]:
evaluate_model("Random Forest (Tuned)", y_test, rf_pred)


Random Forest (Tuned) Performance:
MAE: 94576.5197
MSE: 14442660364.5203
R2 : 0.8826


In [29]:
evaluate_model("Linear Regression", y_test, lr_pred)
evaluate_model("Ridge Regression", y_test, ridge_pred)
evaluate_model("Lasso Regression", y_test, lasso_pred)


Linear Regression Performance:
MAE: 80879.0972
MSE: 10089009300.8936
R2 : 0.9180

Ridge Regression Performance:
MAE: 80882.0807
MSE: 10089716572.5548
R2 : 0.9180

Lasso Regression Performance:
MAE: 80879.0973
MSE: 10089009312.9946
R2 : 0.9180
