### Step 1: Import The Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the housing dataset; the path is relative to the current working directory.
df = pd.read_csv("housing.csv")

### Step 2: Understand The Data

In [None]:
# df.shape is (rows, columns): print the row count first, then the column count.
for axisLength in df.shape:
    print(axisLength)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Report how many distinct values each column holds.
for columnName, columnValues in df.items():
    distinctCount = len(columnValues.unique())
    print(f"{columnName} : {distinctCount}")

### Step 3: Clean And Prepare The Data

In [None]:
# Remove every row that has at least one missing value, modifying df in place.
df.dropna(inplace= True)

In [None]:
# Display the frame to inspect it after the rows with missing values were dropped.
df

In [None]:
# Map the snake_case column names to the camelCase names used in the rest of
# the notebook; columns not listed here keep their original names.
camelCaseNames = dict(
    housing_median_age='housingMedianAge',
    total_rooms='totalRooms',
    total_bedrooms='totalBedrooms',
    median_income='medianIncome',
    median_house_value='medianHouseValue',
    ocean_proximity='oceanProximity',
)
df.rename(columns=camelCaseNames, inplace=True)

### Step 4: Split The Data Into Training/Test Sets

In [None]:
# n is the regression target (median house value); m holds every other column.
n = df['medianHouseValue']
m = df.drop(columns=['medianHouseValue'])

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every score computed later) reproducible between runs.
mTrain, mTest, nTrain, nTest = train_test_split(m, n, test_size=0.2, random_state=42)

In [None]:
# Re-attach the target column to the training features (aligned on the index)
# so features and target can be explored and transformed together.
trainData = mTrain.join(nTrain)

In [None]:
# Draw one histogram per numeric column of the training frame.
trainData.hist(figsize= (15, 8))

plt.show()

### Step 5: Feature Engineering

In [None]:
# Replace each count column with log(x + 1); the +1 keeps a zero count finite
# because log(0) would be -inf.
for logColumn in ('totalRooms', 'totalBedrooms', 'population', 'households'):
    trainData[logColumn] = np.log(trainData[logColumn] + 1)

In [None]:
# One-hot encode oceanProximity: add one indicator column per category, then
# drop the original text column.
proximityDummies = pd.get_dummies(trainData.oceanProximity)
trainData = trainData.join(proximityDummies)
trainData = trainData.drop(columns=['oceanProximity'])

In [None]:
# Derived ratio features. Both are computed from whatever values the columns
# currently hold, so they must be built the same way for any evaluation data.
roomsColumn = trainData['totalRooms']
trainData['bedroomRatio'] = trainData['totalBedrooms'] / roomsColumn
trainData['householdRooms'] = roomsColumn / trainData['households']

### Step 6: Linear Regression

In [None]:
# Separate the engineered training frame back into target and features.
nTrain = trainData['medianHouseValue']
mTrain = trainData.drop(columns=['medianHouseValue'])

# Standardize the features; the scaler is fitted on the training data only and
# is reused later to transform the evaluation features.
scaler = StandardScaler()
mTrainS = scaler.fit_transform(mTrain)

# Ordinary least-squares baseline model.
reg = LinearRegression()
reg.fit(mTrainS, nTrain)

In [None]:
# Rebuild the held-out frame and apply exactly the same feature engineering
# that was applied to the training data.
testData = mTest.join(nTest)

# log(x + 1) on the count columns, as done for the training frame.
testData['totalRooms'] = np.log(testData['totalRooms'] + 1)
testData['totalBedrooms'] = np.log(testData['totalBedrooms'] + 1)
testData['population'] = np.log(testData['population'] + 1)
testData['households'] = np.log(testData['households'] + 1)

# One-hot encode oceanProximity and drop the original text column.
testData = testData.join(pd.get_dummies(testData.oceanProximity)).drop(['oceanProximity'], axis= 1)

testData['bedroomRatio'] = testData['totalBedrooms'] / testData['totalRooms']
testData['householdRooms'] = testData['totalRooms'] / testData['households']

# Split the *test* frame (not trainData) into features and target; otherwise
# every "test" score below would actually be measured on the training rows.
mTest = testData.drop(['medianHouseValue'], axis= 1)
# get_dummies only creates columns for categories present in the frame it is
# given, so a rare oceanProximity value may be missing from (or exclusive to)
# the test split. Align to the training feature columns, in the same order,
# filling absent indicator columns with 0, so scaler.transform sees the layout
# it was fitted on. mTrain must already be the engineered training features.
mTest = mTest.reindex(columns=mTrain.columns, fill_value=0)
nTest = testData['medianHouseValue']

In [None]:
# Apply the already-fitted scaler (transform only, no refit) to the evaluation features.
mTestS = scaler.transform(mTest)

In [None]:
# Coefficient of determination (R^2) of the linear model on mTestS / nTest.
reg.score(mTestS, nTest)

### Step 7: Random Forest Model

In [None]:
# Random forest with default hyperparameters, fitted on the same standardized
# training features as the linear model.
forest = RandomForestRegressor()

forest.fit(mTrainS, nTrain)

In [None]:
# Coefficient of determination (R^2) of the default forest on mTestS / nTest.
forest.score(mTestS, nTest)

In [None]:
# Candidate hyperparameters: every combination of tree count and number of
# features considered per split is evaluated.
paramGrid = dict(
    n_estimators=[3, 10, 301],
    max_features=[2, 4, 6, 8],
)

# 5-fold cross-validated search. The score is the negated mean squared log
# error, so a higher (closer to zero) value is better.
forest = RandomForestRegressor()
gridSearch = GridSearchCV(
    estimator=forest,
    param_grid=paramGrid,
    scoring="neg_mean_squared_log_error",
    cv=5,
    return_train_score=True,
)

gridSearch.fit(mTrainS, nTrain)



In [None]:
# Forest refitted with the best-scoring parameter combination found by the grid search.
bestForest = gridSearch.best_estimator_

In [None]:
# The grid search fitted the forest on standardized features (mTrainS), so it
# must be scored on the standardized evaluation features (mTestS), not on the
# raw mTest frame, whose values are on a different scale.
bestForest.score(mTestS, nTest)

In [None]:
# Predict with the tuned forest on the standardized evaluation features.
predictions = bestForest.predict(mTestS)

# RMSE is the square root of the mean squared error, which puts the error back
# in the units of the target.
mse = mean_squared_error(y_true=nTest, y_pred=predictions)
rmse = pow(mse, 0.5)

print(f"Root Mean Squared Error (RMSE): {rmse}")