# California house price prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

Data Preparation

In [None]:
# Location of the raw housing CSV inside the Kaggle input directory.
filename = "/kaggle/input/house-price/1553768847-housing.csv"

# Read the file into a DataFrame and preview its first five rows.
housing_df = pd.read_csv(filepath_or_buffer=filename)
housing_df.head(5)

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
housing_df.info()

Data Processing

In [None]:
# Shuffle the rows: sampling the full fraction without replacement returns
# every row exactly once in random order (seeded for reproducibility).
# The original index labels are kept, not reset.
housing_df = housing_df.sample(frac=1.0, random_state=1)
housing_df.head()

In [None]:
# Count the missing values in each column.
housing_df.isna().sum()

In [None]:
# Fill missing total_bedrooms values with the mean of that column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form is a
# chained assignment, which emits a FutureWarning on recent pandas and does
# not update housing_df when Copy-on-Write is enabled.
bedrooms_mean = housing_df["total_bedrooms"].mean()
housing_df["total_bedrooms"] = housing_df["total_bedrooms"].fillna(bedrooms_mean)

In [None]:
# Re-count the missing values per column to check the result of the imputation.
housing_df.isna().sum()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing_df['ocean_proximity'].value_counts()

In [None]:
# Preview the one-hot (dummy) encoding of the categorical ocean_proximity column.
# The encoded frame is only displayed here; it is not stored and housing_df
# is not modified by this cell.
pd.get_dummies(housing_df['ocean_proximity']).head()

In [None]:
# Preview the frame without the ocean_proximity column.
# drop() returns a new frame, so housing_df itself is unchanged by this cell.
housing_df.drop("ocean_proximity",axis = 1).head()

In [None]:
# Replace the categorical ocean_proximity column with its one-hot indicator
# columns. dtype=float is requested explicitly: recent pandas versions create
# boolean dummies by default, and mixing bool with numeric columns makes a
# later DataFrame.to_numpy() return an object array instead of a float array.
ocean_dummies = pd.get_dummies(housing_df['ocean_proximity'], dtype=float)
housing_df = pd.concat(
    [housing_df.drop("ocean_proximity", axis=1), ocean_dummies],
    axis=1,
)

# Display only the first rows rather than rendering the whole frame.
housing_df.head()

In [None]:
# Put the columns in a fixed order: the eight numeric features first, then the
# five one-hot ocean_proximity indicators, and the target column last.
ordered_columns = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN',
    'median_house_value',
]
housing_df = housing_df[ordered_columns]
housing_df.info()

Data Visualization

In [None]:
# Numeric feature columns to plot against the target.
numerical_features = ['longitude','latitude','housing_median_age','total_rooms',
                         'total_bedrooms','population','households','median_income']

# Size the grid from the number of features (4 panels per row, about 3x3
# inches each) instead of allocating a fixed 4x4 grid and deleting half of it,
# which left the lower half of the figure blank.
n_cols = 4
n_rows = -(-len(numerical_features) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(3 * n_cols, 3 * n_rows), squeeze=False)

# squeeze=False always yields a 2-D array of Axes; flatten it for iteration.
axes = axes.ravel()

# One scatter plot of median_house_value against each numeric feature.
for ax, feature in zip(axes, numerical_features):
    ax.scatter(housing_df[feature], housing_df['median_house_value'], s=10)
    ax.set_xlabel(feature)
    ax.set_ylabel('Median House Value')

# Remove any panels in the last row that have no feature assigned.
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## Modeling

Split the data

In [None]:
# Positional split of the (already shuffled) frame:
# rows [0, 18000) -> train, [18000, 19215) -> test, the remainder -> validation.
train = housing_df.iloc[:18000]
test = housing_df.iloc[18000:19215]
validation = housing_df.iloc[19215:]

len(train), len(test), len(validation)

In [None]:
def _features_and_target(frame):
    """Split a frame into (features, target) float64 arrays.

    The last column is taken as the target and every other column as a
    feature. The frame is converted once with an explicit float64 dtype so the
    arrays are numeric even when the frame mixes dtypes (e.g. boolean one-hot
    columns), where a plain to_numpy() would return an object array.
    """
    values = frame.to_numpy(dtype="float64")
    return values[:, :-1], values[:, -1]


x_train, y_train = _features_and_target(train)
x_valid, y_valid = _features_and_target(validation)
x_test, y_test = _features_and_target(test)

x_train.shape, y_train.shape, x_valid.shape, y_valid.shape, x_test.shape, y_test.shape

In [None]:
# Fit the scaler on the first eight feature columns of the training set only.
scaler = StandardScaler()
scaler.fit(x_train[:, :8])


def preprocessor(data):
    """Return a copy of ``data`` with its first eight columns standardised.

    The first eight columns are transformed with the module-level ``scaler``;
    all remaining columns are copied through unchanged. The input array is
    not modified.
    """
    scaled = np.copy(data)
    leading_block = scaled[:, :8]
    scaled[:, :8] = scaler.transform(leading_block)
    return scaled


# Apply the same transformation to every split.
x_train = preprocessor(x_train)
x_valid = preprocessor(x_valid)
x_test = preprocessor(x_test)

In [None]:
# Fit an ordinary least-squares model on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict on all three splits.
y_train_pred, y_valid_pred, y_test_pred = (
    model.predict(features) for features in (x_train, x_valid, x_test)
)

# RMSE per split: square root of the mean squared error.
rmse_train, rmse_valid, rmse_test = (
    np.sqrt(mse(actual, predicted))
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_valid, y_valid_pred),
        (y_test, y_test_pred),
    )
)

# R-squared is reported on the validation split only.
r2 = r2_score(y_valid, y_valid_pred)

# Report the metrics.
print(f"Root Mean Squared Error (RMSE) - Training: {rmse_train:.4f}")
print(f"Root Mean Squared Error (RMSE) - Validation: {rmse_valid:.4f}")
print(f"Root Mean Squared Error (RMSE) - Test: {rmse_test:.4f}")
print(f"R-squared (R2) Score - Validation: {r2:.4f}")

In [None]:
# Cast the test targets to float64.
# NOTE(review): presumably needed because y_test comes out of to_numpy() as an
# object array when the frame mixes dtypes - confirm.
y_test = y_test.astype(np.float64)

In [None]:

# Predict on the test split with the linear model.
y_pred = model.predict(x_test)

# Scatter of predicted against actual target values on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)

# Axis labels and title.
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Linear Regression: Actual vs. Predicted Values")

# Overlay the fitted regression line of predictions on actual values (no extra points).
sns.regplot(x=y_test, y=y_pred, scatter=False, color='red', ax=ax)

plt.show()

In [None]:
# Fit a gradient-boosted tree ensemble on the training split.
# random_state is fixed so that repeated runs give the same model.
model_2 = GradientBoostingRegressor(n_estimators=250, random_state=1).fit(x_train, y_train)

# Make predictions on the training, validation, and test sets.
# The test predictions must come from model_2: without this line,
# y_test_pred would still hold the linear model's predictions from an
# earlier cell and the "Test" RMSE below would describe the wrong model.
y_train_pred = model_2.predict(x_train)
y_valid_pred = model_2.predict(x_valid)
y_test_pred = model_2.predict(x_test)

# Calculate the Root Mean Squared Error (RMSE) for training, validation, and test sets
rmse_train = np.sqrt(mse(y_train, y_train_pred))
rmse_valid = np.sqrt(mse(y_valid, y_valid_pred))
rmse_test = np.sqrt(mse(y_test, y_test_pred))

# Calculate the R-squared (R2) score for the model on the validation set
r2 = r2_score(y_valid, y_valid_pred)

# Print RMSE and R2 scores along with some explanatory text
print(f"Root Mean Squared Error (RMSE) - Training: {rmse_train:.4f}")
print(f"Root Mean Squared Error (RMSE) - Validation: {rmse_valid:.4f}")
print(f"Root Mean Squared Error (RMSE) - Test: {rmse_test:.4f}")
print(f"R-squared (R2) Score - Validation: {r2:.4f}")

In [None]:

# Predict on the test split with the gradient boosting model.
y_pred = model_2.predict(x_test)

# Scatter of predicted against actual target values on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)

# Axis labels and title.
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Gradient Boosting Regression: Actual vs. Predicted Values")

# Overlay the fitted regression line of predictions on actual values (no extra points).
sns.regplot(x=y_test, y=y_pred, scatter=False, color='red', ax=ax)

plt.show()