### Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


### Data Loading and Cleaning

In [None]:
# Load the housing dataset from 'housing.csv' (a path relative to the working directory)
data = pd.read_csv('housing.csv')
# Bare expression as the last line of the cell: the notebook renders the DataFrame as output
data

In [None]:
# Print a summary of the raw frame (non-null count and dtype per column)
# so columns containing missing values can be spotted before cleaning
data.info()

In [None]:
# Discard every row that contains at least one missing (NaN) value
data = data.dropna()

In [None]:
# Print the summary again after dropping NaN rows to confirm that
# every column now reports the same non-null count
data.info()

### Data Preprocessing

In [None]:
# Separate the prediction target from the predictor columns:
#   y -> the 'median_house_value' column
#   x -> every remaining column
y = data['median_house_value']
x = data.drop(columns=['median_house_value'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes
# the shuffle, and therefore the split, reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-attach the target column to each feature frame (index-aligned left join)
# so that every split is available as a single DataFrame
train_data = x_train.join(y_train, on=None, how='left')
test_data = x_test.join(y_test, on=None, how='left')


### Data Visualization

In [None]:
# Draw one 50-bin histogram per numeric column of the training split
train_data.hist(figsize=(15, 8), bins=50)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric
# columns of the training split (non-numeric columns are skipped)
plt.figure(figsize=(15, 8))
sns.heatmap(
    data=train_data.corr(numeric_only=True),
    cmap='coolwarm',
    annot=True,
)

In [None]:
# Replace each column treated as right-skewed with log(x + 1);
# the + 1 offset keeps a value of 0 finite (log(1) == 0)
for skewed_column in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    train_data[skewed_column] = np.log(train_data[skewed_column] + 1)

In [None]:
# Redraw the per-column histograms to inspect the distributions
# once the log transformation has been applied
train_data.hist(figsize=(15, 8), bins=50)

In [None]:
# Count how many training rows fall into each ocean_proximity category
train_data['ocean_proximity'].value_counts()

In [None]:
# One-hot encode ocean_proximity: build one integer 0/1 indicator column
# per category, then swap the original text column for those indicators
dummies = pd.get_dummies(train_data['ocean_proximity'], dtype=int)
train_data = train_data.drop(columns='ocean_proximity').join(dummies)

In [None]:
# Correlation heatmap of the training split again, now that the
# one-hot indicator columns are part of the numeric columns
plt.figure(figsize=(15, 8))
sns.heatmap(
    data=train_data.corr(numeric_only=True),
    cmap='coolwarm',
    annot=True,
)

In [None]:
# Scatter plot of every training row's coordinates, coloured by median_house_value.
# Longitude is placed on the x-axis and latitude on the y-axis so the points are
# drawn in the usual map orientation (east-west horizontal, north-south vertical);
# with the axes the other way round the geography is rendered transposed.
plt.figure(figsize=(15, 8))
sns.scatterplot(x='longitude', y='latitude', data=train_data, hue='median_house_value', palette='coolwarm')

### Feature Engineering

In [None]:
# Derive two ratio features from existing columns:
#   bedroom_ratio   = total_bedrooms / total_rooms
#   household_rooms = total_rooms / households
# NOTE: all three input columns were overwritten with their log(x + 1) values
# earlier in the notebook, so these are ratios of the log-transformed values.
train_data['bedroom_ratio'] = train_data['total_bedrooms'].div(train_data['total_rooms'])
train_data['household_rooms'] = train_data['total_rooms'].div(train_data['households'])

In [None]:
# Correlation heatmap including the newly engineered
# bedroom_ratio and household_rooms columns
plt.figure(figsize=(15, 8))
sns.heatmap(
    data=train_data.corr(numeric_only=True),
    cmap='coolwarm',
    annot=True,
)

### Saving the Training and Test Data as CSV Files

In [None]:
# Write both splits to CSV files, leaving out the index column.
# NOTE(review): train_data has been log-transformed, one-hot encoded and extended
# with ratio features in the cells above, whereas test_data is still the raw join
# of x_test and y_test. Confirm that whatever reads test_data.csv applies the same
# steps before the two files are used together.
train_data.to_csv(path_or_buf='train_data.csv', index=False)
test_data.to_csv(path_or_buf='test_data.csv', index=False)