In [1]:
import pandas as pd
import numpy as np
import warnings   
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: this silences every warning category for the rest of the session,
# so deprecation and data-conversion warnings from the cells below are hidden.
warnings.filterwarnings("ignore")
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
from sklearn.datasets import fetch_california_housing
# Fetch the California housing dataset; as_frame=True asks for pandas objects
# (plus the usual metadata) instead of bare arrays.
california_housing = fetch_california_housing(as_frame=True)
# Keep only the DataFrame, which holds the feature columns and MedHouseVal.
data = california_housing["frame"]

In [3]:
# Bare expression so the notebook renders the frame; long frames are shown truncated.
data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [4]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
# Print column dtypes, non-null counts and the memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [6]:
# Count missing values per column.
data.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

##### Explore the impact of removal of outliers on accuracy.

In [7]:
# Features: every column except the target, with AveRooms and MedInc also left out.
X = data.drop(columns=['MedHouseVal', 'AveRooms', 'MedInc'])
# Target: the MedHouseVal column.
y = data['MedHouseVal']

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [9]:
# Baseline: a linear regression fitted on the untouched training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [10]:
# Score the baseline on the held-out split and report its mean squared error.
y_pred = model.predict(X=X_test)
original_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Original Mean Squared Error (MSE):", original_mse)

Original Mean Squared Error (MSE): 0.9815533139131158


In [11]:
# Identify and remove outliers using the Z-score method.
# The z-score is computed on the training target only, so nothing from the
# test split influences which rows are dropped.
z_scores = np.abs((y_train - np.mean(y_train)) / np.std(y_train))
# Plain boolean array (True = flagged row) so it can index both X and y by position.
is_outlier = np.asarray(z_scores > 3)
# Positional indices of the flagged rows, kept in the tuple form np.where returns.
outlier_indices = np.where(is_outlier)
# Select rows with the boolean mask rather than np.delete: np.delete is an array
# routine and does not reliably preserve pandas index / column labels, whereas
# masking keeps X and y aligned row-for-row and leaves them as pandas objects.
X_train_cleaned = X_train[~is_outlier]
y_train_cleaned = y_train[~is_outlier]
# NOTE(review): the recorded MSE before and after removal agree to ~1e-14, which
# suggests this threshold flags no rows here - inspect outlier_indices[0].size.

In [12]:
# Second linear regression, fitted only on the rows that survived outlier removal.
model_cleaned = LinearRegression()
model_cleaned.fit(X=X_train_cleaned, y=y_train_cleaned)

In [13]:
# Score the outlier-filtered model on the same held-out split as the baseline.
y_pred_cleaned = model_cleaned.predict(X=X_test)
cleaned_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_cleaned)
print("Mean Squared Error after Outlier Removal (MSE):", cleaned_mse)

Mean Squared Error after Outlier Removal (MSE): 0.9815533139131206


###### Implement weighted KNN on the same data set and measure the accuracy improvement. (Hint: explore the different parameters of `KNeighborsRegressor()`.)

In [14]:
# Standardize the features: the scaling statistics are learned from the training
# split only and then reused unchanged for the test split.
# Note that X_train / X_test are rebound to their scaled versions from here on.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Basic KNN regression: the k nearest neighbors with the default weighting.
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
k = 5  # Number of neighbors
knn_basic = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
y_pred_basic = knn_basic.predict(X_test)
# Report RMSE (square root of the test-set MSE).
mse_basic = mean_squared_error(y_true=y_test, y_pred=y_pred_basic)
rmse_basic = sqrt(mse_basic)
print("Basic KNN RMSE:", rmse_basic)

Basic KNN RMSE: 0.8699039579272689


In [16]:
# Weighted KNN regression: same k as the basic model, but with weights='distance'.
k = 5  # Number of neighbors
weights = 'distance'  # Use distance-based weights
knn_weighted = KNeighborsRegressor(weights=weights, n_neighbors=k).fit(X_train, y_train)
y_pred_weighted = knn_weighted.predict(X_test)
# Report RMSE so the number is directly comparable with the basic KNN cell.
mse_weighted = mean_squared_error(y_true=y_test, y_pred=y_pred_weighted)
rmse_weighted = sqrt(mse_weighted)
print("Weighted KNN RMSE:", rmse_weighted)

Weighted KNN RMSE: 0.8640221542745011


In [17]:
# Relative RMSE reduction of weighted over basic KNN
# (a fraction of the basic RMSE; positive means the weighted model is better).
rmse_reduction = rmse_basic - rmse_weighted
accuracy_improvement = rmse_reduction / rmse_basic
print("Accuracy Improvement:", accuracy_improvement)

Accuracy Improvement: 0.006761440270696607
