## Loading the Dataset with pandas

In [17]:
import pandas as pd
# Path to the Paris housing CSV, written relative to the working directory.
ds_path = 'Dataset/ParisHousing.csv'
# Read the whole table into a DataFrame; later cells refer to it as `ds`.
ds = pd.read_csv(ds_path)
# Print the column names, non-null counts and dtypes.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              1000

In [18]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
ds.describe()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49870.1312,50.3584,0.5087,0.4968,50.2763,50225.4861,5.5101,5.5217,2005.4885,0.4991,0.4999,5033.1039,5028.0106,553.1212,0.503,4.9946,4993448.0
std,28774.37535,28.816696,0.499949,0.500015,28.889171,29006.675799,2.872024,2.856667,9.30809,0.500024,0.500025,2876.729545,2894.33221,262.05017,0.500016,3.17641,2877424.0
min,89.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1990.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,10313.5
25%,25098.5,25.0,0.0,0.0,25.0,24693.75,3.0,3.0,1997.0,0.0,0.0,2559.75,2512.0,327.75,0.0,2.0,2516402.0
50%,50105.5,50.0,1.0,0.0,50.0,50693.0,5.0,5.0,2005.5,0.0,0.0,5092.5,5045.0,554.0,1.0,5.0,5016180.0
75%,74609.75,75.0,1.0,1.0,76.0,75683.25,8.0,8.0,2014.0,1.0,1.0,7511.25,7540.5,777.25,1.0,8.0,7469092.0
max,99999.0,100.0,1.0,1.0,100.0,99953.0,10.0,10.0,2021.0,1.0,1.0,10000.0,10000.0,1000.0,1.0,10.0,10006770.0


## Dataset Pre-processing

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The target is the price column; every other column is a predictor.
Y = ds['price']
X = ds.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=7
)

# Learn the scaling statistics from the training rows only, then apply that
# same scaling to both splits so no test-set statistics reach training.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Re-attach column names and the original row labels to the scaled values so
# later cells can select rows by the same index as X_train / X_test.
X_train_norm_df = pd.DataFrame(X_train_norm, index=X_train.index, columns=X_train.columns)
X_test_norm_df = pd.DataFrame(X_test_norm, index=X_test.index, columns=X_test.columns)

# Sanity-check the structure of both scaled frames
X_train_norm_df.info()
X_test_norm_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 4989 to 9412
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       8000 non-null   float64
 1   numberOfRooms      8000 non-null   float64
 2   hasYard            8000 non-null   float64
 3   hasPool            8000 non-null   float64
 4   floors             8000 non-null   float64
 5   cityCode           8000 non-null   float64
 6   cityPartRange      8000 non-null   float64
 7   numPrevOwners      8000 non-null   float64
 8   made               8000 non-null   float64
 9   isNewBuilt         8000 non-null   float64
 10  hasStormProtector  8000 non-null   float64
 11  basement           8000 non-null   float64
 12  attic              8000 non-null   float64
 13  garage             8000 non-null   float64
 14  hasStorageRoom     8000 non-null   float64
 15  hasGuestRoom       8000 non-null   float64
dtypes: float64(16)
memory usag

---
### Baseline Linear Regression on the full (unsegmented) dataset

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline: one linear model fitted on all standardized training rows,
# with no grouping or clustering.
baseline_lr = LinearRegression().fit(X_train_norm_df, Y_train)

# Score the baseline on the held-out test rows.
Y_pred_baseline = baseline_lr.predict(X_test_norm_df)
r2 = r2_score(Y_test, Y_pred_baseline)
mse = mean_squared_error(Y_test, Y_pred_baseline)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

print(f"Baseline Linear Regression Results:\nRMSE: {rmse:,.2f}\nR2 Score: {r2:.4f}\n")

Baseline Linear Regression Results:
RMSE: 1,935.82
R2 Score: 1.0000



---
#### Training independent Linear Regression models grouped by `numPrevOwners` (values 1–10)

In [21]:
# Train separate Linear Regression models per `numPrevOwners`
from IPython.display import display
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Expect the following to exist from previous cells:
# ds, X_train, X_test, Y_train, Y_test, X_train_norm_df, X_test_norm_df

owners_values = sorted(ds['numPrevOwners'].unique())
results = []       # one metrics row per owner count
group_models = {}  # owner count -> fitted LinearRegression

for owners in owners_values:
    # Row labels of this group inside the pre-defined train/test split.
    # The comparison uses the raw frames because the standardized column no
    # longer holds the original owner counts.
    train_idx = X_train.index[X_train['numPrevOwners'] == owners]
    test_idx = X_test.index[X_test['numPrevOwners'] == owners]

    # Group-scoped metric names: the baseline cell already owns `rmse` and
    # `r2`, and reusing them here would silently overwrite those values.
    # They stay NaN when the group is too small to fit or to score.
    group_rmse = np.nan
    group_r2 = np.nan

    if len(train_idx) >= 2 and len(test_idx) >= 1:
        # Use normalized features; drop the grouping column since it's
        # constant within a group.
        Xtr = X_train_norm_df.loc[train_idx].drop(columns=['numPrevOwners'])
        ytr = Y_train.loc[train_idx]
        Xte = X_test_norm_df.loc[test_idx].drop(columns=['numPrevOwners'])
        yte = Y_test.loc[test_idx]

        group_model = LinearRegression().fit(Xtr, ytr)
        group_models[int(owners)] = group_model

        ypred = group_model.predict(Xte)
        group_rmse = float(np.sqrt(mean_squared_error(yte, ypred)))
        # R^2 is only reported when there are at least two test samples
        if len(yte) > 1:
            group_r2 = float(r2_score(yte, ypred))

    # Single append for both the skipped and the evaluated case.
    results.append({
        'numPrevOwners': int(owners),
        'n_train': int(len(train_idx)),
        'n_test': int(len(test_idx)),
        'rmse': group_rmse,
        'r2': group_r2
    })

metrics_df = pd.DataFrame(results).sort_values('numPrevOwners').reset_index(drop=True)
display(metrics_df)
print("Trained group models saved in variable: group_models")

Unnamed: 0,numPrevOwners,n_train,n_test,rmse,r2
0,1,761,191,1919.411466,1.0
1,2,779,208,1804.24739,1.0
2,3,776,215,1980.980517,1.0
3,4,833,210,1959.639991,1.0
4,5,844,192,2048.306468,1.0
5,6,797,214,1864.17785,1.0
6,7,792,182,2119.996325,0.999999
7,8,798,173,1988.071695,1.0
8,9,822,214,1959.673911,1.0
9,10,798,201,1870.215516,1.0


Trained group models saved in variable: group_models


---
Apply K-means (with K chosen by yourself) to cluster the dataset based on feature similarity. Train
a distinct linear regression model for each cluster and evaluate each model separately. Summarize
the overall performance across all subsets.

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Start again from a fresh read of the CSV (ds_path is set in the loading
# cell). The seed and test fraction match the earlier pre-processing cell.
ds = pd.read_csv(ds_path)

# Separate the price target from the predictors.
Y = ds['price']
X = ds.drop(columns=['price'])

# 80/20 train-test split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=7,
    test_size=0.2,
)

# Standardize: statistics come from the training rows only and are then
# re-used unchanged for the test rows.
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

# Put column names and the original row labels back on the scaled values.
X_train_norm_df = pd.DataFrame(
    X_train_norm,
    index=X_train.index,
    columns=X.columns,
)
X_test_norm_df = pd.DataFrame(
    X_test_norm,
    index=X_test.index,
    columns=X.columns,
)

# Structure check for both frames
X_train_norm_df.info()
X_test_norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 4989 to 9412
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       8000 non-null   float64
 1   numberOfRooms      8000 non-null   float64
 2   hasYard            8000 non-null   float64
 3   hasPool            8000 non-null   float64
 4   floors             8000 non-null   float64
 5   cityCode           8000 non-null   float64
 6   cityPartRange      8000 non-null   float64
 7   numPrevOwners      8000 non-null   float64
 8   made               8000 non-null   float64
 9   isNewBuilt         8000 non-null   float64
 10  hasStormProtector  8000 non-null   float64
 11  basement           8000 non-null   float64
 12  attic              8000 non-null   float64
 13  garage             8000 non-null   float64
 14  hasStorageRoom     8000 non-null   float64
 15  hasGuestRoom       8000 non-null   float64
dtypes: float64(16)
memory usag

### Perform K-Means

In [23]:
from sklearn.cluster import KMeans

# Number of clusters, picked by hand.
k = 3

# Fit the clustering on the standardized training features only; the fitted
# model's labels_ are the cluster assignments of those training rows.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_train_norm_df)
train_clusters = kmeans.labels_

# Test rows are assigned using the clustering learned from the training rows.
test_clusters = kmeans.predict(X_test_norm_df)

# Cluster ids together with how many rows fell into each, per split.
print("Train clusters:", np.unique(train_clusters, return_counts=True))
print("Test clusters:", np.unique(test_clusters, return_counts=True))

Train clusters: (array([0, 1, 2], dtype=int32), array([2003, 2007, 3990]))
Test clusters: (array([0, 1, 2], dtype=int32), array([499, 521, 980]))


In [24]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Fit one LinearRegression per K-means cluster on the standardized training rows.
# Relies on earlier cells for k, train_clusters, X_train_norm_df and Y_train.
# train_clusters holds one label per training row in row order, so rows are
# picked by position (.iloc) rather than by index label.
models = {}  # cluster id -> fitted model; clusters with no training rows get no entry
for c in range(k):
    idx = np.where(train_clusters == c)[0]
    if len(idx) == 0:
        # Nothing to fit on. Leaving the key out lets the prediction cell's
        # `c not in models` check skip this cluster instead of the fit raising.
        continue

    Xc = X_train_norm_df.iloc[idx]
    yc = Y_train.iloc[idx]

    model = LinearRegression()
    model.fit(Xc, yc)
    models[c] = model

In [25]:
import numpy as np
import pandas as pd

# Route every test row to a cluster using the fitted K-means model.
test_clusters = kmeans.predict(X_test_norm_df)

# Predictions share Y_test's index and start out as NaN; any row whose
# cluster is skipped below simply keeps its NaN.
y_test_pred = pd.Series(np.nan, index=Y_test.index, dtype=float)

for c in range(k):
    # positions (not index labels) of the test rows assigned to cluster c
    idx = np.where(test_clusters == c)[0]

    # Skip when the cluster has no test rows or no trained model.
    if len(idx) == 0 or c not in models:
        continue

    model_c = models[c]
    Xc_test = X_test_norm_df.iloc[idx]

    # Fill in this cluster's slots with its own model's predictions.
    y_test_pred.iloc[idx] = model_c.predict(Xc_test)


In [26]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Score the cluster-routed predictions: one metrics row per K-means cluster,
# then pooled metrics over the test set.
# Relies on earlier cells for k, test_clusters, Y_test and y_test_pred.

# The prediction cell leaves NaN for rows whose cluster had no model, and the
# sklearn metrics reject NaN input, so only rows with a prediction are scored.
has_pred = y_test_pred.notna().to_numpy()

cluster_results = []

for c in range(k):
    in_cluster = test_clusters == c
    n_test_c = int(in_cluster.sum())
    if n_test_c == 0:
        continue  # no test data in this cluster

    # Positions of this cluster's test rows that actually have a prediction.
    idx = np.where(in_cluster & has_pred)[0]

    # Metrics stay NaN when nothing in the cluster could be scored.
    mae_c = rmse_c = r2_c = np.nan
    if len(idx) > 0:
        y_true_c = Y_test.iloc[idx]
        y_pred_c = y_test_pred.iloc[idx]

        mae_c = mean_absolute_error(y_true_c, y_pred_c)
        rmse_c = root_mean_squared_error(y_true_c, y_pred_c)
        # R^2 is not well-defined for fewer than two samples.
        if len(idx) > 1:
            r2_c = r2_score(y_true_c, y_pred_c)

    cluster_results.append({
        "cluster": c,
        "n_test": n_test_c,
        "MAE": mae_c,
        "RMSE": rmse_c,
        "R2": r2_c
    })

cluster_results_df = pd.DataFrame(cluster_results).sort_values("cluster")
print("Per-cluster performance:")
display(cluster_results_df)

# Overall performance on the test rows that received a prediction
n_unscored = int((~has_pred).sum())
y_true_scored = Y_test.loc[has_pred]
y_pred_scored = y_test_pred.loc[has_pred]

overall_mae = mean_absolute_error(y_true_scored, y_pred_scored)
overall_rmse = root_mean_squared_error(y_true_scored, y_pred_scored)
overall_r2 = r2_score(y_true_scored, y_pred_scored)

print("\nOverall K-means segmented linear regression performance:")
if n_unscored:
    # Make the exclusion visible rather than silently shrinking the test set.
    print(f"(excluding {n_unscored} test rows without a prediction)")
print(f"MAE  : {overall_mae:.2f}")
print(f"RMSE : {overall_rmse:.2f}")
print(f"R^2  : {overall_r2:.4f}")


Per-cluster performance:



Unnamed: 0,cluster,n_test,MAE,RMSE,R2
0,0,499,1650.181211,2091.290269,0.999999
1,1,521,1414.001619,1810.082049,1.0
2,2,980,1492.570791,1930.332163,1.0



Overall K-means segmented linear regression performance:
MAE  : 1511.43
RMSE : 1941.78
R^2  : 1.0000
