# Assignment 3

Name - Aziz Sayyad, Roll - 381069, PRN - 22420090, Batch - P2

"Implement a Federated Learning framework for a linear regression task where multiple clients 
collaboratively learn a global prediction model without sharing their raw data. For a house price 
dataset can be split across clients, each client trains a local linear regression model, and a central server applies Federated Averaging (FedAvg) to combine the local model parameters into a 
global model while preserving data privacy. "

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


In [4]:
# Loading dataset

df = pd.read_excel("Housing.xlsx")

# Display first 5 rows
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
# Check dataset information
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill numeric columns using mean
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill categorical columns using mode
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

print("\nMissing values after cleaning:\n", df.isnull().sum())


Dataset Shape: (545, 13)

Missing Values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

Missing values after cleaning:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [6]:
# Convert categorical variables into numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [7]:
# Separate features (X) and target (y)

X = df.drop("price", axis=1)
y = df["price"]

print("Feature matrix shape:", X.shape)


Feature matrix shape: (545, 13)


In [8]:
# Normalize features between 0 and 1

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

X_scaled.head()


Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.396564,0.6,0.333333,0.666667,0.666667,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.502405,0.6,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.571134,0.4,0.333333,0.333333,0.666667,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.402062,0.6,0.333333,0.333333,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,0.396564,0.6,0.0,0.333333,0.666667,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Number of clients
num_clients = 3

# Shuffle dataset before splitting
indices = np.random.permutation(len(X_scaled))
X_scaled = X_scaled.iloc[indices]
y = y.iloc[indices]

client_data = []

# Equal data distribution
data_per_client = len(X_scaled) // num_clients

for i in range(num_clients):
    
    start = i * data_per_client
    end = (i + 1) * data_per_client
    
    X_client = X_scaled.iloc[start:end].values
    y_client = y.iloc[start:end].values
    
    client_data.append((X_client, y_client))

print("Data successfully distributed to", num_clients, "clients")


Data successfully distributed to 3 clients


In [10]:
# Initializing global weights and bias

num_features = X_scaled.shape[1]

global_weights = np.zeros(num_features)
global_bias = 0

In [11]:
def train_local_model(X, y, weights, bias, lr=0.01, epochs=100):
    """
    Trains linear regression using gradient descent on local client data.
    """
    
    w = weights.copy()
    b = bias
    n = len(X)
    
    for _ in range(epochs):
        
        # Predictions
        y_pred = np.dot(X, w) + b
        
        # Compute gradients
        dw = (1/n) * np.dot(X.T, (y_pred - y))
        db = (1/n) * np.sum(y_pred - y)
        
        # Update parameters
        w -= lr * dw
        b -= lr * db
        
    return w, b

In [16]:
# Number of federated communication rounds
rounds = 25   

for r in range(rounds):
    
    local_weights = []
    local_biases = []
    
    # Each client trains locally on private data
    for X_client, y_client in client_data:
        
        w, b = train_local_model(
            X_client,
            y_client,
            global_weights,
            global_bias
        )
        
        # Clients send only model parameters
        local_weights.append(w)
        local_biases.append(b)
    
    # Server performs Federated Averaging
    global_weights = np.mean(local_weights, axis=0)
    global_bias = np.mean(local_biases)
    
    # Evaluate global model
    y_pred_global = np.dot(X_scaled.values, global_weights) + global_bias
    mse = mean_squared_error(y, y_pred_global)
    
    # Clean  output
    print(f"Round {r+1:02d} | Global MSE: {mse:.2f}")

Round 01 | Global MSE: 1150753649951.98
Round 02 | Global MSE: 1149331052009.89
Round 03 | Global MSE: 1147965367060.93
Round 04 | Global MSE: 1146653880443.99
Round 05 | Global MSE: 1145394049021.67
Round 06 | Global MSE: 1144183486157.59
Round 07 | Global MSE: 1143019948372.20
Round 08 | Global MSE: 1141901323463.55
Round 09 | Global MSE: 1140825619908.78
Round 10 | Global MSE: 1139790957386.44
Round 11 | Global MSE: 1138795558281.35
Round 12 | Global MSE: 1137837740051.89
Round 13 | Global MSE: 1136915908355.75
Round 14 | Global MSE: 1136028550843.52
Round 15 | Global MSE: 1135174231541.54
Round 16 | Global MSE: 1134351585755.60
Round 17 | Global MSE: 1133559315435.77
Round 18 | Global MSE: 1132796184950.42
Round 19 | Global MSE: 1132061017224.01
Round 20 | Global MSE: 1131352690198.86
Round 21 | Global MSE: 1130670133586.29
Round 22 | Global MSE: 1130012325876.57
Round 23 | Global MSE: 1129378291580.90
Round 24 | Global MSE: 1128767098682.03
Round 25 | Global MSE: 1128177856272.80


In [17]:
print("\nFinal Global Model Parameters")
print("Weights:", global_weights)
print("Bias:", global_bias)


Final Global Model Parameters
Weights: [2753233.02746817 1022284.26147182 2555326.23722739 1338616.50897409
  993634.79816038  496410.1599519   343949.40182197  322993.84187764
  851480.66854438  904579.24561809  694228.30174578  -58864.70823837
 -428022.72036856]
Bias: 1947099.7648754306
