# Assignment 2

Name - Aziz Sayyad, Roll - 381069, PRN - 22420090, Batch - P2

"To perform data pre-processing and partitioning for a Federated Learning system using a real 
dataset such as a student performance dataset (e.g., study hours, attendance, internal marks, final 
score). The dataset is cleaned by handling missing values and normalization, then split into 
multiple local datasets and distributed among participating devices or nodes so that each client 
trains on its own private data without sharing raw information."

In [1]:
# imports

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [4]:
df = pd.read_csv("student_performance.csv")
df.head()


Unnamed: 0,study_hours,attendance,internal_marks,final_score
0,5,80,25,60
1,7,90,30,75
2,4,70,20,55
3,6,85,28,70


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   study_hours     4 non-null      int64
 1   attendance      4 non-null      int64
 2   internal_marks  4 non-null      int64
 3   final_score     4 non-null      int64
dtypes: int64(4)
memory usage: 260.0 bytes


In [9]:
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())

Dataset Shape: (4, 4)

Missing Values:
 study_hours       0
attendance        0
internal_marks    0
final_score       0
dtype: int64


In [14]:
# Replace missing values with column mean (mean imputation)
# This ensures no data loss while cleaning the dataset
df.fillna(df.mean(), inplace=True)

# Verify that missing values are handled
print("Missing values after cleaning:\n", df.isnull().sum())


Missing values after cleaning:
 study_hours       0
attendance        0
internal_marks    0
final_score       0
dtype: int64


In [15]:
# Separate input features (X) and target variable (y)
# final_score is the output we want to predict

X = df.drop("final_score", axis=1)
y = df["final_score"]

print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (4, 3)
Target vector shape: (4,)


In [13]:
# Normalize features

# we scale only input features, not the target variable

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled.head()

Unnamed: 0,study_hours,attendance,internal_marks
0,0.333333,0.5,0.5
1,1.0,1.0,1.0
2,0.0,0.0,0.0
3,0.666667,0.75,0.8


In [16]:
# Initialize MinMaxScaler to scale features between 0 and 1
scaler = MinMaxScaler()

# Fit and transform feature matrix
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame for better readability
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Display normalized data
X_scaled.head()


Unnamed: 0,study_hours,attendance,internal_marks
0,0.333333,0.5,0.5
1,1.0,1.0,1.0
2,0.0,0.0,0.0
3,0.666667,0.75,0.8


In [17]:
# Number of simulated clients/devices in federated system
num_clients = 3

# List to store local datasets for each client
client_data = []

# Determine number of samples per client
data_per_client = len(X_scaled) // num_clients

# Split dataset into equal partitions
for i in range(num_clients):
    
    # Define start and end index for each client
    start = i * data_per_client
    end = (i + 1) * data_per_client
    
    # Assign local feature and target data to client
    X_client = X_scaled.iloc[start:end]
    y_client = y.iloc[start:end]
    
    # Store client dataset as tuple
    client_data.append((X_client, y_client))

print("Data successfully distributed to", num_clients, "clients")


Data successfully distributed to 3 clients


In [18]:
# Display the shape of each client's local dataset
# This ensures proper partitioning without data sharing

for i, (X_client, y_client) in enumerate(client_data):
    print(f"\nClient {i+1}")
    print("Local Feature Shape:", X_client.shape)
    print("Local Target Shape:", y_client.shape)



Client 1
Local Feature Shape: (1, 3)
Local Target Shape: (1,)

Client 2
Local Feature Shape: (1, 3)
Local Target Shape: (1,)

Client 3
Local Feature Shape: (1, 3)
Local Target Shape: (1,)
