In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [68]:
# Read the training and test tables from CSV files (paths are relative to the working directory)
train_path, test_path = "Train_Data.csv", "Test_Data.csv"
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


In [69]:
# Print the column labels of the training table.
# Note: the label 'aboveground space ' is printed with a trailing space, so any
# later reference to that column has to include the space exactly.
print(train_data.columns)

Index(['date', 'number of rooms', 'security level of the community',
       'residence space', 'building space', 'noise level', 'waterfront',
       'view', 'air quality level', 'aboveground space ', 'basement space',
       'building year', 'decoration year', 'district', 'city', 'zip code',
       'region', 'exchange rate', 'unit price of residence space',
       'unit price of building space', 'total cost'],
      dtype='object')


In [70]:
 # Preview the five columns that the total-cost computation multiplies together.
 train_data[["unit price of residence space", "residence space", "unit price of building space", "building space", "exchange rate"]]


Unnamed: 0,unit price of residence space,residence space,unit price of building space,building space,exchange rate
0,11.886409,2820,0.977028,67518,6.784830
1,25.681414,1210,0.698603,9400,6.243129
2,16.921174,2200,0.238882,9397,6.010127
3,19.632230,1200,0.876178,9720,7.560375
4,28.205930,1370,0.132472,5858,6.543941
...,...,...,...,...,...
3995,22.244430,3840,0.573261,85728,6.242422
3996,51.508733,2310,0.527359,13430,6.742414
3997,39.224068,3360,0.360142,22111,6.289534
3998,46.203935,2640,0.493162,3680,7.108628


In [71]:
# Total cost = (residence unit price * residence space
#               + building unit price * building space) * exchange rate.
# This overwrites the 'total cost' column that is already present in the loaded table.
residence_cost = train_data["unit price of residence space"] * train_data["residence space"]
building_cost = train_data["unit price of building space"] * train_data["building space"]
train_data["total cost"] = (residence_cost + building_cost) * train_data["exchange rate"]


In [72]:
# Remove the two unit-price columns now that they have been folded into "total cost"
unit_price_columns = ["unit price of residence space", "unit price of building space"]
train_data = train_data.drop(columns=unit_price_columns)


In [73]:
# Price ranges over the total cost column, as half-open (lower, upper) pairs
# built from consecutive boundaries; the last range is unbounded above.
price_bounds = [0, 300000, 500000, 700000, float('inf')]
price_ranges = list(zip(price_bounds[:-1], price_bounds[1:]))

In [74]:
# Define a function to assign each total cost to a price range
def assign_price_range(total_cost, ranges=None):
    """Return the 1-based index of the price range that contains ``total_cost``.

    Parameters
    ----------
    total_cost : number
        The value to classify.
    ranges : sequence of (lower, upper) pairs, optional
        Half-open intervals ``lower <= value < upper``, checked in order.
        Defaults to the notebook-level ``price_ranges``.

    Returns
    -------
    int or None
        Position (starting at 1) of the first matching range, or ``None`` when
        no range matches (for example a negative or NaN cost).
    """
    if ranges is None:
        # Looked up at call time so the notebook-level definition is always used.
        ranges = price_ranges
    for label, (lower, upper) in enumerate(ranges, start=1):
        if lower <= total_cost < upper:
            return label
    # Explicit "no match" result; NaN lands here because every comparison with it is False.
    return None

In [75]:
# Label every training row with the index of the price range its total cost falls in.
# assign_price_range returns None when no range matches, so such rows end up without a label.
total_cost_column = train_data['total cost']
train_data['price range'] = total_cost_column.apply(assign_price_range)

In [76]:
# Fit a single encoder on the districts of both tables so that every district
# appearing in either one is known to the encoder.
combined_districts = pd.concat([train_data['district'], test_data['district']])
encoder = LabelEncoder().fit(combined_districts)

# Replace the district values with their encoded form in each table
for frame in (train_data, test_data):
    frame['district'] = encoder.transform(frame['district'])

In [77]:
# Repeat the process for the city column.
# Build the combined train + test city values here: the encoder must be fitted
# on a Series that actually exists in the notebook (the previously referenced
# `combined_data` was never defined, which raised NameError on a fresh run).
combined_cities = pd.concat([train_data['city'], test_data['city']])

encoder = LabelEncoder()
encoder.fit(combined_cities)

train_data['city'] = encoder.transform(train_data['city'])
test_data['city'] = encoder.transform(test_data['city'])

In [78]:
# Repeat the process for the zip code column.
# Build the combined train + test zip codes here: the encoder must be fitted
# on a Series that actually exists in the notebook (the previously referenced
# `combined_data` was never defined, which raised NameError on a fresh run).
combined_zip_codes = pd.concat([train_data['zip code'], test_data['zip code']])

encoder = LabelEncoder()
encoder.fit(combined_zip_codes)

train_data['zip code'] = encoder.transform(train_data['zip code'])
test_data['zip code'] = encoder.transform(test_data['zip code'])

In [79]:
# Label column to predict and the columns used as model inputs.
# 'aboveground space ' keeps its trailing space to match the column label in the data.
target = 'price range'
features = [
    'number of rooms',
    'security level of the community',
    'residence space',
    'building space',
    'noise level',
    'waterfront',
    'view',
    'air quality level',
    'aboveground space ',
    'basement space',
    'building year',
    'decoration year',
    'district',
    'city',
    'zip code',
]

# Hold out 20% of the rows as a validation set; train_data is rebound to the remaining 80%
train_data, val_data = train_test_split(train_data, random_state=42, test_size=0.2)

In [80]:
# Split the train_data into four parts for the clients.
# Partition the row positions and slice with .iloc instead of passing the
# DataFrame itself to np.array_split: that path goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. The resulting contiguous,
# near-equal row blocks are the same.
n_clients = 4
row_blocks = np.array_split(np.arange(len(train_data)), n_clients)
client_data = [train_data.iloc[rows] for rows in row_blocks]

  return bound(*args, **kwds)


In [82]:
# Define a function to train the model on each client's data
def train_client_model(client_data):
    """Fit a random forest on one client's rows and return the fitted model.

    The inputs are the ``features`` columns and the label is the ``target``
    column of ``client_data``. The forest uses 100 trees, a maximum depth of 20
    and a fixed ``random_state`` of 42.
    """
    forest = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
    # fit() returns the estimator itself, so the fitted forest is handed straight back.
    return forest.fit(client_data[features], client_data[target])

# Define a function to aggregate predictions (majority voting) from client models
def aggregate_predictions(client_models, X_val):
    """Combine the client models' predictions by per-sample majority vote.

    Parameters
    ----------
    client_models : iterable
        Objects exposing ``predict(X_val)``; each must return predictions of
        the same shape.
    X_val
        Passed unchanged to every model's ``predict``.

    Returns
    -------
    numpy.ndarray
        For each sample, the label predicted by the most models. Ties go to the
        smallest label (labels are compared in sorted order). Works for any
        sortable label type, not only non-negative integers.

    Raises
    ------
    ValueError
        If ``client_models`` is empty.
    """
    per_model = [model.predict(X_val) for model in client_models]
    if not per_model:
        raise ValueError("client_models must contain at least one model")

    stacked = np.array(per_model)
    # One row per model, one column per predicted value.
    votes = stacked.reshape(stacked.shape[0], -1)

    def _most_common(column):
        # np.unique returns labels sorted, and argmax picks the first maximum,
        # so the smallest label wins a tie.
        labels, counts = np.unique(column, return_counts=True)
        return labels[counts.argmax()]

    winners = [_most_common(votes[:, j]) for j in range(votes.shape[1])]
    # Keep the prediction dtype and restore the per-model prediction shape.
    return np.array(winners, dtype=stacked.dtype).reshape(stacked.shape[1:])

# Define a function to evaluate the aggregated predictions
def evaluate_aggregated_predictions(client_models, val_data):
    """Return the accuracy of the majority-vote predictions on ``val_data``.

    The ``features`` columns of ``val_data`` are passed to
    ``aggregate_predictions`` and the result is scored against the ``target``
    column with ``accuracy_score``.
    """
    inputs, true_labels = val_data[features], val_data[target]
    return accuracy_score(true_labels, aggregate_predictions(client_models, inputs))

# Perform federated learning
num_rounds = 10

# Nothing changes between rounds: the client partitions are fixed and
# train_client_model builds every forest with the same fixed random_state, so
# each round would rebuild identical models and report the same accuracy.
# Train and evaluate once, then report that result for every round.
# TODO(review): if rounds are meant to differ, some state (data, seed or model)
# has to be carried or varied from one round to the next.
client_models = [train_client_model(client) for client in client_data]
accuracy = evaluate_aggregated_predictions(client_models, val_data)

for i in range(num_rounds):
    print("Round:", i+1)
    print(f"Round {i+1} Accuracy: {accuracy:.4f}")


Round: 1
Round 1 Accuracy: 0.6275
Round: 2
Round 2 Accuracy: 0.6275
Round: 3
Round 3 Accuracy: 0.6275
Round: 4
Round 4 Accuracy: 0.6275
Round: 5
Round 5 Accuracy: 0.6275
Round: 6
Round 6 Accuracy: 0.6275
Round: 7
Round 7 Accuracy: 0.6275
Round: 8
Round 8 Accuracy: 0.6275
Round: 9
Round 9 Accuracy: 0.6275
Round: 10
Round 10 Accuracy: 0.6275
