<a href="https://colab.research.google.com/github/kirank981/Project_space/blob/main/project_space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependences


Install the necessary packages for PyTorch (torch and torchvision) and Flower (flwr) and pandas

In [2]:
!pip install -q flwr[simulation] torch torchvision matplotlib pandas

Import everything we need

In [21]:
from collections import OrderedDict
from typing import List, Tuple

from google.colab import drive
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10

from sklearn.model_selection import train_test_split  # Import the train_test_split function

import flwr as fl
from flwr.common import Metrics

DEVICE = torch.device("cpu")  # Try "cuda" to train on GPU
print(
    f"Training on {DEVICE} using PyTorch {torch.__version__} and Flower {fl.__version__}"
)

Training on cpu using PyTorch 2.0.1+cu118 and Flower 1.4.0


# Loading the data

Mounting drive

In [4]:
# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Setting the path to the location of the file

In [5]:
# Define the path to daily dataset folder
daily_dataset_path = Path('/content/drive/MyDrive/Federated learning implementation/dataset/dataset_archive/daily_dataset/daily_dataset')

# Define the path to daily dataset folder
weather_daily_dataset_path = Path('/content/drive/MyDrive/Federated learning implementation/dataset/dataset_archive/weather_daily_dataset.csv')


## Loading daily data
(of energy consumption)


In [6]:

# Initializing list to store dataframes
dfs = []

# Loop through the CSV files and reading them into dataframes
for i in range(111):
    filename = f'block_{i}.csv'
    df = pd.read_csv(daily_dataset_path / filename)
    dfs.append(df)

# Concatenating all the dataframes into a single dataframe
energy_daily_data = pd.concat(dfs, ignore_index=True)

# NRATM(not required at the moment)
# # Group the data by LCLid and create a dictionary of dataframes
# grouped_data = dict(tuple(energy_daily_data.groupby('LCLid')))


Loading data using file name

In [7]:
# Loading data from a specific CSV file
specific_file_data = pd.read_csv(daily_dataset_path / 'block_0.csv')

# Displaying data
print(specific_file_data)

           LCLid         day  energy_median  energy_mean  energy_max  \
0      MAC000002  2012-10-12         0.1385     0.154304       0.886   
1      MAC000002  2012-10-13         0.1800     0.230979       0.933   
2      MAC000002  2012-10-14         0.1580     0.275479       1.085   
3      MAC000002  2012-10-15         0.1310     0.213688       1.164   
4      MAC000002  2012-10-16         0.1450     0.203521       0.991   
...          ...         ...            ...          ...         ...   
25569  MAC005492  2014-02-24         0.1690     0.175042       0.378   
25570  MAC005492  2014-02-25         0.1550     0.160792       0.545   
25571  MAC005492  2014-02-26         0.1490     0.178542       0.687   
25572  MAC005492  2014-02-27         0.1140     0.146167       0.478   
25573  MAC005492  2014-02-28         0.0880     0.088000       0.088   

       energy_count  energy_std  energy_sum  energy_min  
0                46    0.196034       7.098       0.000  
1                48

### Loading required data
Creating a DataFrame that have only the required data

In [8]:
selected_column = ['LCLid','energy_sum','day']
energy_daily_selected=energy_daily_data[selected_column]
# print(energy_daily_selected)

# Group the data by LCLid and create a dictionary of dataframes, allowing to access each dataframe separately using the LCLid as the key
grouped_data_selected = dict(tuple(energy_daily_selected.groupby('LCLid')))
# Display the data for 'MAC000002'
print('\n')
print('Data report of MAC000002')
print(grouped_data_selected['MAC000002'])




Data report of MAC000002
         LCLid  energy_sum         day
0    MAC000002       7.098  2012-10-12
1    MAC000002      11.087  2012-10-13
2    MAC000002      13.223  2012-10-14
3    MAC000002      10.257  2012-10-15
4    MAC000002       9.769  2012-10-16
..         ...         ...         ...
500  MAC000002      12.528  2014-02-24
501  MAC000002      11.826  2014-02-25
502  MAC000002      12.328  2014-02-26
503  MAC000002      20.518  2014-02-27
504  MAC000002       1.387  2014-02-28

[505 rows x 3 columns]


## Loading daily weather data

Creating a 'day' column that stores only the date values from 'time' column
(for linking weather dataset 'day' with daily dataset 'day')

In [9]:
# Load the weather dataset into a DataFrame
weather_daily_data = pd.read_csv(weather_daily_dataset_path)

# Convert the 'time' column to datetime format
weather_daily_data['time'] = pd.to_datetime(weather_daily_data['time'])

# Calculate the mean temperature for each day and store it in a new column 'mean_temp'
weather_daily_data['mean_temp'] = (weather_daily_data['temperatureMax'] + weather_daily_data['temperatureMin']) / 2

# Print the updated DataFrame
print(weather_daily_data)

     temperatureMax   temperatureMaxTime  windBearing                 icon  \
0             11.96  2011-11-11 23:00:00          123                  fog   
1              8.59  2011-12-11 14:00:00          198    partly-cloudy-day   
2             10.33  2011-12-27 02:00:00          225    partly-cloudy-day   
3              8.07  2011-12-02 23:00:00          232                 wind   
4              8.22  2011-12-24 23:00:00          252  partly-cloudy-night   
..              ...                  ...          ...                  ...   
877            9.03  2014-01-26 16:00:00          233    partly-cloudy-day   
878           10.31  2014-02-27 14:00:00          224    partly-cloudy-day   
879           18.97  2014-03-09 14:00:00          172  partly-cloudy-night   
880            8.83  2014-02-12 16:00:00          210                 wind   
881            9.90  2014-02-15 12:00:00          233                 wind   

     dewPoint   temperatureMinTime  cloudCover  windSpeed  pres

### Loading required data
Creating a DataFrame that have only the required data

In [17]:
# Create a new DataFrame with selected columns
selected_columns = ['mean_temp', 'pressure', 'humidity', 'windSpeed', 'time']
weather_selected = weather_daily_data[selected_columns]

# Print the new dataset
print(weather_selected)

     mean_temp  pressure  humidity  windSpeed       time
0       10.405   1016.08      0.95       3.88 2011-11-11
1        5.535   1007.71      0.88       3.94 2011-12-11
2        9.180   1032.76      0.74       3.54 2011-12-27
3        5.315   1012.12      0.87       3.00 2011-12-02
4        5.695   1028.17      0.80       4.46 2011-12-24
..         ...       ...       ...        ...        ...
877      6.145   1002.10      0.79       4.55 2014-01-26
878      7.120   1007.02      0.74       4.14 2014-02-27
879     13.310   1022.44      0.58       2.78 2014-03-09
880      5.930    994.27      0.75       7.24 2014-02-12
881      7.640    988.63      0.69       9.96 2014-02-15

[882 rows x 5 columns]


# Dataset with Household energy consumption values and weather values

## For one household

In [18]:
# Convert 'time' column in weather_selected and 'day' column in grouped_data_selected to datetime objects
weather_selected['time'] = pd.to_datetime(weather_selected['time'])
grouped_data_selected['MAC000246']['day'] = pd.to_datetime(grouped_data_selected['MAC000002']['day'])

# Merge the datasets based on the common date values
merged_data = pd.merge(weather_selected, grouped_data_selected['MAC000002'], left_on='time', right_on='day', how='inner')

# Drop the redundant 'day' column from the merged dataset
merged_data.drop(columns=['day'], inplace=True)

# Display the merged dataset
print(merged_data)

     mean_temp  pressure  humidity  windSpeed       time      LCLid  \
0       10.000   1001.79      0.81       6.54 2012-11-25  MAC000002   
1       12.170   1008.74      0.90       5.74 2012-11-20  MAC000002   
2        7.830    979.63      0.85       4.07 2012-11-01  MAC000002   
3        5.295   1020.29      0.87       3.58 2012-11-06  MAC000002   
4        4.650   1001.72      0.80       5.63 2012-12-07  MAC000002   
..         ...       ...       ...        ...        ...        ...   
273      4.470   1001.76      0.91       1.52 2014-01-30  MAC000002   
274      6.145   1002.10      0.79       4.55 2014-01-26  MAC000002   
275      7.120   1007.02      0.74       4.14 2014-02-27  MAC000002   
276      5.930    994.27      0.75       7.24 2014-02-12  MAC000002   
277      7.640    988.63      0.69       9.96 2014-02-15  MAC000002   

     energy_sum  
0        10.545  
1        11.221  
2        12.209  
3        11.663  
4        13.248  
..          ...  
273      14.166  
274

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_selected['time'] = pd.to_datetime(weather_selected['time'])


## For all households

In [19]:
# Convert 'time' column in weather_selected to datetime objects
weather_selected['time'] = pd.to_datetime(weather_selected['time'])

# Create a list to store the merged data DataFrames
merged_data_list = []

# Iterate through each LCLid in energy_daily_selected
for lclid, data in grouped_data_selected.items():
    # Convert 'day' column in current LCLid data to datetime objects
    data['day'] = pd.to_datetime(data['day'])

    # Merge the current LCLid data with weather_selected based on the common date values
    merged_data_lclid = pd.merge(weather_selected, data, left_on='time', right_on='day', how='inner')

    # Drop the redundant 'day' column from the merged data
    merged_data_lclid.drop(columns=['day'], inplace=True)

    # Append the merged data to the merged_data_list
    merged_data_list.append(merged_data_lclid)

# Concatenate the merged data DataFrames in the list
merged_data = pd.concat(merged_data_list, ignore_index=True)

# Display the merged dataset
print(merged_data)






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_selected['time'] = pd.to_datetime(weather_selected['time'])


         mean_temp  pressure  humidity  windSpeed       time      LCLid  \
0           10.000   1001.79      0.81       6.54 2012-11-25  MAC000002   
1           12.170   1008.74      0.90       5.74 2012-11-20  MAC000002   
2            7.830    979.63      0.85       4.07 2012-11-01  MAC000002   
3            5.295   1020.29      0.87       3.58 2012-11-06  MAC000002   
4            4.650   1001.72      0.80       5.63 2012-12-07  MAC000002   
...            ...       ...       ...        ...        ...        ...   
1541994      4.470   1001.76      0.91       1.52 2014-01-30  MAC005567   
1541995      6.145   1002.10      0.79       4.55 2014-01-26  MAC005567   
1541996      7.120   1007.02      0.74       4.14 2014-02-27  MAC005567   
1541997      5.930    994.27      0.75       7.24 2014-02-12  MAC005567   
1541998      7.640    988.63      0.69       9.96 2014-02-15  MAC005567   

         energy_sum  
0            10.545  
1            11.221  
2            12.209  
3          

Storing the dataset in a .csv file

# Splitting dataset to training and testing sets



Counting amount of data for each LCLid (data available for each household)

In [25]:


# Convert 'time' column in weather_selected to datetime objects
weather_selected['time'] = pd.to_datetime(weather_selected['time'])

# Merge the data into merged_data DataFrame as described in your previous code

# Count the number of data rows for each LCLid
lclid_data_counts = merged_data['LCLid'].value_counts()

# Display the counts for each LCLid
print("Data row counts for each LCLid:")
print(lclid_data_counts)


Data row counts for each LCLid:
MAC000145    402
MAC000147    402
MAC000148    402
MAC000149    402
MAC000150    402
            ... 
MAC005560      1
MAC001150      1
MAC005563      1
MAC001957      1
MAC005565      1
Name: LCLid, Length: 5549, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_selected['time'] = pd.to_datetime(weather_selected['time'])


Identifiting the no of households with insufficient amount of data

In [30]:
# Count the number of data rows for each LCLid
lclid_data_counts = merged_data['LCLid'].value_counts()


# Get the total number of unique LCLid values
total_lclids = len(lclid_data_counts)

# Count the number of LCLid values with less than 100 data rows
count_less_than_100 = (lclid_data_counts < 100).sum()

# Display the count of LCLid values with less than 100 data rows
print("Number of LCLid values with less than 100 data rows:", count_less_than_100)

# Display the total number of unique LCLid values
print("Total number of unique LCLid values:", total_lclids)

Number of LCLid values with less than 50 data rows: 61
Total number of unique LCLid values: 5549


Removing the households with less data, from the dataset

In [32]:
# Get the list of LCLid values with less than 100 data rows
lclids_to_remove = lclid_data_counts[lclid_data_counts < 100].index

# Remove rows corresponding to LCLid values with less than 100 data rows
filtered_data = merged_data[~merged_data['LCLid'].isin(lclids_to_remove)]

# Display the filtered data
print(filtered_data)

         mean_temp  pressure  humidity  windSpeed       time      LCLid  \
0           10.000   1001.79      0.81       6.54 2012-11-25  MAC000002   
1           12.170   1008.74      0.90       5.74 2012-11-20  MAC000002   
2            7.830    979.63      0.85       4.07 2012-11-01  MAC000002   
3            5.295   1020.29      0.87       3.58 2012-11-06  MAC000002   
4            4.650   1001.72      0.80       5.63 2012-12-07  MAC000002   
...            ...       ...       ...        ...        ...        ...   
1541994      4.470   1001.76      0.91       1.52 2014-01-30  MAC005567   
1541995      6.145   1002.10      0.79       4.55 2014-01-26  MAC005567   
1541996      7.120   1007.02      0.74       4.14 2014-02-27  MAC005567   
1541997      5.930    994.27      0.75       7.24 2014-02-12  MAC005567   
1541998      7.640    988.63      0.69       9.96 2014-02-15  MAC005567   

         energy_sum  
0            10.545  
1            11.221  
2            12.209  
3          

Splitting the dataset to train and test where the split ratio(70:30) is made with every househould's data.

In [40]:
# Define the split percentages
train_percentage = 0.8 # 80% for training, 30% for testing
min_data_points = 10  # Minimum number of data points required for an LCLid

# Create a list to store DataFrames for training and testing
train_data_list = []
test_data_list = []

# Iterate through each unique LCLid and split the data based on train_percentage
unique_lclids = filtered_data['LCLid'].unique()
for lclid in unique_lclids:
    lclid_data = filtered_data[filtered_data['LCLid'] == lclid]

    # Check if there are sufficient data points for the current LCLid
    if len(lclid_data) >= min_data_points:
        # Split the data for the current LCLid into training and test sets
        train_data_lclid, test_data_lclid = train_test_split(lclid_data, train_size=train_percentage, shuffle=False)

        # Randomize the rows within each subset
        train_data_lclid = train_data_lclid.sample(frac=1, random_state=42)
        test_data_lclid = test_data_lclid.sample(frac=1, random_state=42)

        # Append to the train_data_list and test_data_list
        train_data_list.append(train_data_lclid)
        test_data_list.append(test_data_lclid)

# Concatenate the DataFrames in the lists
FL_train_set = pd.concat(train_data_list, ignore_index=True)
FL_test_set = pd.concat(test_data_list, ignore_index=True)

# Store the training and test sets in separate lists
FL_train_set_list = train_data_list
FL_test_sets_list = test_data_list

# Display the training and test sets
print("Training set:")
print(FL_train_set)
print("Test set:")
print(FL_test_set)

Training set:
         mean_temp  pressure  humidity  windSpeed       time      LCLid  \
0           10.125   1024.17      0.87       3.71 2013-01-08  MAC005417   
1            6.135    985.82      0.92       4.22 2012-12-14  MAC005417   
2            5.295   1020.29      0.87       3.58 2012-11-06  MAC005417   
3            2.100   1007.72      0.82       4.23 2013-01-23  MAC005417   
4            6.840   1000.02      0.87       4.40 2014-01-05  MAC005417   
...            ...       ...       ...        ...        ...        ...   
1228789      9.935    999.12      0.91       4.28 2012-12-24  MAC002462   
1228790      8.245   1004.48      0.84       2.37 2012-10-30  MAC002462   
1228791      1.215    999.17      0.93       3.36 2013-02-11  MAC002462   
1228792     -0.405    994.61      0.90       1.22 2013-01-21  MAC002462   
1228793     11.210   1003.59      0.86       5.51 2012-12-23  MAC002462   

         energy_sum  
0            12.877  
1            14.770  
2             2.042

In [47]:
# Display the first training set
print("First Training set:")
print(FL_train_set_list[0])
print("First Test set:")
print(FL_test_sets_list[0])

First Training set:
         mean_temp  pressure  humidity  windSpeed       time      LCLid  \
1502740     10.125   1024.17      0.87       3.71 2013-01-08  MAC005417   
1502625      6.135    985.82      0.92       4.22 2012-12-14  MAC005417   
1502542      5.295   1020.29      0.87       3.58 2012-11-06  MAC005417   
1502674      2.100   1007.72      0.82       4.23 2013-01-23  MAC005417   
1502787      6.840   1000.02      0.87       4.40 2014-01-05  MAC005417   
...            ...       ...       ...        ...        ...        ...   
1502696      5.080   1038.71      0.72       2.17 2013-11-26  MAC005417   
1502587      8.350   1020.98      0.92       3.06 2012-11-12  MAC005417   
1502774      9.230    986.81      0.75       7.86 2013-12-27  MAC005417   
1502734      8.165   1025.35      0.83       4.56 2013-12-08  MAC005417   
1502560      9.975   1026.48      0.86       1.86 2012-11-14  MAC005417   

         energy_sum  
1502740      12.877  
1502625      14.770  
1502542      

Checking the percentage split with training and test sets

In [46]:
# Calculate the number of rows in the training and test sets
num_rows_train = len(FL_train_set_list[0])
num_rows_test = len(FL_test_sets_list[0])

# Calculate the ratio
ratio = num_rows_train / num_rows_test

print(f"Number of rows in training set: {num_rows_train}")
print(f"Number of rows in test set: {num_rows_test}")
print(f"Training to test ratio: {ratio:.2f}")

Number of rows in training set: 210
Number of rows in test set: 53
Training to test ratio: 3.96


In [35]:
# Randomize the rows of filtered_data
filtered_data = filtered_data.sample(frac=1, random_state=42)

# Define the split percentages
train_percentage = 0.7  # 70% for training, 30% for testing

# Split the data into training and test sets
Central_train_data, Central_test_data = train_test_split(filtered_data, train_size=train_percentage, random_state=42)

# Display the training and test sets
print("Training set:")
print(Central_train_data)
print("Test set:")
print(Central_test_data)

Training set:
         mean_temp  pressure  humidity  windSpeed       time      LCLid  \
1144520      2.630   1018.79      0.86       0.68 2013-02-09  MAC004202   
1279662      6.915   1023.52      0.70       1.11 2013-02-16  MAC004663   
562072       7.760   1032.19      0.96       2.23 2013-01-06  MAC002020   
1412775      5.490   1021.19      0.90       0.93 2013-01-09  MAC005119   
288647       2.825   1015.57      0.67       1.81 2013-03-14  MAC000974   
...            ...       ...       ...        ...        ...        ...   
687152       7.760   1022.69      0.92       0.62 2012-11-15  MAC002497   
1364493      0.845   1027.42      0.96       0.93 2012-02-05  MAC004953   
223876       3.520   1015.47      0.71       4.97 2013-02-02  MAC000731   
1325386      7.755    997.84      0.91       2.38 2012-11-10  MAC004835   
462361      12.360   1016.56      0.89       4.42 2013-10-31  MAC001640   

         energy_sum  
1144520       4.716  
1279662       0.903  
562072       11.490