In [11]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils.KEprocessing

In [12]:
# Define path to the CSV file
csv_path = 'data/combined_df_noNA.csv'

# Check if the file exists
if os.path.exists(csv_path):
    # If the file exists, read the DataFrame from the CSV
    combined_df = pd.read_csv(csv_path)
else:
    # If the file does not exist, call the function to prepare the DataFrame
    combined_df = utils.KEprocessing.prepare_final_df()

In [13]:
combined_df

Unnamed: 0.1,Unnamed: 0,CUSTOMER,AREA,ISPRIVATEPERSON,DATE,HOUR_0,HOUR_1,HOUR_2,HOUR_3,HOUR_4,...,HOUR_17,HOUR_18,HOUR_19,HOUR_20,HOUR_21,HOUR_22,HOUR_23,YEAR,One_Day_Power,One_Day_Power_NaN
0,0,1060598736,Kvarnholmen,Nej,2020-01-01,0.0112,0.0124,0.0120,0.0116,0.0128,...,0.0104,0.0100,0.0108,0.0108,0.0108,0.0100,0.0100,2020,0.2604,0
1,1,1060598736,Kvarnholmen,Nej,2020-01-02,0.0100,0.0092,0.0092,0.0088,0.0100,...,0.0100,0.0100,0.0092,0.0112,0.0112,0.0112,0.0100,2020,0.2296,0
2,2,1060598736,Kvarnholmen,Nej,2020-01-03,0.0112,0.0108,0.0108,0.0100,0.0100,...,0.0108,0.0112,0.0112,0.0108,0.0104,0.0104,0.0112,2020,0.2452,0
3,3,1060598736,Kvarnholmen,Nej,2020-01-04,0.0100,0.0088,0.0092,0.0092,0.0096,...,0.0116,0.0084,0.0092,0.0108,0.0116,0.0108,0.0112,2020,0.2424,0
4,4,1060598736,Kvarnholmen,Nej,2020-01-05,0.0112,0.0112,0.0104,0.0112,0.0112,...,0.0116,0.0108,0.0120,0.0112,0.0108,0.0112,0.0108,2020,0.2648,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794513,3796402,2411372971,Malmen,Nej,2023-12-27,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2023,0.0000,0
3794514,3796403,2411372971,Malmen,Nej,2023-12-28,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2023,0.0000,0
3794515,3796404,2411372971,Malmen,Nej,2023-12-29,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2023,0.0000,0
3794516,3796405,2411372971,Malmen,Nej,2023-12-30,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2023,0.0000,0


# Merge power dataset, weather dataset, and price dataset and save it in final_df.csv

# 1) First reshape the power dataframe

In [14]:
import pandas as pd

# Assuming combined_df is already loaded and has the columns 'DATE', 'HOUR_0', 'HOUR_1', ..., 'HOUR_23'
# Convert the DATE column to datetime data type
combined_df['DATE'] = pd.to_datetime(combined_df['DATE'])

# Melt the hourly columns into two new columns 'Time' and 'Power_Consumption'
combined_df_long = combined_df.melt(id_vars=['CUSTOMER', 'AREA', 'ISPRIVATEPERSON', 'DATE', 'YEAR', 'One_Day_Power', 'One_Day_Power_NaN'],
                                    value_vars=[f'HOUR_{i}' for i in range(24)],
                                    var_name='Time',
                                    value_name='Power_Consumption')

# Convert the 'Time' column to represent the actual time of day
combined_df_long['Time'] = combined_df_long['Time'].str.extract('(\d+)').astype(int)
combined_df_long['Time'] = pd.to_timedelta(combined_df_long['Time'], unit='h')

# Create 'DateTime' by adding 'Time' to 'DATE'
combined_df_long['DateTime'] = combined_df_long['DATE'] + combined_df_long['Time']

# Convert the DateTime column to datetime data type
combined_df_long['DateTime'] = pd.to_datetime(combined_df_long['DateTime'])

# Now you can drop the 'DATE' and 'Time' columns or other columns if they are no longer needed
combined_df_long = combined_df_long.drop(['DATE', 'Time','YEAR',], axis=1)

# Reorder columns to have 'DateTime' at the front if desired
combined_df_long = combined_df_long[['DateTime', 'CUSTOMER', 'AREA', 'ISPRIVATEPERSON', 'Power_Consumption','One_Day_Power','One_Day_Power_NaN']]

# Sort the DataFrame by the 'DateTime' column to ensure that the dates and times are in order
combined_df_long.sort_values(by='DateTime', inplace=True)

# Reset the index of the DataFrame after sorting
combined_df_long.reset_index(drop=True, inplace=True)

# Check the result
print(combined_df_long.head())

    DateTime    CUSTOMER         AREA ISPRIVATEPERSON  Power_Consumption  \
0 2020-01-01  1060598736  Kvarnholmen             Nej           0.011200   
1 2020-01-01  1060753918       Malmen             Nej           0.001797   
2 2020-01-01  1060753924       Malmen             Nej           0.021600   
3 2020-01-01  1060753932       Malmen             Nej           0.004122   
4 2020-01-01  1060753945       Malmen             Nej           0.001120   

   One_Day_Power  One_Day_Power_NaN  
0       0.260400                  0  
1       0.054926                  0  
2       0.564000                  0  
3       0.107658                  0  
4       0.027033                  0  


In [15]:
combined_df_long

Unnamed: 0,DateTime,CUSTOMER,AREA,ISPRIVATEPERSON,Power_Consumption,One_Day_Power,One_Day_Power_NaN
0,2020-01-01 00:00:00,1060598736,Kvarnholmen,Nej,0.011200,0.260400,0
1,2020-01-01 00:00:00,1060753918,Malmen,Nej,0.001797,0.054926,0
2,2020-01-01 00:00:00,1060753924,Malmen,Nej,0.021600,0.564000,0
3,2020-01-01 00:00:00,1060753932,Malmen,Nej,0.004122,0.107658,0
4,2020-01-01 00:00:00,1060753945,Malmen,Nej,0.001120,0.027033,0
...,...,...,...,...,...,...,...
91068427,2023-12-31 23:00:00,1060619147,Malmen,Ja,0.000101,0.003008,0
91068428,2023-12-31 23:00:00,1060619139,Malmen,Nej,0.000028,0.000687,0
91068429,2023-12-31 23:00:00,1060619133,Malmen,Ja,0.000071,0.004211,0
91068430,2023-12-31 23:00:00,1060619347,Malmen,Ja,0.000376,0.017268,0


In [16]:
combined_df_long.to_csv('data/combined_df_long.csv', index=False)

In [17]:
print(f"There are {combined_df.shape[0]} rows in the combined_df." )
print(f"There are {combined_df_long.shape[0]} rows in the combined_df_long.")

There are 3794518 rows in the combined_df.
There are 91068432 rows in the combined_df_long.


# Check if it is done correctly

In [19]:
import pandas as pd

# Filter the records for user 1060598736
user_records = combined_df[combined_df['CUSTOMER'] == 1060598736]

# Display the records for the user
print(user_records)

         Unnamed: 0    CUSTOMER         AREA ISPRIVATEPERSON       DATE  \
0                 0  1060598736  Kvarnholmen             Nej 2020-01-01   
1                 1  1060598736  Kvarnholmen             Nej 2020-01-02   
2                 2  1060598736  Kvarnholmen             Nej 2020-01-03   
3                 3  1060598736  Kvarnholmen             Nej 2020-01-04   
4                 4  1060598736  Kvarnholmen             Nej 2020-01-05   
...             ...         ...          ...             ...        ...   
2540418     2542307  1060598736  Kvarnholmen             Nej 2023-12-27   
2540419     2542308  1060598736  Kvarnholmen             Nej 2023-12-28   
2540420     2542309  1060598736  Kvarnholmen             Nej 2023-12-29   
2540421     2542310  1060598736  Kvarnholmen             Nej 2023-12-30   
2540422     2542311  1060598736  Kvarnholmen             Nej 2023-12-31   

         HOUR_0  HOUR_1  HOUR_2  HOUR_3  HOUR_4  ...  HOUR_17  HOUR_18  \
0        0.0112  0.0124  

In [20]:
import pandas as pd

# Filter the records for user 1060598736
user_records = combined_df_long[combined_df_long['CUSTOMER'] == 1060598736]

# Display the records for the user
print(user_records)

                    DateTime    CUSTOMER         AREA ISPRIVATEPERSON  \
0        2020-01-01 00:00:00  1060598736  Kvarnholmen             Nej   
1229     2020-01-01 01:00:00  1060598736  Kvarnholmen             Nej   
1971     2020-01-01 02:00:00  1060598736  Kvarnholmen             Nej   
2474     2020-01-01 03:00:00  1060598736  Kvarnholmen             Nej   
3453     2020-01-01 04:00:00  1060598736  Kvarnholmen             Nej   
...                      ...         ...          ...             ...   
91038853 2023-12-31 19:00:00  1060598736  Kvarnholmen             Nej   
91045680 2023-12-31 20:00:00  1060598736  Kvarnholmen             Nej   
91052507 2023-12-31 21:00:00  1060598736  Kvarnholmen             Nej   
91059334 2023-12-31 22:00:00  1060598736  Kvarnholmen             Nej   
91066031 2023-12-31 23:00:00  1060598736  Kvarnholmen             Nej   

          Power_Consumption  One_Day_Power  One_Day_Power_NaN  
0                    0.0112         0.2604                 

In [21]:
print("New DataFrame rows:", combined_df_long.shape[0])
print("Expected New DataFrame rows:", combined_df.shape[0] * 24)

New DataFrame rows: 91068432
Expected New DataFrame rows: 91068432


In [22]:
print("NaN in DateTime:", combined_df_long['DateTime'].isnull().sum())
print("NaN in Power_Consumption:", combined_df_long['Power_Consumption'].isnull().sum())

NaN in DateTime: 0
NaN in Power_Consumption: 0


In [23]:
unique_dates_hours = combined_df_long['DateTime'].nunique()
print("Unique date-hour combinations:", unique_dates_hours)

Unique date-hour combinations: 35064


# 2) Merge the reshaped power dataframe with weather dataframe and price dataframe

## power dataframe: combined_df_long.csv -> power_df
## weather dataframe: final_combined_weather_data.csv -> weather_df
## price dataframe: combined_prices_df.csv -> price_df

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
# For the power dataset
filename = 'data/combined_df_long.csv'
power_df = pd.read_csv(filename, sep=',')

In [20]:
power_df

Unnamed: 0,DateTime,CUSTOMER,AREA,ISPRIVATEPERSON,Power_Consumption,One_Day_Power,One_Day_Power_NaN
0,2020-01-01 00:00:00,1060598736,Kvarnholmen,Nej,0.011200,0.260400,0
1,2020-01-01 00:00:00,1060753918,Malmen,Nej,0.001797,0.054926,0
2,2020-01-01 00:00:00,1060753924,Malmen,Nej,0.021600,0.564000,0
3,2020-01-01 00:00:00,1060753932,Malmen,Nej,0.004122,0.107658,0
4,2020-01-01 00:00:00,1060753945,Malmen,Nej,0.001120,0.027033,0
...,...,...,...,...,...,...,...
91068427,2023-12-31 23:00:00,1060619147,Malmen,Ja,0.000101,0.003008,0
91068428,2023-12-31 23:00:00,1060619139,Malmen,Nej,0.000028,0.000687,0
91068429,2023-12-31 23:00:00,1060619133,Malmen,Ja,0.000071,0.004211,0
91068430,2023-12-31 23:00:00,1060619347,Malmen,Ja,0.000376,0.017268,0


In [13]:

# For the weather dataset
filename = 'data/final_combined_weather_data.csv'
weather_df = pd.read_csv(filename, sep=';')

In [14]:
weather_df

Unnamed: 0,DateTime,Air Temperature 1 Min_P19,Air Temperature 2 Max_P27,Air Temperature 2 Min_P26,Dew Point Temperature_P39,Precipitation_P7,Relative Humidity_P6,Wind Speed_P4
0,2020-01-01 00:00:00,,,,-2.4,,80.0,5.0
1,2020-01-01 01:00:00,,,,-2.3,,84.0,3.0
2,2020-01-01 02:00:00,,,,-0.9,,90.0,4.0
3,2020-01-01 03:00:00,,,,-0.4,,90.0,5.0
4,2020-01-01 04:00:00,,,,0.1,,95.0,4.0
...,...,...,...,...,...,...,...,...
33503,2023-12-01 02:00:00,,,,-2.0,,98.0,6.0
33504,2023-12-01 03:00:00,,,,-2.0,,98.0,6.0
33505,2023-12-01 04:00:00,,,,-2.0,,98.0,6.0
33506,2023-12-01 05:00:00,,,,-2.2,,97.0,6.0


In [15]:
# For the price dataset
filename = 'data/combined_prices_df.csv'
price_df = pd.read_csv(filename, sep=',', index_col=0)

In [16]:
price_df

Unnamed: 0,DateTime,Price
0,2020-01-01 00:00:00,28.45
1,2020-01-01 01:00:00,27.90
2,2020-01-01 02:00:00,27.52
3,2020-01-01 03:00:00,27.54
4,2020-01-01 04:00:00,26.55
...,...,...
35059,2023-12-31 19:00:00,34.89
35060,2023-12-31 20:00:00,29.60
35061,2023-12-31 21:00:00,28.67
35062,2023-12-31 22:00:00,26.87


# 3) Merge

In [21]:
from functools import reduce

# List of DataFrames to merge
dataframes = [power_df, weather_df, price_df]

# Merge all DataFrames on 'DateTime'
final_combined_df = reduce(lambda left, right: pd.merge(left, right, on='DateTime', how='outer'), dataframes)

In [22]:
final_combined_df

Unnamed: 0,DateTime,CUSTOMER,AREA,ISPRIVATEPERSON,Power_Consumption,One_Day_Power,One_Day_Power_NaN,Air Temperature 1 Min_P19,Air Temperature 2 Max_P27,Air Temperature 2 Min_P26,Dew Point Temperature_P39,Precipitation_P7,Relative Humidity_P6,Wind Speed_P4,Price
0,2020-01-01 00:00:00,1060598736,Kvarnholmen,Nej,0.011200,0.260400,0,,,,-2.4,,80.0,5.0,28.45
1,2020-01-01 00:00:00,1060753918,Malmen,Nej,0.001797,0.054926,0,,,,-2.4,,80.0,5.0,28.45
2,2020-01-01 00:00:00,1060753924,Malmen,Nej,0.021600,0.564000,0,,,,-2.4,,80.0,5.0,28.45
3,2020-01-01 00:00:00,1060753932,Malmen,Nej,0.004122,0.107658,0,,,,-2.4,,80.0,5.0,28.45
4,2020-01-01 00:00:00,1060753945,Malmen,Nej,0.001120,0.027033,0,,,,-2.4,,80.0,5.0,28.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91068427,2023-12-31 23:00:00,1060619147,Malmen,Ja,0.000101,0.003008,0,,,,,,,,29.56
91068428,2023-12-31 23:00:00,1060619139,Malmen,Nej,0.000028,0.000687,0,,,,,,,,29.56
91068429,2023-12-31 23:00:00,1060619133,Malmen,Ja,0.000071,0.004211,0,,,,,,,,29.56
91068430,2023-12-31 23:00:00,1060619347,Malmen,Ja,0.000376,0.017268,0,,,,,,,,29.56


In [23]:
final_combined_df.to_csv('data/final_df.csv')

# 4) Read final df to see the number(percentage) of rows which has Nan values due to the weather parameters (get statisctics for doing later PCA)

# 

In [1]:
import pandas as pd
# For the final df
filename = 'data/final_df.csv'
final_df = pd.read_csv(filename, sep=',', index_col=0)

In [2]:
# Check for NaN values and sum them up for each column
nan_counts = final_df.isna().sum()

# Display the count of NaN values for each column
print(nan_counts)

# If you want to see a percentage of missing values for each column, you can do:
nan_percentage = (final_df.isna().sum() / len(final_df)) * 100
print(nan_percentage)

DateTime                            0
CUSTOMER                            0
AREA                                0
ISPRIVATEPERSON                     0
Power_Consumption                   0
One_Day_Power                       0
One_Day_Power_NaN                   0
Air Temperature 1 Min_P19    87485460
Air Temperature 2 Max_P27    84098357
Air Temperature 2 Min_P26    84098357
Dew Point Temperature_P39     8017478
Precipitation_P7             90674338
Relative Humidity_P6          8024143
Wind Speed_P4                 7021779
Price                               0
dtype: int64
DateTime                      0.000000
CUSTOMER                      0.000000
AREA                          0.000000
ISPRIVATEPERSON               0.000000
Power_Consumption             0.000000
One_Day_Power                 0.000000
One_Day_Power_NaN             0.000000
Air Temperature 1 Min_P19    96.065627
Air Temperature 2 Max_P27    92.346332
Air Temperature 2 Min_P26    92.346332
Dew Point Temperature_P39  

In [3]:
# To find out how many rows contain at least one NaN value:
rows_with_nan = final_df.isna().any(axis=1).sum()
print(f"\nTotal number of rows containing at least one NaN value: {rows_with_nan}")


Total number of rows containing at least one NaN value: 91057066


In [4]:

# Define the columns you are interested in
selected_columns = ['Power_Consumption', 'Dew Point Temperature_P39', 'Relative Humidity_P6', 'Wind Speed_P4', 'Price']

# Check for NaN values and sum them up for each column in the selected columns
nan_counts_selected = final_df[selected_columns].isna().sum()

# Display the count of NaN values for each selected column
print("NaN counts for each selected column:")
print(nan_counts_selected)

# If you want to see a percentage of missing values for each of the selected columns, you can do:
nan_percentage_selected = (final_df[selected_columns].isna().sum() / len(final_df)) * 100
print("\nPercentage of NaN values for each selected column:")
print(nan_percentage_selected)

# To find out how many rows contain at least one NaN value in the selected columns:
rows_with_nan_selected = final_df[selected_columns].isna().any(axis=1).sum()
print(f"\nTotal number of rows containing at least one NaN value in selected columns: {rows_with_nan_selected}")



NaN counts for each selected column:
Power_Consumption                   0
Dew Point Temperature_P39     8017478
Precipitation_P7             90674338
Relative Humidity_P6          8024143
Wind Speed_P4                 7021779
Price                               0
dtype: int64

Percentage of NaN values for each selected column:
Power_Consumption             0.000000
Dew Point Temperature_P39     8.803795
Precipitation_P7             99.567255
Relative Humidity_P6          8.811114
Wind Speed_P4                 7.710442
Price                         0.000000
dtype: float64

Total number of rows containing at least one NaN value in selected columns: 90719584


In [5]:
for column in ['Dew Point Temperature_P39', 'Relative Humidity_P6', 'Wind Speed_P4']:
    final_df[column] = final_df[column].fillna(final_df[column].median())

In [6]:
# Display the count of NaN values for each selected column
print("NaN counts for each selected column:")
print(nan_counts_selected)

# If you want to see a percentage of missing values for each of the selected columns, you can do:
nan_percentage_selected = (final_df[selected_columns].isna().sum() / len(final_df)) * 100
print("\nPercentage of NaN values for each selected column:")
print(nan_percentage_selected)

# To find out how many rows contain at least one NaN value in the selected columns:
rows_with_nan_selected = final_df[selected_columns].isna().any(axis=1).sum()
print(f"\nTotal number of rows containing at least one NaN value in selected columns: {rows_with_nan_selected}")

NaN counts for each selected column:
Power_Consumption                   0
Dew Point Temperature_P39     8017478
Precipitation_P7             90674338
Relative Humidity_P6          8024143
Wind Speed_P4                 7021779
Price                               0
dtype: int64

Percentage of NaN values for each selected column:
Power_Consumption             0.000000
Dew Point Temperature_P39     0.000000
Precipitation_P7             99.567255
Relative Humidity_P6          0.000000
Wind Speed_P4                 0.000000
Price                         0.000000
dtype: float64

Total number of rows containing at least one NaN value in selected columns: 90674338
