In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('data/commercial_df_2021.csv', index_col=0) # Remove the unnamed column using index_col=0 instead of the command #df = df.iloc[:, 1:]

# Drop columns that are not needed for training
columns_to_drop = ['One_Day_Power', 'One_Day_Power_NaN', 'Air Temperature 1 Min_P19','Air Temperature 2 Max_P27', 'Air Temperature 2 Min_P26', 'Precipitation_P7']
df = df.drop(columns=columns_to_drop)

# Rename columns
df = df.rename(columns={'Dew Point Temperature_P39': 'Temperature', 
                         'Relative Humidity_P6': 'Humidity',
                         'Wind Speed_P4': 'WindSpeed',
                         'Power_Consumption': 'PowerConsumption'
                         # Add more renaming mappings as needed
                        })

# Add an ID column
df.insert(0, 'ID', range(1, len(df) + 1))  # Insert ID column as the first column

# Remove the first unnamed column
#df = df.iloc[:, 1:]

# Write the DataFrame back to a new CSV file without the first unnamed column
#df.to_csv('data/commercial_df_2021_new.csv', index=False)


In [2]:
df

Unnamed: 0,ID,DateTime,CUSTOMER,AREA,ISPRIVATEPERSON,PowerConsumption,Temperature,Humidity,WindSpeed,Price
6944448,1,2021-01-01 00:00:00,1060753805,Kvarnholmen,Nej,0.001000,4.8,88.0,3.0,24.35
6944449,2,2021-01-01 00:00:00,1060766019,Malmen,Nej,0.003267,4.8,88.0,3.0,24.35
6944451,3,2021-01-01 00:00:00,1060616621,Malmen,Nej,0.007463,4.8,88.0,3.0,24.35
6944452,4,2021-01-01 00:00:00,1060621516,Malmen,Nej,0.000147,4.8,88.0,3.0,24.35
6944453,5,2021-01-01 00:00:00,1060601163,Kvarnholmen,Nej,0.004140,4.8,88.0,3.0,24.35
...,...,...,...,...,...,...,...,...,...,...
14619619,5863076,2021-12-31 23:00:00,1060598838,Kvarnholmen,Nej,0.000558,5.3,86.0,5.0,46.60
14619620,5863077,2021-12-31 23:00:00,1060598788,Kvarnholmen,Nej,0.000424,5.3,86.0,5.0,46.60
14619621,5863078,2021-12-31 23:00:00,1060604204,Kvarnholmen,Nej,0.000432,5.3,86.0,5.0,46.60
14619622,5863079,2021-12-31 23:00:00,1060599041,Kvarnholmen,Nej,0.005087,5.3,86.0,5.0,46.60


In [3]:
df.shape[0]

5863080

# Make hourly aggregation of data, expected 365*24 = 8760 data points

In [4]:
df['DateTime'] = pd.to_datetime(df['DateTime'])
df.set_index('DateTime', inplace=True)

In [5]:
commercial_df_2021_resampled = df.resample('H').agg({'PowerConsumption': 'sum', 'Price': 'mean', 'Temperature':'mean'})

  commercial_df_2021_resampled = df.resample('H').agg({'PowerConsumption': 'sum', 'Price': 'mean', 'Temperature':'mean'})


In [6]:
commercial_df_2021_resampled

Unnamed: 0_level_0,PowerConsumption,Price,Temperature
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01 00:00:00,4.434627,24.35,4.8
2021-01-01 01:00:00,4.375336,23.98,4.8
2021-01-01 02:00:00,4.413750,23.72,4.8
2021-01-01 03:00:00,4.368521,23.73,4.8
2021-01-01 04:00:00,4.421918,24.06,4.8
...,...,...,...
2021-12-31 19:00:00,5.475141,33.96,6.6
2021-12-31 20:00:00,5.303456,33.08,6.0
2021-12-31 21:00:00,5.109125,32.34,5.8
2021-12-31 22:00:00,4.861598,29.76,5.5


In [7]:
# Reset index to make DateTime a column again
commercial_df_2021_resampled.reset_index(inplace=True)

# Assign new IDs
commercial_df_2021_resampled['ID'] = range(1, len(commercial_df_2021_resampled) + 1)



In [8]:
commercial_df_2021_resampled

Unnamed: 0,DateTime,PowerConsumption,Price,Temperature,ID
0,2021-01-01 00:00:00,4.434627,24.35,4.8,1
1,2021-01-01 01:00:00,4.375336,23.98,4.8,2
2,2021-01-01 02:00:00,4.413750,23.72,4.8,3
3,2021-01-01 03:00:00,4.368521,23.73,4.8,4
4,2021-01-01 04:00:00,4.421918,24.06,4.8,5
...,...,...,...,...,...
8755,2021-12-31 19:00:00,5.475141,33.96,6.6,8756
8756,2021-12-31 20:00:00,5.303456,33.08,6.0,8757
8757,2021-12-31 21:00:00,5.109125,32.34,5.8,8758
8758,2021-12-31 22:00:00,4.861598,29.76,5.5,8759


In [None]:
# Save DataFrame to CSV file
commercial_df_2021_resampled.to_csv('data/commercial_df_2021_resampled_vertexai.csv', index=False, columns=['ID', 'DateTime', 'PowerConsumption', 'Price', 'Temperature'])

# Make daily aggregation of data (expected 365 data points (days))

In [9]:
commercial_df_2021_resampled = df.resample('D').agg({'PowerConsumption': 'sum', 'Price': 'mean', 'Temperature':'mean'})

In [10]:
# Reset index to make DateTime a column again
commercial_df_2021_resampled.reset_index(inplace=True)

# Assign new IDs
commercial_df_2021_resampled['ID'] = range(1, len(commercial_df_2021_resampled) + 1)

In [11]:
commercial_df_2021_resampled

Unnamed: 0,DateTime,PowerConsumption,Price,Temperature,ID
0,2021-01-01,129.406048,36.262917,4.133333,1
1,2021-01-02,130.231185,43.159167,1.345833,2
2,2021-01-03,127.118719,27.283333,0.658333,3
3,2021-01-04,151.340661,44.015833,-0.304167,4
4,2021-01-05,147.821185,47.370833,-0.379167,5
...,...,...,...,...,...
360,2021-12-27,163.020006,148.466667,-11.650000,361
361,2021-12-28,164.103905,111.252083,-2.041667,362
362,2021-12-29,161.262945,153.792083,0.454167,363
363,2021-12-30,155.425769,68.722500,3.275000,364


In [12]:
# Save DataFrame to CSV file
commercial_df_2021_resampled.to_csv('data/commercial_df_2021_resampled_vertexai_daily.csv', index=False, columns=['ID', 'DateTime', 'PowerConsumption', 'Price', 'Temperature'])

# Since AutoML on VertexAI needs ateleast 1000 data points for training the model, we consider all 4 years now for forecasting (which will be 4*365 = 1460)

In [13]:
df  = pd.read_csv('data/modified_final_df.csv', sep=',', index_col=0)

In [14]:
df.shape[0]

91068432

In [15]:
commercial_df = df[df['ISPRIVATEPERSON'] == 'Nej']

In [17]:
commercial_df.shape[0]


30568272

In [21]:
# Drop columns that are not needed for training
columns_to_drop = ['One_Day_Power', 'One_Day_Power_NaN', 'Air Temperature 1 Min_P19','Air Temperature 2 Max_P27', 'Air Temperature 2 Min_P26', 'Precipitation_P7']
commercial_df = commercial_df.drop(columns=columns_to_drop)

# Rename columns
commercial_df = commercial_df.rename(columns={'Dew Point Temperature_P39': 'Temperature', 
                         'Relative Humidity_P6': 'Humidity',
                         'Wind Speed_P4': 'WindSpeed',
                         'Power_Consumption': 'PowerConsumption'
                         # Add more renaming mappings as needed
                        })

# Add an ID column
commercial_df.insert(0, 'ID', range(1, len(df) + 1))  # Insert ID column as the first column

In [22]:
commercial_df

Unnamed: 0,ID,DateTime,CUSTOMER,AREA,ISPRIVATEPERSON,PowerConsumption,Temperature,Humidity,WindSpeed,Price
0,1,2020-01-01 00:00:00,1060598736,Kvarnholmen,Nej,0.011200,-2.4,80.0,5.0,28.45
1,2,2020-01-01 00:00:00,1060753918,Malmen,Nej,0.001797,-2.4,80.0,5.0,28.45
2,3,2020-01-01 00:00:00,1060753924,Malmen,Nej,0.021600,-2.4,80.0,5.0,28.45
3,4,2020-01-01 00:00:00,1060753932,Malmen,Nej,0.004122,-2.4,80.0,5.0,28.45
4,5,2020-01-01 00:00:00,1060753945,Malmen,Nej,0.001120,-2.4,80.0,5.0,28.45
...,...,...,...,...,...,...,...,...,...,...
91068410,30568268,2023-12-31 23:00:00,1060619312,Malmen,Nej,0.006591,4.8,88.0,3.0,29.56
91068411,30568269,2023-12-31 23:00:00,1060619299,Malmen,Nej,0.020784,4.8,88.0,3.0,29.56
91068422,30568270,2023-12-31 23:00:00,1060619200,Malmen,Nej,0.000412,4.8,88.0,3.0,29.56
91068428,30568271,2023-12-31 23:00:00,1060619139,Malmen,Nej,0.000028,4.8,88.0,3.0,29.56


# Make daily aggregation of data (expected 4*365 datapoints)

In [23]:
commercial_df['DateTime'] = pd.to_datetime(commercial_df['DateTime'])
commercial_df.set_index('DateTime', inplace=True)

In [24]:
commercial_df

Unnamed: 0_level_0,ID,CUSTOMER,AREA,ISPRIVATEPERSON,PowerConsumption,Temperature,Humidity,WindSpeed,Price
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-01 00:00:00,1,1060598736,Kvarnholmen,Nej,0.011200,-2.4,80.0,5.0,28.45
2020-01-01 00:00:00,2,1060753918,Malmen,Nej,0.001797,-2.4,80.0,5.0,28.45
2020-01-01 00:00:00,3,1060753924,Malmen,Nej,0.021600,-2.4,80.0,5.0,28.45
2020-01-01 00:00:00,4,1060753932,Malmen,Nej,0.004122,-2.4,80.0,5.0,28.45
2020-01-01 00:00:00,5,1060753945,Malmen,Nej,0.001120,-2.4,80.0,5.0,28.45
...,...,...,...,...,...,...,...,...,...
2023-12-31 23:00:00,30568268,1060619312,Malmen,Nej,0.006591,4.8,88.0,3.0,29.56
2023-12-31 23:00:00,30568269,1060619299,Malmen,Nej,0.020784,4.8,88.0,3.0,29.56
2023-12-31 23:00:00,30568270,1060619200,Malmen,Nej,0.000412,4.8,88.0,3.0,29.56
2023-12-31 23:00:00,30568271,1060619139,Malmen,Nej,0.000028,4.8,88.0,3.0,29.56


In [26]:
commercial_df_resampled = commercial_df.resample('D').agg({'PowerConsumption': 'sum', 'Price': 'mean', 'Temperature':'mean'})

In [27]:
# Reset index to make DateTime a column again
commercial_df_resampled.reset_index(inplace=True)

# Assign new IDs
commercial_df_resampled['ID'] = range(1, len(commercial_df_resampled) + 1)

In [28]:
commercial_df_resampled

Unnamed: 0,DateTime,PowerConsumption,Price,Temperature,ID
0,2020-01-01,125.153777,27.040417,0.991667,1
1,2020-01-02,148.917767,27.050833,0.650000,2
2,2020-01-03,148.034055,22.752917,2.612500,3
3,2020-01-04,133.919142,21.372083,-2.400000,4
4,2020-01-05,129.457823,29.641250,-3.183333,5
...,...,...,...,...,...
1456,2023-12-27,230.203512,55.553333,4.800000,1457
1457,2023-12-28,224.203430,35.892500,4.800000,1458
1458,2023-12-29,218.482716,27.819167,4.800000,1459
1459,2023-12-30,195.574116,42.055833,4.800000,1460


In [29]:
# Save DataFrame to CSV file
commercial_df_resampled.to_csv('data/commercial_df_resampled_vertexai_daily.csv', index=False, columns=['ID', 'DateTime', 'PowerConsumption', 'Price', 'Temperature'])