In [1]:
import pandas as pd
import pytz
import numpy as np
import os
from sklearn import preprocessing
import re
import matplotlib
from matplotlib.patches import Polygon, Rectangle
matplotlib.use('Qt5Agg')
from datetime import timedelta
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from casadi import *
import calendar
import casadi as cd
from sklearn.linear_model import LinearRegression
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')

In [2]:
def custom_date_parser(date_string):
    return pd.to_datetime(date_string, format='%d-%m-%Y %H:%M:%S')

# Specify the path to the main directory containing folders and files
path = 'D:\\mlinternship\\iitgdata'
folders = [folder for folder in os.listdir(path) if os.path.isdir(os.path.join(path, folder))]
df_list = []

# Iterate through each folder
for folder in folders:
    # Construct the full path to the current folder
    folder_path = os.path.join(path, folder)
    # Iterate through files in the current folder
    for filename in os.listdir(folder_path):
        # Check if the file has the '.xlsx' extension
        if filename.endswith('.xlsx'):
            # Construct the full path to the Excel file
            file_path = os.path.join(folder_path, filename)
            # Use the custom date parser function
            df = pd.read_excel(file_path, header=3, date_parser=custom_date_parser)
            # Append the dataframe to the list
            df_list.append(df)

bill_path = 'D:\\mlinternship\\IITGuwahatiElectricityBills'
for filename in os.listdir(bill_path):
    # Check if the file has the '.xlsx' extension
    if filename.endswith('.xlsx'):
        file_path = os.path.join(bill_path, filename)
        bill_df = pd.read_excel(file_path)

bill_df['Month'] = pd.to_datetime(bill_df['Month'])
#I assumed its in kilowatts
bill_df['MW'] = bill_df['Number of units of electricity consumed']/1000
bill_df.drop(['Number of units of electricity consumed'], axis=1)
bill_df

Unnamed: 0,Month,MW
0,2022-01-01,1415.4246
1,2022-02-01,1353.426
2,2022-03-01,2254.4145
3,2022-04-01,2269.101
4,2022-05-01,2690.415
5,2022-06-01,2708.491
6,2022-07-01,3113.6625
7,2022-08-01,3606.0495
8,2022-09-01,3335.319
9,2022-10-01,2787.5835


Unnamed: 0,Month,Number of units of electricity consumed,MW
0,2022-01-01,1415424.6,1415.4246
1,2022-02-01,1353426.0,1353.426
2,2022-03-01,2254414.5,2254.4145
3,2022-04-01,2269101.0,2269.101
4,2022-05-01,2690415.0,2690.415
5,2022-06-01,2708491.0,2708.491
6,2022-07-01,3113662.5,3113.6625
7,2022-08-01,3606049.5,3606.0495
8,2022-09-01,3335319.0,3335.319
9,2022-10-01,2787583.5,2787.5835


In [3]:
temperature_data_path = 'D:\\mlinternship\\iitgdata\\temperaturedata\\report'
temperature_df_list = []
for filename in os.listdir(temperature_data_path):
        # Check if the file has the '.xlsx' extension
        if filename.endswith('.xlsx'):
            # Construct the full path to the Excel file
            file_path = os.path.join(temperature_data_path, filename)
            # Use the custom date parser function
            df = pd.read_excel(file_path, header=18, date_parser=custom_date_parser)
            df = df[['DATE(YYYY-MM-DD)', 'TIME (UTC)', "TEMP. ('C)"]]
            # Append the dataframe to the list
            temperature_df_list.append(df)
temperature_df = pd.concat(temperature_df_list, ignore_index=True)
temperature_df['Time'] = pd.to_datetime(temperature_df['DATE(YYYY-MM-DD)'] + ' ' + temperature_df['TIME (UTC)'])
# Rename the 'TEMP. ('C)' column to 'temperature'
temperature_df.rename(columns={"TEMP. ('C)": 'temperature'}, inplace=True)
# Drop the 'DATE(YYYY-MM-DD)' and 'TIME (UTC)' columns from temperature_df
temperature_df = temperature_df.drop(['DATE(YYYY-MM-DD)', 'TIME (UTC)'], axis=1)
temperature_df = temperature_df.sort_values(by='Time')
temperature_df.reset_index(drop = True, inplace = True)
temperature_df

Unnamed: 0,temperature,Time
0,22.0,2023-05-03 00:00:00
1,22.3,2023-05-03 00:15:00
2,22.5,2023-05-03 00:30:00
3,23.2,2023-05-03 00:45:00
4,23.1,2023-05-03 01:00:00
...,...,...
4423,26.3,2023-06-27 22:45:00
4424,26.2,2023-06-27 23:00:00
4425,26.2,2023-06-27 23:15:00
4426,26.3,2023-06-27 23:30:00


In [4]:
#read the power data
power_df = pd.concat(df_list, ignore_index=True)

power_df['Time'] = pd.to_datetime(power_df['Time'])
power_df['Time'] = power_df['Time'].round('min')
#replace all the 'NR' values in MW column to NaN
power_df['MW'] = power_df['MW'].replace('NR', np.nan)
power_df['MW'] = power_df['MW'].replace('nr', np.nan)
power_df = power_df[['Time', 'MW']]
power_df['MW'] = power_df['MW'].astype(str)
power_df['MW'] = pd.to_numeric(power_df['MW'].str.replace(',', '.'), errors='coerce')
power_df['Time'] = pd.to_datetime(power_df['Time'])
power_df = power_df.sort_values('Time')
power_df.to_csv('power_datacsv.csv')
full_power_df = power_df.copy()

# read the temperature data csv
'''
temperature_data_csv_path = 'D:\\mlinternship\\iitgdata\\temperaturedata'
filename = 'guwahati_temperature_data.csv'
file = os.path.join(temperature_data_csv_path, filename)
temperature_df = pd.read_csv(file)
temperature_df.rename(columns={'valid': 'Time'}, inplace = True)
temperature_df = temperature_df.rename(columns={'tmpc': 'temperature'})
temperature_df = temperature_df[['Time', 'temperature']]
temperature_df['Time'] = pd.to_datetime(temperature_df['Time'])
temperature_df['Time'] = pd.DatetimeIndex(temperature_df['Time']) + timedelta(hours=5,minutes=30)
temperature_df['temperature'] = pd.to_numeric(temperature_df['temperature'], errors='coerce')
temperature_df.set_index('Time', inplace=True)
temperature_df['temperature'] = temperature_df['temperature'].interpolate(method='polynomial', order = 5)
temperature_df.reset_index(inplace=True)
'''
# joining the two dataframes such that the temperature data is only taken if there exists a reading in the power data dataframe
df = pd.merge(power_df, temperature_df, on='Time', how='left')
df['temperature'] = df['temperature'].interpolate(method='polynomial', order = 5)
df['Time'] = pd.to_datetime(df['Time'])

full_model_part1_start_time = pd.Timestamp('2023-05-03 00:00:00')
full_model_part1_end_time = pd.Timestamp('2023-06-06 23:00:00')
full_model_part2_start_time =  pd.Timestamp('2023-06-16 00:00:00')
full_model_part2_end_time = pd.Timestamp('2023-06-27 23:00:00')
df = df[((df['Time'] >= full_model_part1_start_time) & (df['Time'] <= full_model_part1_end_time)) | ((df['Time'] >= full_model_part2_start_time) & (df['Time'] <= full_model_part2_end_time)) ]
df = df.drop(df[df['MW'] > 20].index)
df = df.sort_values('Time')
df.reset_index(drop=True)
df

"\ntemperature_data_csv_path = 'D:\\mlinternship\\iitgdata\\temperaturedata'\nfilename = 'guwahati_temperature_data.csv'\nfile = os.path.join(temperature_data_csv_path, filename)\ntemperature_df = pd.read_csv(file)\ntemperature_df.rename(columns={'valid': 'Time'}, inplace = True)\ntemperature_df = temperature_df.rename(columns={'tmpc': 'temperature'})\ntemperature_df = temperature_df[['Time', 'temperature']]\ntemperature_df['Time'] = pd.to_datetime(temperature_df['Time'])\ntemperature_df['Time'] = pd.DatetimeIndex(temperature_df['Time']) + timedelta(hours=5,minutes=30)\ntemperature_df['temperature'] = pd.to_numeric(temperature_df['temperature'], errors='coerce')\ntemperature_df.set_index('Time', inplace=True)\ntemperature_df['temperature'] = temperature_df['temperature'].interpolate(method='polynomial', order = 5)\ntemperature_df.reset_index(inplace=True)\n"

Unnamed: 0,Time,MW,temperature
0,2023-05-03 00:00:00,3.0,22.000000
1,2023-05-03 01:00:00,,23.100000
2,2023-05-03 02:00:00,,25.800000
3,2023-05-03 03:00:00,2.5,27.800000
4,2023-05-03 04:00:00,,29.500000
...,...,...,...
1124,2023-06-27 19:00:00,5.7,26.900000
1125,2023-06-27 20:00:00,5.6,26.700000
1126,2023-06-27 21:00:00,5.3,26.504635
1127,2023-06-27 22:00:00,4.8,26.600000


Unnamed: 0,Time,MW,temperature
7175,2023-05-03 00:00:00,3.0,22.000000
7176,2023-05-03 01:00:00,,23.100000
7177,2023-05-03 02:00:00,,25.800000
7178,2023-05-03 03:00:00,2.5,27.800000
7179,2023-05-03 04:00:00,,29.500000
...,...,...,...
8515,2023-06-27 19:00:00,5.7,26.900000
8516,2023-06-27 20:00:00,5.6,26.700000
8517,2023-06-27 21:00:00,5.3,26.504635
8518,2023-06-27 22:00:00,4.8,26.600000


In [5]:
TcoolStPt = 31
CDH = df['temperature'] - TcoolStPt
CDH.clip(lower=0, inplace=True)
CDH = pd.DataFrame(data=CDH.values, columns=['CDH'], index=df.index)
# Concatenate CDH with the original DataFrame using the index
df = pd.concat([df, CDH], axis=1)
df = df.sort_values('Time')
df.reset_index(inplace=True, drop = True)

numOmegas = 24 * 7
num_of_rows = df.shape[0]
omegas = np.zeros((num_of_rows, numOmegas))  # Assuming numOmegas columns for omegas
concatenated_data = np.concatenate((df, omegas), axis=1)
column_names = ['Time', 'MW', 'temperature', 'CDH']
for i in range(1, numOmegas + 1,1):
    column_names.append('omega' + str(i))

df = pd.DataFrame(concatenated_data, columns=column_names)
df['Time'] = pd.to_datetime(df['Time'])
for i in range(0,num_of_rows):
        datetime = df.Time.loc[i]
        hourOfWeekIndex = int(datetime.dayofweek*24+(datetime.hour+1))
        x = np.zeros((1,numOmegas))
        x[0,hourOfWeekIndex-1]=1
        omegas[i,:]=x

df.iloc[:,4:]=omegas
DF = df.copy()
df = df.dropna()
df.reset_index(inplace=True, drop = True)
df


Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega159,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168
0,2023-05-03 00:00:00,3.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-05-03 03:00:00,2.5,27.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-05-03 05:00:00,3.0,31.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-05-03 06:00:00,3.5,33.1,2.1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-05-03 07:00:00,3.75,29.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,2023-06-27 19:00:00,5.7,26.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
886,2023-06-27 20:00:00,5.6,26.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,2023-06-27 21:00:00,5.3,26.504635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,2023-06-27 22:00:00,4.8,26.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
plt.figure(figsize=(10, 5))
plt.ion()
plt.plot(full_power_df['Time'], full_power_df['MW'], marker='o', linestyle='-', color='r', label='MW')
plt.xlabel('Time')
plt.xticks(rotation=45)
ax = plt.gca()
ax.set_facecolor('white')
ax.set_title('power vs time')
plt.legend()
plt.tight_layout()
plt.show()

<Figure size 1000x500 with 0 Axes>

<contextlib.ExitStack at 0x10b20a04e50>

[<matplotlib.lines.Line2D at 0x10b20ebad90>]

Text(0.5, 0, 'Time')

(array([18993., 19052., 19113., 19174., 19236., 19297., 19358., 19417.,
        19478., 19539.]),
 [Text(18993.0, 0, '2022-01'),
  Text(19052.0, 0, '2022-03'),
  Text(19113.0, 0, '2022-05'),
  Text(19174.0, 0, '2022-07'),
  Text(19236.0, 0, '2022-09'),
  Text(19297.0, 0, '2022-11'),
  Text(19358.0, 0, '2023-01'),
  Text(19417.0, 0, '2023-03'),
  Text(19478.0, 0, '2023-05'),
  Text(19539.0, 0, '2023-07')])

Text(0.5, 1.0, 'power vs time')

<matplotlib.legend.Legend at 0x10b20e9a090>