In [1]:
import pandas as pd
import pytz
import numpy as np
import os
from sklearn import preprocessing
import re
import matplotlib
matplotlib.use('Qt5Agg')
from datetime import timedelta
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

In [2]:
def custom_date_parser(date_string):
    return pd.to_datetime(date_string, format='%d-%m-%Y %H:%M:%S')

# Specify the path to the main directory containing folders and files
path = 'D:\\mlinternship\\iitgdata'
folders = [folder for folder in os.listdir(path) if os.path.isdir(os.path.join(path, folder))]
df_list = []

# Iterate through each folder
for folder in folders:
    # Construct the full path to the current folder
    folder_path = os.path.join(path, folder)

    # Iterate through files in the current folder
    for filename in os.listdir(folder_path):
        # Check if the file has the '.xlsx' extension
        if filename.endswith('.xlsx'):
            # Construct the full path to the Excel file
            file_path = os.path.join(folder_path, filename)

            # Use the custom date parser function
            df = pd.read_excel(file_path, header=3, date_parser=custom_date_parser)

            # Append the dataframe to the list
            df_list.append(df)


In [3]:
power_df = pd.concat(df_list, ignore_index=True)
power_df.to_csv('power_datacsv.csv')
power_df['Time'] = pd.to_datetime(power_df['Time'])
power_df['Time'] = power_df['Time'].round('min')
#replace all the 'NR' values in MW column to NaN
power_df['MW'] = power_df['MW'].replace('NR', np.nan)
power_df['MW'] = power_df['MW'].replace('nr', np.nan)
full_power_df = power_df.copy()
# drop the rest of the columns that are not required and set index to Time
power_df = power_df[['Time', 'MW']]
#power_df.dropna(subset=['Time'], inplace=True)
power_df.dropna(inplace = True)
power_df['MW'] = power_df['MW'].astype(str)
# Remove commas from 'CDH' column and convert to numeric
power_df['MW'] = pd.to_numeric(power_df['MW'].str.replace(',', '.'), errors='coerce')
power_df['Time'] = pd.to_datetime(power_df['Time'])
# Now, 'Time' column is in UTC timezone format
power_df = power_df.sort_values('Time')

In [4]:
power_df

Unnamed: 0,Time,MW
2933,2022-01-21 02:00:00,2.5
2957,2022-01-22 02:00:00,2.1
3005,2022-01-24 02:00:00,2.4
3101,2022-01-28 02:00:00,2.0
1494,2022-02-01 06:00:00,1.7
...,...,...
4774,2023-06-27 19:00:00,5.7
4775,2023-06-27 20:00:00,5.6
4776,2023-06-27 21:00:00,5.3
4777,2023-06-27 22:00:00,4.8


In [5]:
plt.figure(figsize=(10, 5))
plt.ion()
plt.plot(power_df['Time'], power_df['MW'], marker='o', linestyle='-', color='r', label='MW')

plt.xlabel('Time')

plt.xticks(rotation=45)

ax = plt.gca()
ax.set_facecolor('white')
ax.set_title('power vs time (not normalized)')
plt.legend()
plt.tight_layout()

plt.show()

<Figure size 1000x500 with 0 Axes>

<contextlib.ExitStack at 0x209757bab50>

[<matplotlib.lines.Line2D at 0x2097567f110>]

Text(0.5, 0, 'Time')

(array([18993., 19052., 19113., 19174., 19236., 19297., 19358., 19417.,
        19478., 19539.]),
 [Text(18993.0, 0, '2022-01'),
  Text(19052.0, 0, '2022-03'),
  Text(19113.0, 0, '2022-05'),
  Text(19174.0, 0, '2022-07'),
  Text(19236.0, 0, '2022-09'),
  Text(19297.0, 0, '2022-11'),
  Text(19358.0, 0, '2023-01'),
  Text(19417.0, 0, '2023-03'),
  Text(19478.0, 0, '2023-05'),
  Text(19539.0, 0, '2023-07')])

Text(0.5, 1.0, 'power vs time (not normalized)')

<matplotlib.legend.Legend at 0x209758fff90>

In [6]:
# read the temperature data csv
temperature_data_csv_path = 'D:\\mlinternship\\iitgdata\\temperaturedata'
filename = 'guwahati_temperature_data.csv'
file = os.path.join(temperature_data_csv_path, filename)
temperature_df = pd.read_csv(file)
#drop the unwanted columns and change column name
temperature_df.rename(columns={'valid': 'Time'}, inplace = True)
temperature_df = temperature_df.rename(columns={'tmpc': 'temperature'})
temperature_df = temperature_df[['Time', 'temperature']]

temperature_df['Time'] = pd.to_datetime(temperature_df['Time'])
temperature_df['Time'] = pd.DatetimeIndex(temperature_df['Time']) + timedelta(hours=5,minutes=30)
temperature_df['temperature'] = pd.to_numeric(temperature_df['temperature'], errors='coerce')


temperature_df.set_index('Time', inplace=True)
temperature_df['temperature'] = temperature_df['temperature'].interpolate(method='polynomial', order = 5)
temperature_df.reset_index(inplace=True)

In [7]:
temperature_df

Unnamed: 0,Time,temperature
0,2022-01-01 05:30:00,13.0
1,2022-01-01 06:00:00,12.0
2,2022-01-01 06:30:00,12.0
3,2022-01-01 07:00:00,12.0
4,2022-01-01 07:30:00,14.0
...,...,...
29230,2023-10-29 23:30:00,22.0
29231,2023-10-30 00:00:00,22.0
29232,2023-10-30 00:30:00,22.0
29233,2023-10-30 04:30:00,20.0


In [8]:


# plotting temperature vs time
plt.figure(figsize=(10, 5))
plt.ion()
plt.plot(temperature_df['Time'], temperature_df['temperature'], marker='o', linestyle='-', color='r', label='temperature')

plt.xlabel('Time')

plt.xticks(rotation=45)

ax = plt.gca()
ax.set_facecolor('white')
ax.set_title('temperature vs time')
plt.legend()
plt.tight_layout()

plt.show()


<Figure size 1000x500 with 0 Axes>

<contextlib.ExitStack at 0x209755e0ad0>

[<matplotlib.lines.Line2D at 0x209755524d0>]

Text(0.5, 0, 'Time')

(array([18993., 19083., 19174., 19266., 19358., 19448., 19539., 19631.]),
 [Text(18993.0, 0, '2022-01'),
  Text(19083.0, 0, '2022-04'),
  Text(19174.0, 0, '2022-07'),
  Text(19266.0, 0, '2022-10'),
  Text(19358.0, 0, '2023-01'),
  Text(19448.0, 0, '2023-04'),
  Text(19539.0, 0, '2023-07'),
  Text(19631.0, 0, '2023-10')])

Text(0.5, 1.0, 'temperature vs time')

<matplotlib.legend.Legend at 0x2097538c590>

In [9]:
common_times = set(power_df['Time']).intersection(set(temperature_df['Time']))
num_common_rows = len(common_times)

print(f'Number of rows with same values in "Time" column: {num_common_rows}')

Number of rows with same values in "Time" column: 5477


In [10]:
# joining the two dataframes such that the temperature data is only taken if there exists a reading in the power data dataframe
df = pd.merge(power_df, temperature_df, on='Time', how='left')
DF = pd.merge(full_power_df, temperature_df, on='Time', how='left')
df['Time'] = pd.to_datetime(df['Time'])
#drop all the rows where NaN
df.dropna(inplace=True)
df = df[df['MW'] <= 20]
df = df.sort_values('Time')
df.reset_index(inplace=True, drop = True)
df


Unnamed: 0,Time,MW,temperature
0,2022-01-24 02:00:00,2.4,16.0
1,2022-01-28 02:00:00,2.0,14.0
2,2022-02-01 06:00:00,1.7,12.0
3,2022-02-01 07:00:00,1.8,13.0
4,2022-02-01 08:00:00,2.3,14.0
...,...,...,...
5480,2023-06-27 19:00:00,5.7,28.0
5481,2023-06-27 20:00:00,5.6,28.0
5482,2023-06-27 21:00:00,5.3,28.0
5483,2023-06-27 22:00:00,4.8,28.0


In [11]:
temporary_df = df.copy()
# Convert 'Time' column to datetime if it's not already
temporary_df['Time'] = pd.to_datetime(temporary_df['Time'])
# Set 'Time' column as the index
temporary_df.set_index('Time', inplace=True)
# Resample the data to get weekly averages
weekly_avg_temperature = temporary_df['temperature'].resample('W').mean()
# Plot the weekly average temperature
plt.figure(figsize=(12, 6))
plt.plot(weekly_avg_temperature.index, weekly_avg_temperature.values, marker='o', linestyle='-')
plt.xlabel('Week')
plt.ylabel('Average Temperature')
plt.title('Weekly Average Temperature')
plt.grid(True)
plt.show()


<Figure size 1200x600 with 0 Axes>

[<matplotlib.lines.Line2D at 0x20976261ad0>]

Text(0.5, 0, 'Week')

Text(0, 0.5, 'Average Temperature')

Text(0.5, 1.0, 'Weekly Average Temperature')

In [12]:
df['Month'] = df['Time'].dt.month
df['Year'] = df['Time'].dt.year

# Group by month and year, and count the number of data entries
monthly_data_counts = df.groupby(['Year', 'Month']).size().reset_index(name='DataCount')

# Sort the data counts in descending order
sorted_monthly_data_counts = monthly_data_counts.sort_values(by='DataCount', ascending=False)

# List of months with most data available in descending order
months_with_most_data = []
for index, row in sorted_monthly_data_counts.iterrows():
    month_name = pd.Timestamp(year=row['Year'], month=row['Month'], day=1).strftime('%B %Y')
    data_count = row['DataCount']
    months_with_most_data.append((month_name, data_count))

print("Months with most data available in descending order:")
for month, data_count in months_with_most_data:
    print(f"{month}: {data_count} data points")

columns_to_drop = ['Year', 'Month']
df.drop(columns=columns_to_drop, inplace=True)

Months with most data available in descending order:
May 2023: 545 data points
June 2023: 515 data points
August 2022: 489 data points
July 2022: 467 data points
March 2022: 457 data points
February 2022: 447 data points
October 2022: 437 data points
November 2022: 304 data points
May 2022: 268 data points
March 2023: 252 data points
December 2022: 252 data points
June 2022: 242 data points
January 2023: 222 data points
September 2022: 219 data points
April 2022: 214 data points
February 2023: 153 data points
January 2022: 2 data points


In [13]:
full_model_start_time = pd.Timestamp('2022-02-07 00:00:00')
full_model_end_time = pd.Timestamp('2022-12-25 23:00:00')
df = df.sort_values('Time')
df = df[(df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)]

df.reset_index(drop=True)
df

Unnamed: 0,Time,MW,temperature
0,2022-02-07 00:00:00,2.18,12.0
1,2022-02-07 03:00:00,1.71,12.0
2,2022-02-07 05:00:00,1.70,12.0
3,2022-02-07 14:00:00,2.50,22.0
4,2022-02-07 15:00:00,2.50,22.0
...,...,...,...
3598,2022-12-25 15:00:00,2.40,24.0
3599,2022-12-25 17:00:00,2.60,21.0
3600,2022-12-25 19:00:00,2.60,18.0
3601,2022-12-25 21:00:00,2.60,18.0


Unnamed: 0,Time,MW,temperature
103,2022-02-07 00:00:00,2.18,12.0
104,2022-02-07 03:00:00,1.71,12.0
105,2022-02-07 05:00:00,1.70,12.0
106,2022-02-07 14:00:00,2.50,22.0
107,2022-02-07 15:00:00,2.50,22.0
...,...,...,...
3701,2022-12-25 15:00:00,2.40,24.0
3702,2022-12-25 17:00:00,2.60,21.0
3703,2022-12-25 19:00:00,2.60,18.0
3704,2022-12-25 21:00:00,2.60,18.0


In [14]:
correlation_start_time = pd.Timestamp('2022-07-04 00:00:00')
correlation_end_time = pd.Timestamp('2022-08-28 23:00:00')
training_mask = (df['Time'] >= correlation_start_time) & (df['Time'] <= correlation_end_time)
mw_array = np.array(df['MW'][training_mask], dtype='float64')
correlation_values = []

for i in range(20, 40):
    TcoolStPt = i
    CDH = df['temperature'][training_mask] - TcoolStPt
    CDH.clip(lower=0, inplace=True)
    CDH = pd.DataFrame(data=CDH.values, columns=['CDH'], index=df[training_mask].index)
    temp_array = np.array(CDH['CDH'], dtype='float64')
    correlation_coefficient = np.corrcoef(mw_array, temp_array)[0, 1]
    correlation_values.append((TcoolStPt, correlation_coefficient))

# Sort the correlation values list in descending order based on correlation coefficient
sorted_correlation_values = sorted(correlation_values, key=lambda x: x[1], reverse=True)

# Print the TcoolStPt values with highest correlation coefficients in descending order
for tcool_stpt, corr_coeff in sorted_correlation_values:
    print(f'TcoolStPt: {tcool_stpt}, Correlation Coefficient: {corr_coeff}')

TcoolStPt: 20, Correlation Coefficient: 0.33506987356128504
TcoolStPt: 21, Correlation Coefficient: 0.33506987356128487
TcoolStPt: 22, Correlation Coefficient: 0.33506987356128487
TcoolStPt: 23, Correlation Coefficient: 0.33506987356128487
TcoolStPt: 24, Correlation Coefficient: 0.33506987356128487
TcoolStPt: 25, Correlation Coefficient: 0.33506987356128487
TcoolStPt: 26, Correlation Coefficient: 0.3332339229685242
TcoolStPt: 27, Correlation Coefficient: 0.3260764260512506
TcoolStPt: 28, Correlation Coefficient: 0.30692487715279465
TcoolStPt: 29, Correlation Coefficient: 0.2639424385073419
TcoolStPt: 30, Correlation Coefficient: 0.2111712640247728
TcoolStPt: 31, Correlation Coefficient: 0.17575610675706818
TcoolStPt: 32, Correlation Coefficient: 0.14074346886729133
TcoolStPt: 33, Correlation Coefficient: 0.07722296457902907
TcoolStPt: 36, Correlation Coefficient: 0.0038029578442890766
TcoolStPt: 34, Correlation Coefficient: 0.003618685986427374
TcoolStPt: 35, Correlation Coefficient: -

In [15]:
'''TcoolStPt = 31
CDH = df['temperature'] - TcoolStPt
CDH.clip(lower=0, inplace=True)
CDH = pd.DataFrame(data=CDH.values, columns=['CDH'])
df = pd.concat([df, CDH['CDH']], axis=1)'''

TcoolStPt = 31
CDH = df['temperature'] - TcoolStPt
CDH.clip(lower=0, inplace=True)
CDH = pd.DataFrame(data=CDH.values, columns=['CDH'], index=df.index)
# Concatenate CDH with the original DataFrame using the index
df = pd.concat([df, CDH], axis=1)

CDH = DF['temperature'] - TcoolStPt
CDH.clip(lower=0, inplace=True)
CDH = pd.DataFrame(data=CDH.values, columns=['CDH'], index=DF.index)
# Concatenate CDH with the original DataFrame using the index
DF = pd.concat([DF, CDH], axis=1)
df

"TcoolStPt = 31\nCDH = df['temperature'] - TcoolStPt\nCDH.clip(lower=0, inplace=True)\nCDH = pd.DataFrame(data=CDH.values, columns=['CDH'])\ndf = pd.concat([df, CDH['CDH']], axis=1)"

Unnamed: 0,Time,MW,temperature,CDH
103,2022-02-07 00:00:00,2.18,12.0,0.0
104,2022-02-07 03:00:00,1.71,12.0,0.0
105,2022-02-07 05:00:00,1.70,12.0,0.0
106,2022-02-07 14:00:00,2.50,22.0,0.0
107,2022-02-07 15:00:00,2.50,22.0,0.0
...,...,...,...,...
3701,2022-12-25 15:00:00,2.40,24.0,0.0
3702,2022-12-25 17:00:00,2.60,21.0,0.0
3703,2022-12-25 19:00:00,2.60,18.0,0.0
3704,2022-12-25 21:00:00,2.60,18.0,0.0


In [16]:
mw_array = np.array(df['MW'], dtype='float64')
temp_array = np.array(df['CDH'], dtype='float64')
correlation_coefficient = np.corrcoef(mw_array, temp_array)[0, 1]
print('correlation between CDH and power: ', correlation_coefficient)

correlation between CDH and power:  0.24590312440133627


In [17]:
df.dropna(inplace=True)
df = df.sort_values('Time')
df.reset_index(inplace=True, drop = True)

In [18]:

plt.ion()
df['MW'] = pd.to_numeric(df['MW'], errors = 'coerce')
f,ax = plt.subplots(nrows = 1, figsize=(5,5))
#plt.plot(df['temperature'], df['MW'])
plt.scatter(df['temperature'], df['MW'],color='red')
ax.set_title('MW vs Temperature')

power_normalized = preprocessing.normalize([df['MW']]).reshape(-1,1)
temperature_normalized = preprocessing.normalize([df['temperature']]).reshape(-1,1)

plt.figure(figsize=(10, 5))
plt.ion()
plt.plot(df['Time'], df['MW'], marker='o', linestyle='-', color='r', label='MW')
plt.plot(df['Time'], df['temperature'], marker='o', linestyle='-', color='y', label='temp')
plt.xlabel('Time')
plt.xticks(rotation=45)
ax = plt.gca()
ax.set_facecolor('white')
ax.set_title('power and temperature vs time (not normalized)')
plt.legend()
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 5))
plt.ion()
plt.plot(df['Time'], power_normalized, marker='o', linestyle='-', color='r', label='MW')
plt.plot(df['Time'], temperature_normalized, marker='o', linestyle='-', color='y', label='temp')
plt.xlabel('Time')
plt.xticks(rotation=45)
ax = plt.gca()
ax.set_facecolor('white')
ax.set_title('power and temperature vs time (normalized)')
plt.legend()
plt.tight_layout()
plt.show()

<contextlib.ExitStack at 0x209758af390>

<matplotlib.collections.PathCollection at 0x20975570f90>

Text(0.5, 1.0, 'MW vs Temperature')

<Figure size 1000x500 with 0 Axes>

<contextlib.ExitStack at 0x209755bc990>

[<matplotlib.lines.Line2D at 0x209759bfe50>]

[<matplotlib.lines.Line2D at 0x2097571a1d0>]

Text(0.5, 0, 'Time')

(array([19024., 19052., 19083., 19113., 19144., 19174., 19205., 19236.,
        19266., 19297., 19327., 19358.]),
 [Text(19024.0, 0, '2022-02'),
  Text(19052.0, 0, '2022-03'),
  Text(19083.0, 0, '2022-04'),
  Text(19113.0, 0, '2022-05'),
  Text(19144.0, 0, '2022-06'),
  Text(19174.0, 0, '2022-07'),
  Text(19205.0, 0, '2022-08'),
  Text(19236.0, 0, '2022-09'),
  Text(19266.0, 0, '2022-10'),
  Text(19297.0, 0, '2022-11'),
  Text(19327.0, 0, '2022-12'),
  Text(19358.0, 0, '2023-01')])

Text(0.5, 1.0, 'power and temperature vs time (not normalized)')

<matplotlib.legend.Legend at 0x20975666cd0>

<Figure size 1000x500 with 0 Axes>

<contextlib.ExitStack at 0x20943ef4610>

[<matplotlib.lines.Line2D at 0x209758dc050>]

[<matplotlib.lines.Line2D at 0x20975406790>]

Text(0.5, 0, 'Time')

(array([19024., 19052., 19083., 19113., 19144., 19174., 19205., 19236.,
        19266., 19297., 19327., 19358.]),
 [Text(19024.0, 0, '2022-02'),
  Text(19052.0, 0, '2022-03'),
  Text(19083.0, 0, '2022-04'),
  Text(19113.0, 0, '2022-05'),
  Text(19144.0, 0, '2022-06'),
  Text(19174.0, 0, '2022-07'),
  Text(19205.0, 0, '2022-08'),
  Text(19236.0, 0, '2022-09'),
  Text(19266.0, 0, '2022-10'),
  Text(19297.0, 0, '2022-11'),
  Text(19327.0, 0, '2022-12'),
  Text(19358.0, 0, '2023-01')])

Text(0.5, 1.0, 'power and temperature vs time (normalized)')

<matplotlib.legend.Legend at 0x209758df3d0>

In [19]:
numOmegas = 24 * 7
num_of_rows = df.shape[0]
omegas = np.zeros((num_of_rows, numOmegas))  # Assuming numOmegas columns for omegas
concatenated_data = np.concatenate((df, omegas), axis=1)
column_names = ['Time', 'MW', 'temperature', 'CDH']
for i in range(1, numOmegas + 1,1):
    column_names.append('omega' + str(i))

df = pd.DataFrame(concatenated_data, columns=column_names)
df['Time'] = pd.to_datetime(df['Time'])
for i in range(0,num_of_rows):
        datetime = df.Time.loc[i]
        hourOfWeekIndex = int(datetime.dayofweek*24+(datetime.hour+1))
        x = np.zeros((1,numOmegas))
        x[0,hourOfWeekIndex-1]=1
        omegas[i,:]=x

df.iloc[:,4:]=omegas
df

Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega159,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168
0,2022-02-07 00:00:00,2.18,12.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-02-07 03:00:00,1.71,12.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-02-07 05:00:00,1.7,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-02-07 14:00:00,2.5,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-02-07 15:00:00,2.5,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3598,2022-12-25 15:00:00,2.4,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3599,2022-12-25 17:00:00,2.6,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3600,2022-12-25 19:00:00,2.6,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3601,2022-12-25 21:00:00,2.6,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import itertools


In [21]:
#set timeframe for training data
start_time = pd.Timestamp('2022-07-04 00:00:00')
end_time = pd.Timestamp('2022-08-28 23:00:00')
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time) & (df['CDH'] > 0)
training_df = df[training_mask]
#training_df = training_df.dropna(subset=['CDH'])
#training_df = training_df.dropna(subset=['MW'])

x_train = training_df.loc[:, 'CDH']
y_train = training_df.loc[:, 'MW']
t_train = training_df['Time']

x_train = pd.to_numeric(x_train, errors='coerce')
x_train = x_train.values.reshape(-1, 1)
y_train = np.array(y_train)

# For testing data, use data after end_time to testing_end_time
testing_end_time = pd.Timestamp('2022-10-15 23:00:00')
testing_mask = (df['Time'] > end_time) & (df['Time'] <= testing_end_time) & (df['CDH'] > 0)
testing_df = df[testing_mask]
testing_df = testing_df.dropna(subset=['CDH'])
testing_df = testing_df.dropna(subset=['MW'])
x_test = testing_df.loc[:, 'CDH']
y_test = testing_df.loc[:, 'MW']
t_test = testing_df['Time']

In [22]:
# Linear regression
temperature_model = LinearRegression()
temperature_model.fit(x_train, y_train)

predictions = temperature_model.predict(x_train)
coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)

print("score of the model is ", CDHmodelScore)

'''
plt.figure(figsize=(10, 5))
plt.scatter(x_train, y_train, color='blue', label='Actual MW')
plt.scatter(x_train, predictions, color='red', label='Predicted MW')
plt.xlabel('x_train')
plt.ylabel('MW')
plt.title('Actual vs Predicted MW (using only CDH values to predict)')
plt.legend()
plt.show()
'''

score of the model is  2.3765267531672407e-09


"\nplt.figure(figsize=(10, 5))\nplt.scatter(x_train, y_train, color='blue', label='Actual MW')\nplt.scatter(x_train, predictions, color='red', label='Predicted MW')\nplt.xlabel('x_train')\nplt.ylabel('MW')\nplt.title('Actual vs Predicted MW (using only CDH values to predict)')\nplt.legend()\nplt.show()\n"

In [23]:
print('temperature model coefficient is: ', coef_cooling)

power_array = np.array(training_df['MW'], dtype='float64')
temp_array = np.array(training_df['CDH'], dtype='float64')
correlation_coefficient = np.corrcoef(power_array, temp_array)[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)

temperature model coefficient is:  [3.52073631e-05]
correlation between CDH and power in training set:  4.87496360957703e-05


In [24]:
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time)
training_df = df[training_mask]
#Linear regression for the rest of the model
x_behavior = df[training_mask].loc[:,'omega1':'omega168']
y = pd.DataFrame(df['MW'][training_mask])
yhat_c = (coef_cooling*df.loc[:,'CDH'][training_mask].values).reshape(-1,1)
y_behavior = y - (yhat_c)
# Drop rows where y_behavior has NaN values
#y_behavior = y_behavior.dropna()
# Get the index of non-null values in y_behavior and use it to filter x_behavior
#non_null_index = y_behavior.index
#x_behavior_filtered = x_behavior.loc[non_null_index]
#x_behavior= x_behavior_filtered
time = df['Time'][training_mask]

## ------ fit the behavior model -----
behavior_model = LinearRegression(fit_intercept=False, positive=True)
behavior_model.verbose=False
behavior_model.fit(x_behavior,y_behavior)
#model.coef_,model.intercept_
behavior_modelScore = behavior_model.score(x_behavior.values,y_behavior.values)
print('score of the behavior model is '+str(behavior_modelScore))
yhat_behavior = behavior_model.predict(x_behavior.values)

predicted_data = pd.DataFrame({'Time': time, 'Predicted MW': yhat_behavior.reshape(-1)})
plt.figure(figsize=(10, 5))
plt.scatter(df['Time'][training_mask], yhat_behavior, color='red', label='Predicted MW')
plt.scatter(df['Time'][training_mask], y_behavior.values, color='blue', label='Actual MW')
plt.xlabel('Time')
plt.ylabel('MW')
plt.title('Actual vs Predicted MW (using behavior model)')
plt.legend()
plt.show()

'''
plt.figure(figsize=(10, 5))

plt.scatter(time, y_behavior.values, color='blue', label='Actual MW')
plt.scatter(time, yhat_behavior, color='red', label='Predicted MW')

# Set labels and title
plt.xlabel('time')
plt.ylabel('MW')
plt.title('Actual vs Predicted MW (using behavior model)')
plt.legend()
plt.show()
'''


score of the behavior model is 0.6570158572838127


<Figure size 1000x500 with 0 Axes>

<matplotlib.collections.PathCollection at 0x20901e4b6d0>

<matplotlib.collections.PathCollection at 0x20901e80dd0>

Text(0.5, 0, 'Time')

Text(0, 0.5, 'MW')

Text(0.5, 1.0, 'Actual vs Predicted MW (using behavior model)')

<matplotlib.legend.Legend at 0x2097569fb10>

"\nplt.figure(figsize=(10, 5))\n\nplt.scatter(time, y_behavior.values, color='blue', label='Actual MW')\nplt.scatter(time, yhat_behavior, color='red', label='Predicted MW')\n\n# Set labels and title\nplt.xlabel('time')\nplt.ylabel('MW')\nplt.title('Actual vs Predicted MW (using behavior model)')\nplt.legend()\nplt.show()\n"

In [25]:
behavior_model.coef_

array([[5.15      , 4.19      , 4.53666667, 0.        , 0.        ,
        4.0125    , 3.5       , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 5.22492518,
        5.22492518, 5.25710262, 5.29995473, 5.4666432 , 5.1285664 ,
        5.13333333, 5.        , 4.91428571, 5.        , 4.2       ,
        3.64      , 2.9       , 3.975     , 3.35      , 3.98571429,
        4.05      , 4.39999497, 4.6       , 4.7999956 , 5.3874868 ,
        5.24995306, 5.51243839, 5.44994132, 4.75      , 0.        ,
        5.5       , 0.        , 5.25      , 5.        , 0.        ,
        0.        , 4.975     , 4.975     , 4.3       , 0.        ,
        3.25      , 4.5       , 3.5       , 3.5875    , 3.75428571,
        4.10285714, 4.45714286, 4.56427566, 5.19997485, 5.23566902,
        5.51994467, 5.37494132, 5.61245159, 5.81245159, 6.02496479,
        5.8749824 , 5.8499912 , 5.825     , 5.6       , 5.4625    ,
        4.8       , 4.75      , 4.1       , 4.15

In [26]:
full_model_start_time = pd.Timestamp('2022-02-07 00:00:00')
full_model_end_time = pd.Timestamp('2022-12-25 23:00:00')

In [27]:
# constructing the full LR model using time and temperature models
c = np.concatenate((np.array([coef_cooling]).reshape(1,-1),behavior_model.coef_),axis=1)
full_model_mask = (df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)
full_model = LinearRegression(fit_intercept=False)
full_model.coef_ = c
full_model.intercept_ = 0

final_df = df[full_model_mask]
final_df = final_df.sort_values('Time')
full_model_mask.reset_index(drop = True)
X = final_df.loc[:,'CDH':'omega168']
y = pd.DataFrame(final_df['MW'])

#y = y.dropna()
#X = X.loc[y.index]
#X = X.dropna()
#y = y.loc[X.index]
time = final_df.loc[y.index, 'Time']
yhat = full_model.predict(X.values)
pred = pd.DataFrame(data=yhat, columns=['yhat'], index=df.index)
# Concatenate CDH with the original DataFrame using the index
predictions_df = pd.concat([df, pred], axis=1)

full_modelScore = full_model.score(X,y)
print ('score for constructed full model on full data: ', full_modelScore)

fig,(ax1) = plt.subplots(nrows=1,figsize=(10,9))
_=ax1.plot(time,y,label='meas')
_=ax1.plot(time,yhat,label='pred: behavior+heat/cool')
ax1.set_title('measured vs predicted data (full constructed model)')
_=ax1.legend()

0       True
1       True
2       True
3       True
4       True
        ... 
3598    True
3599    True
3600    True
3601    True
3602    True
Name: Time, Length: 3603, dtype: bool

score for constructed full model on full data:  -0.8420273157550928


Text(0.5, 1.0, 'measured vs predicted data (full constructed model)')

In [28]:
modelscoreontrainingdata = full_model.score(X[training_mask], y[training_mask])
print("score of the fully constructed model on training data: ", modelscoreontrainingdata)

score of the fully constructed model on training data:  0.6570217813136736


In [29]:
# list of rows where model is predicting 0
predictions_df = predictions_df[predictions_df['yhat'] == 0]
predictions_df

Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168,yhat
1,2022-02-07 03:00:00,1.71,12.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,2022-02-10 22:00:00,2.7,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,2022-02-11 00:00:00,2.3,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,2022-02-11 01:00:00,2.2,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,2022-02-11 03:00:00,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3448,2022-12-16 05:00:00,2.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3551,2022-12-22 22:00:00,2.8,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3552,2022-12-22 23:00:00,2.4,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3553,2022-12-23 00:00:00,2.25,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
for i in range(1,169):
    sum = predictions_df[f'omega{i}'].sum()
    if (sum!=0):
        print(f"{i}:{sum}")


4:5.0
5:6.0
8:3.0
9:3.0
10:3.0
11:2.0
12:2.0
13:2.0
14:2.0
40:3.0
42:4.0
45:3.0
46:4.0
50:4.0
76:1.0
95:7.0
96:6.0
97:12.0
98:7.0
99:6.0
100:9.0
101:10.0
102:12.0
147:2.0
149:1.0


In [31]:
training_df[training_df['omega45']  > 0]

Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega159,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168


In [32]:
training_df = training_df.sort_values('Time')
training_df

Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega159,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168
1578,2022-07-04 14:00:00,5.0,34.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1579,2022-07-04 15:00:00,5.0,34.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1580,2022-07-04 16:00:00,5.0,34.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1581,2022-07-04 17:00:00,5.0,34.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1582,2022-07-04 19:00:00,4.8,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,2022-08-28 17:00:00,4.2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2440,2022-08-28 18:00:00,4.2,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2441,2022-08-28 19:00:00,5.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2442,2022-08-28 21:00:00,5.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
df['Time'] = pd.to_datetime(df['Time'])
hours = df['Time'].dt.hour
df_12am_to_8am = df[(hours >= 0) & (hours <= 8)]
df_excluding_12am_to_8am = df[(hours < 0) | (hours > 8)]


In [34]:
df_12am_to_8am.dropna(inplace=True)
df_12am_to_8am = df_12am_to_8am[training_mask]
x = df_12am_to_8am.loc[:,'CDH':'omega168']
y = pd.DataFrame(df_12am_to_8am['MW'])
night_score = full_model.score(x,y)
print('model score for data between 12am to 8 am on training data is: ', night_score)

model score for data between 12am to 8 am on training data is:  0.36471888164937316


In [35]:
df_12am_to_8am

Unnamed: 0,Time,MW,temperature,CDH,omega1,omega2,omega3,omega4,omega5,omega6,...,omega159,omega160,omega161,omega162,omega163,omega164,omega165,omega166,omega167,omega168
1586,2022-07-05 01:00:00,3.1,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1587,2022-07-05 04:00:00,2.8,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1588,2022-07-05 05:00:00,4.1,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1589,2022-07-05 06:00:00,4.3,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1590,2022-07-05 07:00:00,4.3,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2427,2022-08-28 01:00:00,4.1,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2428,2022-08-28 05:00:00,4.7,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2429,2022-08-28 06:00:00,4.6,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2430,2022-08-28 07:00:00,4.6,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
STOP

NameError: name 'STOP' is not defined

In [None]:
start_time = pd.Timestamp('2022-10-03 00:00:00')
end_time = pd.Timestamp('2023-01-01 23:00:00')

In [None]:
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time) & (df['CDH'] > 0)
training_df = df[training_mask]
#training_df = training_df.dropna(subset=['CDH'])
#training_df = training_df.dropna(subset=['MW'])

x_train = training_df.loc[:, 'CDH']
y_train = training_df.loc[:, 'MW']
t_train = training_df['Time']

x_train = pd.to_numeric(x_train, errors='coerce')
x_train = x_train.values.reshape(-1, 1)
y_train = np.array(y_train)
temperature_model = LinearRegression()
temperature_model.fit(x_train, y_train)

predictions = temperature_model.predict(x_train)
coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)

print("score of the model is ", CDHmodelScore)

print('temperature model coefficient is: ', coef_cooling)

power_array = np.array(training_df['MW'], dtype='float64')
temp_array = np.array(training_df['CDH'], dtype='float64')
correlation_coefficient = np.corrcoef(power_array, temp_array)[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)


training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time)
#Linear regression for the rest of the model
x_behavior = df[training_mask].loc[:,'omega1':'omega168']
y = pd.DataFrame(df['MW'][training_mask])
yhat_c = (coef_cooling*df.loc[:,'CDH'][training_mask].values).reshape(-1,1)
y_behavior = y - (yhat_c)
# Drop rows where y_behavior has NaN values
#y_behavior = y_behavior.dropna()
# Get the index of non-null values in y_behavior and use it to filter x_behavior
#non_null_index = y_behavior.index
#x_behavior_filtered = x_behavior.loc[non_null_index]
#x_behavior= x_behavior_filtered
time = df['Time'][training_mask]

## ------ fit the behavior model -----
behavior_model = LinearRegression(fit_intercept=False, positive=True)
behavior_model.verbose=False
behavior_model.fit(x_behavior,y_behavior)
#model.coef_,model.intercept_
behavior_modelScore = behavior_model.score(x_behavior.values,y_behavior.values)
print('score of the behavior model is '+str(behavior_modelScore))
yhat_behavior = behavior_model.predict(x_behavior.values)

full_model_start_time = pd.Timestamp('2022-02-07 00:00:00')
full_model_end_time = pd.Timestamp('2022-12-25 23:00:00')
# constructing the full LR model using time and temperature models
c = np.concatenate((np.array([coef_cooling]).reshape(1,-1),behavior_model.coef_),axis=1)
full_model_mask = (df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)
full_model = LinearRegression(fit_intercept=False)
full_model.coef_ = c
full_model.intercept_ = 0

final_df = df[full_model_mask]
final_df = final_df.sort_values('Time')
full_model_mask.reset_index(drop = True)
X = final_df.loc[:,'CDH':'omega168']
y = pd.DataFrame(final_df['MW'])

#y = y.dropna()
#X = X.loc[y.index]
#X = X.dropna()
#y = y.loc[X.index]
time = final_df.loc[y.index, 'Time']
yhat = full_model.predict(X.values)
pred = pd.DataFrame(data=yhat, columns=['yhat'], index=df.index)
# Concatenate CDH with the original DataFrame using the index
predictions_df = pd.concat([df, pred], axis=1)

full_modelScore = full_model.score(X,y)
print ('score for constructed full model: ', full_modelScore)

fig,(ax1) = plt.subplots(nrows=1,figsize=(10,9))
_=ax1.plot(time,y,label='meas')
_=ax1.plot(time,yhat,label='pred: behavior+heat/cool')
ax1.set_title('measured vs predicted data (full constructed model)')
_=ax1.legend()

In [None]:
modelscoreontrainingdata = full_model.score(X[training_mask], y[training_mask])
print("score of the fully constructed model on training data: ", modelscoreontrainingdata)

In [None]:
trials

In [None]:
full_model_mask = (df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)
x = df[full_model_mask]['CDH'].values.reshape(-1,1)
y = df[full_model_mask]['MW'].values.reshape(-1,1)


In [None]:
temperature_model = LinearRegression()
temperature_model.fit(x, y)

predictions = temperature_model.predict(x_train)
coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)

print("coeff of the model is ", temperature_model.coef_)

In [None]:
new_start_time  = pd.Timestamp('2022-07-09 12:00:00')
new_end_time = pd.Timestamp('2022-07-09 15:00:00')
new_model_mask = (df['Time'] >= new_start_time) & (df['Time'] <= new_end_time) & (df['CDH'] > 0)
x = df[new_model_mask]['CDH'].values.reshape(-1,1)
y = df[new_model_mask]['MW'].values.reshape(-1,1)
x = np.array([1.0, 2.5, 3.8]).reshape(-1, 1)
y = np.array([4.0, 4.5, 5.0]).reshape(-1, 1)


In [None]:
x

In [None]:
model = LinearRegression()
model.fit(x,y)
model.coef_

In [None]:
power_array = np.array(x, dtype='float64')
temp_array = np.array(y, dtype='float64')
correlation_coefficient = np.corrcoef(x, y)[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)


In [None]:
temp_array