In [None]:
import pandas as pd
import pytz
import numpy as np
import os
from sklearn import preprocessing
import re
import matplotlib
matplotlib.use('Qt5Agg')
from datetime import timedelta
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

In [None]:
def custom_date_parser(date_string):
    """Parse day-first timestamps written as ``DD-MM-YYYY HH:MM:SS``.

    Parameters
    ----------
    date_string : str or list-like of str
        Value(s) to parse; forwarded unchanged to ``pd.to_datetime``.

    Returns
    -------
    Whatever ``pd.to_datetime`` returns for the input (a ``Timestamp`` for a
    single string).
    """
    day_first_format = '%d-%m-%Y %H:%M:%S'
    parsed = pd.to_datetime(date_string, format=day_first_format)
    return parsed

# Root directory that holds one sub-folder per batch of Excel exports.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
path = 'D:\\mlinternship\\iitgdata'

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order of the concatenated frame) reproducible.
folders = sorted(
    folder for folder in os.listdir(path)
    if os.path.isdir(os.path.join(path, folder))
)
df_list = []

for folder in folders:
    folder_path = os.path.join(path, folder)

    for filename in sorted(os.listdir(folder_path)):
        # Only Excel workbooks are loaded; everything else is skipped.
        if not filename.endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, filename)

        # header=3: the column names are taken from the fourth row of the sheet.
        # The former `date_parser=custom_date_parser` argument is dropped: with
        # no `parse_dates` it was never applied, and the keyword is deprecated
        # in pandas 2.x. Timestamps are parsed later with pd.to_datetime.
        # NOTE(review): if 'Time' is stored as DD-MM-YYYY text, confirm that the
        # later pd.to_datetime call interprets it day-first.
        df_list.append(pd.read_excel(file_path, header=3))

# Fail here with a clear message instead of a bare error from pd.concat later.
if not df_list:
    raise FileNotFoundError(f'No .xlsx files found in the sub-folders of {path!r}')


In [None]:
# Stack every loaded sheet into one frame and keep a raw CSV snapshot
# (written before any cleaning, with the default integer index as first column).
power_df = pd.concat(df_list, ignore_index=True)
power_df.to_csv('power_datacsv.csv')

# Parse timestamps and round them to whole minutes. `.dt.round` is the
# documented datetime API; `Series.round` is documented for a number of
# decimals, so passing it a frequency string is not reliable across versions.
power_df['Time'] = pd.to_datetime(power_df['Time']).dt.round('min')

# 'NR' / 'nr' entries in the MW column are treated as missing values.
power_df['MW'] = power_df['MW'].replace(['NR', 'nr'], np.nan)

# Keep a copy with all original columns for later use.
full_power_df = power_df.copy()

# Work on an independent two-column copy so the assignments below never write
# into a slice of the wide frame; rows missing either value are dropped.
power_df = power_df[['Time', 'MW']].dropna().copy()

# Replace commas with dots in MW (i.e. treat a comma as the decimal separator)
# and convert to numbers; values that still cannot be parsed become NaN.
power_df['MW'] = pd.to_numeric(
    power_df['MW'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
)

# 'Time' stays timezone-naive: no timezone conversion is performed here.
power_df = power_df.sort_values('Time')

In [None]:
# Display the cleaned, time-sorted power readings.
power_df

In [None]:
# Line plot of the raw (un-normalised) MW readings against time.
plt.figure(figsize=(10, 5))
plt.ion()
ax = plt.gca()
ax.plot(power_df['Time'], power_df['MW'], marker='o', linestyle='-', color='r', label='MW')
ax.set_xlabel('Time')
plt.xticks(rotation=45)
ax.set_facecolor('white')
ax.set_title('power vs time (not normalized)')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Read the temperature CSV.
temperature_data_csv_path = 'D:\\mlinternship\\iitgdata\\temperaturedata'
filename = 'guwahati_temperature_data.csv'
file = os.path.join(temperature_data_csv_path, filename)

# Rename the two columns of interest and discard everything else.
temperature_df = pd.read_csv(file).rename(columns={'valid': 'Time', 'tmpc': 'temperature'})
temperature_df = temperature_df[['Time', 'temperature']]

# Shift every timestamp by +05:30.
# NOTE(review): presumably converts UTC readings to IST - confirm.
local_offset = timedelta(hours=5, minutes=30)
temperature_df['Time'] = pd.DatetimeIndex(pd.to_datetime(temperature_df['Time'])) + local_offset

# Non-numeric temperature entries become NaN so they can be interpolated.
temperature_df['temperature'] = pd.to_numeric(temperature_df['temperature'], errors='coerce')

# Interpolate gaps against the time index (5th-order polynomial), then
# restore 'Time' as an ordinary column.
temperature_df = temperature_df.set_index('Time')
temperature_df['temperature'] = temperature_df['temperature'].interpolate(method='polynomial', order=5)
temperature_df = temperature_df.reset_index()

In [None]:
# Display the interpolated temperature series.
temperature_df

In [None]:


# Line plot of the interpolated temperature against time.
plt.figure(figsize=(10, 5))
plt.ion()
ax = plt.gca()
ax.plot(temperature_df['Time'], temperature_df['temperature'], marker='o', linestyle='-', color='r', label='temperature')
ax.set_xlabel('Time')
plt.xticks(rotation=45)
ax.set_facecolor('white')
ax.set_title('temperature vs time')
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Timestamps that occur in both series. Sets are used, so this counts
# distinct timestamps rather than rows.
power_times = set(power_df['Time'])
temperature_times = set(temperature_df['Time'])
common_times = power_times & temperature_times
num_common_rows = len(common_times)

print(f'Number of rows with same values in "Time" column: {num_common_rows}')

In [None]:
# Left-join temperature onto the power readings: a temperature value is kept
# only where a power reading exists at the same timestamp.
# DF is the same join on the wide power frame (all original columns).
DF = full_power_df.merge(temperature_df, on='Time', how='left')
df = power_df.merge(temperature_df, on='Time', how='left')
df['Time'] = pd.to_datetime(df['Time'])

# Drop incomplete rows, keep readings of at most 20 MW, order by time.
df = df.dropna()
df = df.loc[df['MW'] <= 20].sort_values('Time').reset_index(drop=True)
df


In [None]:
# Time-indexed copy of df (df itself is left untouched).
temporary_df = df.assign(Time=pd.to_datetime(df['Time'])).set_index('Time')

# Mean temperature per calendar week.
weekly_avg_temperature = temporary_df['temperature'].resample('W').mean()

# Plot the weekly averages.
plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.plot(weekly_avg_temperature.index, weekly_avg_temperature.values, marker='o', linestyle='-')
ax.set_xlabel('Week')
ax.set_ylabel('Average Temperature')
ax.set_title('Weekly Average Temperature')
ax.grid(True)
plt.show()


In [None]:
# Number of rows per (year, month). The grouping keys are derived Series, so
# df does not gain (and later lose) helper columns.
year_key = df['Time'].dt.year.rename('Year')
month_key = df['Time'].dt.month.rename('Month')
monthly_data_counts = df.groupby([year_key, month_key]).size().reset_index(name='DataCount')

# Months ordered from most to fewest rows.
sorted_monthly_data_counts = monthly_data_counts.sort_values(by='DataCount', ascending=False)

# (label, count) pairs such as ('July 2022', 1234), in that same order.
months_with_most_data = [
    (pd.Timestamp(year=record.Year, month=record.Month, day=1).strftime('%B %Y'), record.DataCount)
    for record in sorted_monthly_data_counts.itertuples(index=False)
]

print("Months with most data available in descending order:")
for month, data_count in months_with_most_data:
    print(f"{month}: {data_count} data points")

In [None]:
# Restrict df to the window used for the full model (both bounds inclusive).
full_model_start_time = pd.Timestamp('2022-02-07 00:00:00')
full_model_end_time = pd.Timestamp('2022-12-25 23:00:00')
df = df.sort_values('Time')
df = df[(df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)]

# reset_index returns a new frame; the result must be assigned, otherwise the
# filtered frame keeps its old, gappy index.
df = df.reset_index(drop=True)
df

In [None]:
# Window over which candidate cooling set-points are compared (inclusive).
correlation_start_time = pd.Timestamp('2022-07-04 00:00:00')
correlation_end_time = pd.Timestamp('2022-08-28 23:00:00')
training_mask = df['Time'].between(correlation_start_time, correlation_end_time)

mw_array = df.loc[training_mask, 'MW'].to_numpy(dtype='float64')
window_temperature = df.loc[training_mask, 'temperature']

# For each integer set-point from 20 to 39, compute CDH = max(temperature - set-point, 0)
# and its Pearson correlation with MW.
correlation_values = []
for TcoolStPt in range(20, 40):
    temp_array = (window_temperature - TcoolStPt).clip(lower=0).to_numpy(dtype='float64')
    correlation_coefficient = np.corrcoef(mw_array, temp_array)[0, 1]
    correlation_values.append((TcoolStPt, correlation_coefficient))

# Highest correlation first.
sorted_correlation_values = sorted(correlation_values, key=lambda pair: pair[1], reverse=True)

for tcool_stpt, corr_coeff in sorted_correlation_values:
    print(f'TcoolStPt: {tcool_stpt}, Correlation Coefficient: {corr_coeff}')

In [None]:
# Cooling set-point used to derive the CDH column.
# NOTE(review): 31 is hard-coded; presumably picked from the set-point
# correlation scan - confirm.
TcoolStPt = 31

# CDH = max(temperature - set-point, 0), computed row by row.
# `assign` overwrites an existing 'CDH' column, so re-running this cell cannot
# append a second, duplicate 'CDH' column the way pd.concat(axis=1) would.
df = df.assign(CDH=(df['temperature'] - TcoolStPt).clip(lower=0))

# Same derived column on the wide frame; rows without a temperature stay NaN.
DF = DF.assign(CDH=(DF['temperature'] - TcoolStPt).clip(lower=0))
df

In [None]:
# Pearson correlation between MW and CDH over all rows of df.
mw_array = df['MW'].to_numpy(dtype='float64')
temp_array = df['CDH'].to_numpy(dtype='float64')
correlation_matrix = np.corrcoef(mw_array, temp_array)
correlation_coefficient = correlation_matrix[0, 1]
print('correlation between CDH and power: ', correlation_coefficient)

In [None]:
# Remove incomplete rows, order by time and renumber the index from zero.
df = df.dropna().sort_values('Time').reset_index(drop=True)

In [None]:

plt.ion()
df['MW'] = pd.to_numeric(df['MW'], errors='coerce')

# Scatter of power against temperature.
f, ax = plt.subplots(nrows=1, figsize=(5, 5))
ax.scatter(df['temperature'], df['MW'], color='red')
ax.set_title('MW vs Temperature')

# Each series is normalised as one single sample (a 1 x n row) with
# sklearn's default settings, then reshaped back into a column.
power_normalized = preprocessing.normalize(df['MW'].to_numpy().reshape(1, -1)).reshape(-1, 1)
temperature_normalized = preprocessing.normalize(df['temperature'].to_numpy().reshape(1, -1)).reshape(-1, 1)

# Raw power and temperature on a shared time axis.
plt.figure(figsize=(10, 5))
plt.ion()
ax = plt.gca()
ax.plot(df['Time'], df['MW'], marker='o', linestyle='-', color='r', label='MW')
ax.plot(df['Time'], df['temperature'], marker='o', linestyle='-', color='y', label='temp')
ax.set_xlabel('Time')
plt.xticks(rotation=45)
ax.set_facecolor('white')
ax.set_title('power and temperature vs time (not normalized)')
ax.legend()
plt.tight_layout()
plt.show()

# Same plot using the normalised series.
plt.figure(figsize=(10, 5))
plt.ion()
ax = plt.gca()
ax.plot(df['Time'], power_normalized, marker='o', linestyle='-', color='r', label='MW')
ax.plot(df['Time'], temperature_normalized, marker='o', linestyle='-', color='y', label='temp')
ax.set_xlabel('Time')
plt.xticks(rotation=45)
ax.set_facecolor('white')
ax.set_title('power and temperature vs time (normalized)')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# One indicator column per hour of the week: omega1 ... omega168.
numOmegas = 24 * 7
base_columns = ['Time', 'MW', 'temperature', 'CDH']

# Selecting the base columns explicitly makes the cell re-runnable: omega
# columns left over from a previous run are discarded instead of breaking the
# column count.
base_df = df[base_columns].reset_index(drop=True)
base_df['Time'] = pd.to_datetime(base_df['Time'])
num_of_rows = base_df.shape[0]

# 0-based hour-of-week slot: dayofweek 0 at hour 0 -> 0, ..., dayofweek 6 at
# hour 23 -> 167. Slot k corresponds to column omega(k + 1).
hour_of_week = (base_df['Time'].dt.dayofweek * 24 + base_df['Time'].dt.hour).to_numpy()

# Vectorised one-hot encoding: one 1 per row, at that row's slot.
omegas = np.zeros((num_of_rows, numOmegas))
omegas[np.arange(num_of_rows), hour_of_week] = 1

column_names = base_columns + ['omega' + str(i) for i in range(1, numOmegas + 1)]
omega_df = pd.DataFrame(omegas, columns=column_names[len(base_columns):], index=base_df.index)

# pd.concat keeps each column's dtype. Building the frame from
# np.concatenate((df, omegas)) would turn every column, including MW,
# temperature and CDH, into object dtype.
df = pd.concat([base_df, omega_df], axis=1)
df

In [None]:
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import itertools


In [None]:
# Training window (inclusive). Only rows with a positive CDH are used.
start_time = pd.Timestamp('2022-07-04 00:00:00')
end_time = pd.Timestamp('2022-08-28 23:00:00')
has_cooling_load = df['CDH'] > 0
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time) & has_cooling_load
training_df = df[training_mask]

# Feature: CDH as an (n, 1) numeric array. Target: MW as a flat array.
t_train = training_df['Time']
x_train = pd.to_numeric(training_df['CDH'], errors='coerce').values.reshape(-1, 1)
y_train = np.array(training_df['MW'])

# Test window: strictly after the training window, up to testing_end_time.
testing_end_time = pd.Timestamp('2022-10-15 23:00:00')
testing_mask = (df['Time'] > end_time) & (df['Time'] <= testing_end_time) & has_cooling_load
testing_df = df[testing_mask].dropna(subset=['CDH', 'MW'])
x_test = testing_df['CDH']
y_test = testing_df['MW']
t_test = testing_df['Time']

In [None]:
# Ordinary least squares of MW on CDH, using the training arrays.
temperature_model = LinearRegression()
temperature_model.fit(x_train, y_train)

# Slope of the cooling term, the model's score on the training data, and its
# in-sample predictions.
coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)
predictions = temperature_model.predict(x_train)

print("score of the model is ", CDHmodelScore)

# Disabled plot, kept as a bare string literal.
'''
plt.figure(figsize=(10, 5))
plt.scatter(x_train, y_train, color='blue', label='Actual MW')
plt.scatter(x_train, predictions, color='red', label='Predicted MW')
plt.xlabel('x_train')
plt.ylabel('MW')
plt.title('Actual vs Predicted MW (using only CDH values to predict)')
plt.legend()
plt.show()
'''

In [None]:
print('temperature model coefficient is: ', coef_cooling)

# Pearson correlation between MW and CDH on the training rows.
power_array = training_df['MW'].to_numpy(dtype='float64')
temp_array = training_df['CDH'].to_numpy(dtype='float64')
correlation_matrix = np.corrcoef(power_array, temp_array)
correlation_coefficient = correlation_matrix[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)

In [None]:
# Training rows for the behaviour model: the whole window, no CDH filter.
in_window = (df['Time'] >= start_time) & (df['Time'] <= end_time)
training_mask = in_window
training_df = df[training_mask]

# Features: the hour-of-week indicator columns.
x_behavior = training_df.loc[:, 'omega1':'omega168']
y = training_df[['MW']]

# Target: measured MW minus the cooling term predicted from CDH
# (slope only; the temperature model's intercept is not subtracted).
yhat_c = (coef_cooling * training_df['CDH'].values).reshape(-1, 1)
y_behavior = y - (yhat_c)
time = training_df['Time']

## ------ fit the behavior model -----
# No intercept and non-negative coefficients.
behavior_model = LinearRegression(fit_intercept=False, positive=True)
behavior_model.verbose=False
behavior_model.fit(x_behavior,y_behavior)
behavior_modelScore = behavior_model.score(x_behavior.values,y_behavior.values)
print('score of the behavior model is '+str(behavior_modelScore))
yhat_behavior = behavior_model.predict(x_behavior.values)

predicted_data = pd.DataFrame({'Time': time, 'Predicted MW': yhat_behavior.reshape(-1)})

# Fitted values (red) against the target (blue). The blue series is
# y_behavior, i.e. MW with the cooling term removed.
plt.figure(figsize=(10, 5))
ax = plt.gca()
ax.scatter(time, yhat_behavior, color='red', label='Predicted MW')
ax.scatter(time, y_behavior.values, color='blue', label='Actual MW')
ax.set_xlabel('Time')
ax.set_ylabel('MW')
ax.set_title('Actual vs Predicted MW (using behavior model)')
ax.legend()
plt.show()

# Disabled plot, kept as a bare string literal.
'''
plt.figure(figsize=(10, 5))

plt.scatter(time, y_behavior.values, color='blue', label='Actual MW')
plt.scatter(time, yhat_behavior, color='red', label='Predicted MW')

# Set labels and title
plt.xlabel('time')
plt.ylabel('MW')
plt.title('Actual vs Predicted MW (using behavior model)')
plt.legend()
plt.show()
'''


In [None]:
# Display the fitted behaviour-model coefficients (one per omega column).
behavior_model.coef_

In [None]:
# Inclusive bounds of the window on which the full model is evaluated.
full_model_start_time = pd.to_datetime('2022-02-07 00:00:00')
full_model_end_time = pd.to_datetime('2022-12-25 23:00:00')

In [None]:
# constructing the full LR model using time and temperature models
# Coefficient row vector: the cooling slope followed by the behaviour-model
# coefficients, in the same order as the columns 'CDH' ... 'omega168'.
c = np.concatenate((np.array([coef_cooling]).reshape(1,-1),behavior_model.coef_),axis=1)
full_model_mask = (df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)

# The estimator is never fitted: its coefficients are injected by hand and
# the intercept is fixed at zero.
full_model = LinearRegression(fit_intercept=False)
full_model.coef_ = c
full_model.intercept_ = 0

final_df = df[full_model_mask].sort_values('Time')
X = final_df.loc[:,'CDH':'omega168']
y = pd.DataFrame(final_df['MW'])
time = final_df.loc[y.index, 'Time']
yhat = full_model.predict(X.values)

# yhat has one row per row of final_df, so it must be indexed by
# final_df.index. Indexing it by df.index raises as soon as the mask removes
# any row. Rows of df outside the window get NaN in 'yhat' after the concat.
pred = pd.DataFrame(data=yhat, columns=['yhat'], index=final_df.index)
predictions_df = pd.concat([df, pred], axis=1)

full_modelScore = full_model.score(X,y)
print ('score for constructed full model on full data: ', full_modelScore)

# Measured against predicted MW over the full-model window.
fig,(ax1) = plt.subplots(nrows=1,figsize=(10,9))
_=ax1.plot(time,y,label='meas')
_=ax1.plot(time,yhat,label='pred: behavior+heat/cool')
ax1.set_title('measured vs predicted data (full constructed model)')
_=ax1.legend()

In [None]:
# Score the constructed model on the training-window rows only.
X_training_rows = X[training_mask]
y_training_rows = y[training_mask]
modelscoreontrainingdata = full_model.score(X_training_rows, y_training_rows)
print("score of the fully constructed model on training data: ", modelscoreontrainingdata)

In [None]:
# Keep only the rows for which the model predicts exactly 0.
is_zero_prediction = predictions_df['yhat'].eq(0)
predictions_df = predictions_df.loc[is_zero_prediction]
predictions_df

In [None]:
# For every hour-of-week indicator, print how many rows of predictions_df
# fall in that slot; slots with a zero total are skipped.
# The accumulator is deliberately not called `sum`: rebinding that name would
# shadow the builtin for every later cell in the session.
for i in range(1, 169):
    omega_total = predictions_df[f'omega{i}'].sum()
    if omega_total != 0:
        print(f"{i}:{omega_total}")


In [None]:
# Training rows that fall in hour-of-week slot 45.
training_df.loc[training_df['omega45'] > 0]

In [None]:
# Order the training rows chronologically and display them.
training_df = training_df.sort_values(by='Time')
training_df

In [None]:
# Split df by hour of day: hours 0 through 8 inclusive against the rest.
df['Time'] = pd.to_datetime(df['Time'])
hours = df['Time'].dt.hour
is_night = hours.between(0, 8)
df_12am_to_8am = df[is_night]
# An hour is never negative, so "outside 0..8" reduces to "greater than 8".
df_excluding_12am_to_8am = df[hours > 8]


In [None]:
# Score the constructed model on the night-time rows of the training window.
df_12am_to_8am = df_12am_to_8am.dropna()
# training_mask is indexed like df; pandas aligns it to this subset.
df_12am_to_8am = df_12am_to_8am[training_mask]
x = df_12am_to_8am.loc[:, 'CDH':'omega168']
y = df_12am_to_8am[['MW']]
night_score = full_model.score(x, y)
print('model score for data between 12am to 8 am on training data is: ', night_score)

In [None]:
# Display the night-time subset.
df_12am_to_8am

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably a deliberate way to halt
# "Run All" at this point - confirm.
STOP

In [None]:
# Alternative training window (inclusive bounds) for the cells that follow.
start_time = pd.to_datetime('2022-10-03 00:00:00')
end_time = pd.to_datetime('2023-01-01 23:00:00')

In [None]:
# Training rows: inside [start_time, end_time] with a positive CDH.
has_cooling_load = df['CDH'] > 0
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time) & has_cooling_load
training_df = df[training_mask]

# Feature: CDH as an (n, 1) numeric array. Target: MW as a flat array.
t_train = training_df['Time']
x_train = pd.to_numeric(training_df['CDH'], errors='coerce').values.reshape(-1, 1)
y_train = np.array(training_df['MW'])

# Ordinary least squares of MW on CDH.
temperature_model = LinearRegression()
temperature_model.fit(x_train, y_train)

coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)
predictions = temperature_model.predict(x_train)

print("score of the model is ", CDHmodelScore)

print('temperature model coefficient is: ', coef_cooling)

# Pearson correlation between MW and CDH on the training rows.
power_array = training_df['MW'].to_numpy(dtype='float64')
temp_array = training_df['CDH'].to_numpy(dtype='float64')
correlation_coefficient = np.corrcoef(power_array, temp_array)[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)


# Behaviour model on the whole window (no CDH filter).
training_mask = (df['Time'] >= start_time) & (df['Time'] <= end_time)
window_df = df[training_mask]

# Features: the hour-of-week indicator columns.
x_behavior = window_df.loc[:, 'omega1':'omega168']
y = window_df[['MW']]

# Target: measured MW minus the cooling term predicted from CDH (slope only).
yhat_c = (coef_cooling * window_df['CDH'].values).reshape(-1, 1)
y_behavior = y - (yhat_c)
time = window_df['Time']

## ------ fit the behavior model -----
# No intercept and non-negative coefficients.
behavior_model = LinearRegression(fit_intercept=False, positive=True)
behavior_model.verbose=False
behavior_model.fit(x_behavior,y_behavior)
behavior_modelScore = behavior_model.score(x_behavior.values,y_behavior.values)
print('score of the behavior model is '+str(behavior_modelScore))
yhat_behavior = behavior_model.predict(x_behavior.values)

# Inclusive bounds of the window on which the full model is evaluated.
full_model_start_time = pd.Timestamp('2022-02-07 00:00:00')
full_model_end_time = pd.Timestamp('2022-12-25 23:00:00')

# constructing the full LR model using time and temperature models
# Coefficient row vector: the cooling slope followed by the behaviour-model
# coefficients, in the same order as the columns 'CDH' ... 'omega168'.
c = np.concatenate((np.array([coef_cooling]).reshape(1,-1),behavior_model.coef_),axis=1)
full_model_mask = (df['Time'] >= full_model_start_time) & (df['Time'] <= full_model_end_time)

# The estimator is never fitted: its coefficients are injected by hand and
# the intercept is fixed at zero.
full_model = LinearRegression(fit_intercept=False)
full_model.coef_ = c
full_model.intercept_ = 0

final_df = df[full_model_mask].sort_values('Time')
X = final_df.loc[:,'CDH':'omega168']
y = pd.DataFrame(final_df['MW'])
time = final_df.loc[y.index, 'Time']
yhat = full_model.predict(X.values)

# yhat has one row per row of final_df, so it must be indexed by
# final_df.index. Indexing it by df.index raises as soon as the mask removes
# any row. Rows of df outside the window get NaN in 'yhat' after the concat.
pred = pd.DataFrame(data=yhat, columns=['yhat'], index=final_df.index)
predictions_df = pd.concat([df, pred], axis=1)

full_modelScore = full_model.score(X,y)
print ('score for constructed full model: ', full_modelScore)

# Measured against predicted MW over the full-model window.
fig,(ax1) = plt.subplots(nrows=1,figsize=(10,9))
_=ax1.plot(time,y,label='meas')
_=ax1.plot(time,yhat,label='pred: behavior+heat/cool')
ax1.set_title('measured vs predicted data (full constructed model)')
_=ax1.legend()

In [None]:
# Score the constructed model on the training-window rows only.
X_training_rows = X[training_mask]
y_training_rows = y[training_mask]
modelscoreontrainingdata = full_model.score(X_training_rows, y_training_rows)
print("score of the fully constructed model on training data: ", modelscoreontrainingdata)

In [None]:
# NOTE(review): `trials` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably a marker separating the
# experimental cells below from the main analysis - confirm.
trials

In [None]:
# CDH and MW over the full-model window, each as an (n, 1) column vector.
full_model_mask = df['Time'].between(full_model_start_time, full_model_end_time)
full_window_df = df[full_model_mask]
x = full_window_df['CDH'].values.reshape(-1, 1)
y = full_window_df['MW'].values.reshape(-1, 1)


In [None]:
# Fit a regression on the x / y arrays prepared in the previous cell.
temperature_model = LinearRegression()
temperature_model.fit(x, y)

# NOTE(review): the model is fitted on x / y, but the predictions and score
# below use x_train / y_train, which are left over from an earlier cell.
# Confirm whether evaluating on those arrays is intended.
predictions = temperature_model.predict(x_train)
# Overwrites the coef_cooling produced by the earlier temperature model.
coef_cooling = temperature_model.coef_
CDHmodelScore = temperature_model.score(x_train,y_train)

print("coeff of the model is ", temperature_model.coef_)

In [None]:
# Three-hour window on 2022-07-09, restricted to rows with a positive CDH.
new_start_time  = pd.Timestamp('2022-07-09 12:00:00')
new_end_time = pd.Timestamp('2022-07-09 15:00:00')
new_model_mask = (df['Time'] >= new_start_time) & (df['Time'] <= new_end_time) & (df['CDH'] > 0)
x = df[new_model_mask]['CDH'].values.reshape(-1,1)
y = df[new_model_mask]['MW'].values.reshape(-1,1)
# NOTE(review): the next two lines overwrite the x / y just selected from df
# with hard-coded sample values, so the window selection above has no effect
# on later cells. Presumably a debugging leftover - confirm which pair is
# intended.
x = np.array([1.0, 2.5, 3.8]).reshape(-1, 1)
y = np.array([4.0, 4.5, 5.0]).reshape(-1, 1)


In [None]:
# Display the current feature array.
x

In [None]:
# Least-squares fit of y on x, then show the fitted slope.
model = LinearRegression()
model.fit(X=x, y=y)
model.coef_

In [None]:
# x holds CDH and y holds MW, both as (n, 1) column vectors. np.corrcoef
# treats each ROW of a 2-D input as a separate variable, so passing the
# column vectors directly yields n + n one-observation variables and a NaN
# at [0, 1]. Flatten both to 1-D first.
# The names follow the other correlation cells of this notebook:
# power_array is MW, temp_array is CDH.
power_array = np.asarray(y, dtype='float64').ravel()
temp_array = np.asarray(x, dtype='float64').ravel()
correlation_coefficient = np.corrcoef(temp_array, power_array)[0, 1]
print('correlation between CDH and power in training set: ', correlation_coefficient)


In [None]:
# Display temp_array as left by the preceding correlation cell.
temp_array