<a href="https://colab.research.google.com/github/jiraiyam/Kaggle-projects-/blob/main/Chiller_energy_EDA_and_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'chiller-energy-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1516081%2F2503617%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240902%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240902T221657Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D73c23ddbd8e5a6f7b060e95722bda4605f5e6c3ca2326bffb623b152e3aad02ee96f7534bc5252afb36f0790a3cd3b2b1cf4252408834a66cfb250be9d9eb321ac0e7480470a2f7660178e9442fe717bfa3e9e7e2c1e9d569abc391c631da302309f859f1b8e3fe15f20844f2da7422d420f3bd239597e9d6f6a3fa5cb71dfc2f0ba0b6048a2133798c5fc1941c0e20a7d554b6dd12fa2c2e34b4d5755128412dd0e19f6878780482be626bb483e525994d4e861e2b3fb5b5797c66c06d409587b8297e439c7342c146021fcdca33253824db3b874d8aa4d46a1dd25ec19c42c924b65c4caac1fbe820d4d1ed2ca9765a61c6d6115d9d66355e51558e1498382'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the unpacking cannot fail on an extra colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be missing; the progress bar then stays
            # empty instead of int(None) raising a TypeError.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_length) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Flush buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would shadow
                # the module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from pylab import rcParams
import warnings
import seaborn as sns
# Notebook-wide Matplotlib defaults.
plt.rcParams.update({
    "figure.figsize": (30, 18),
    'figure.dpi': 300,
    'font.family': 'Arial',
    'font.weight': 'bold',
    'font.size': 20,
})

# Silence all warnings, FutureWarning included.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

import os

# Print every file under the input directory; `Path` ends up holding the last one printed.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        Path = os.path.join(folder, file_name)
        print(Path)

In [None]:
# Load the CSV at `Path` (the last file path printed by the directory walk).
df=pd.read_csv(Path)
# Bare expression as the last line of the cell: displays the frame.
df

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Parse the raw timestamp strings (month/day/year hour:minute) into a new 'Local Time' column.
raw_timestamps = df['Local Time (Timezone : GMT+8h)']
df['Local Time'] = pd.to_datetime(raw_timestamps, format='%m/%d/%Y %H:%M')
df

In [None]:
# Building load against the parsed timestamps.
time_axis = df['Local Time']
load_series = df['Building Load (RT)']

plt.plot(time_axis, load_series, label='Building Load')
plt.title('Building Load Over Time')
plt.xlabel('Time')
plt.ylabel('Building Load (RT)')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Print column dtypes and non-null counts.
df.info()

# Pearson correlation between Building Load and Chiller Energy Consumption
Importance: Measures the linear relationship between two continuous variables. It helps in understanding the strength and direction of the relationship.


In [None]:
from scipy.stats import pearsonr

# Linear association between building load and chiller energy use.
load = df['Building Load (RT)']
energy = df['Chiller Energy Consumption (kWh)']
corr, p_value = pearsonr(load, energy)
print(f'Pearson Correlation: {corr:.2f}, p-value: {p_value:.4f}')


In [None]:
# Scatter of the two variables used in the Pearson test.
x_col = 'Building Load (RT)'
y_col = 'Chiller Energy Consumption (kWh)'

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x=x_col, y=y_col)
plt.title('Pearson Correlation between Building Load and Chiller Energy Consumption')
plt.xlabel('Building Load (RT)')
plt.ylabel('Chiller Energy Consumption (kWh)')
plt.grid(True)
plt.show()


# Spearman rank correlation between Building Load and Outside Temperature
Importance: Measures the monotonic relationship between two variables. Useful when the relationship is not linear but still has a consistent order.


In [None]:
from scipy.stats import spearmanr

# Rank (monotonic) association between building load and outside temperature.
load = df['Building Load (RT)']
outside_temp = df['Outside Temperature (F)']
corr, p_value = spearmanr(load, outside_temp)
print(f'Spearman Rank Correlation: {corr:.2f}, p-value: {p_value:.4f}')


In [None]:
# Scatter of the pair tested by the Spearman correlation: building load vs. outside temperature.
# This cell previously repeated the Pearson plot (chiller energy on the y-axis, Pearson title).
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Building Load (RT)', y='Outside Temperature (F)', data=df)
plt.xlabel('Building Load (RT)')
plt.ylabel('Outside Temperature (F)')
plt.title('Spearman Rank Correlation between Building Load and Outside Temperature')
plt.grid(True)
plt.show()



**Linear Regression**

Importance: Helps in understanding the relationship between an independent variable (predictor) and a dependent variable (outcome). It can also be used to make predictions.




In [None]:
import statsmodels.api as sm

# Simple OLS: building load explained by outside temperature plus an intercept.
y = df['Building Load (RT)']  # response
X = sm.add_constant(df[['Outside Temperature (F)']])  # predictor with intercept column

ols_spec = sm.OLS(y, X)
model = ols_spec.fit()
print(model.summary())


In [None]:
# Observed points with the fitted OLS line drawn on top.
temperature = df['Outside Temperature (F)']
fitted_load = model.predict(X)

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Outside Temperature (F)', y='Building Load (RT)', label='Data')
sns.lineplot(x=temperature, y=fitted_load, color='red', label='Regression Line')
plt.title('Linear Regression of Building Load on Outside Temperature')
plt.xlabel('Outside Temperature (F)')
plt.ylabel('Building Load (RT)')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
from scipy.stats import chi2_contingency

# Bin both continuous variables into five equal-width intervals and cross-tabulate them.
N_BINS = 5
temperature_bins = pd.cut(df['Outside Temperature (F)'], bins=N_BINS)
humidity_bins = pd.cut(df['Humidity (%)'], bins=N_BINS)
contingency_table = pd.crosstab(temperature_bins, humidity_bins)

# Chi-square test of independence on the binned counts.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f'Chi-Square: {chi2:.2f}, p-value: {p_value:.4f}')


In [None]:
# Annotated heatmap of the temperature-by-humidity bin counts.
plt.figure(figsize=(10, 6))
sns.heatmap(contingency_table, cmap='YlGnBu', annot=True, fmt='d')
plt.xlabel('Humidity Bins')
plt.ylabel('Temperature Bins')
plt.title('Chi-Square Test Contingency Table')
plt.show()


In [None]:
# Convert the raw timestamp column itself to datetime, overwriting the strings in place.
df['Local Time (Timezone : GMT+8h)'] = pd.to_datetime(df['Local Time (Timezone : GMT+8h)'], format='%m/%d/%Y %H:%M')

In [None]:
# Print column dtypes and non-null counts again, after the datetime conversion.
df.info()

**Feature Engineering**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Calendar features taken from the datetime-typed timestamp column.
timestamp_parts = df['Local Time (Timezone : GMT+8h)'].dt
df['Year'] = timestamp_parts.year
df['Month'] = timestamp_parts.month
df['Day'] = timestamp_parts.day
df['Hour'] = timestamp_parts.hour
df['DayOfWeek'] = timestamp_parts.dayofweek

# Rescale every measurement column to [0, 1], overwriting the original values.
# NOTE(review): the scaler is fit on the full frame (target included) before any
# train/test split, so later hold-out scores are computed on data the scaler has seen.
numerical_cols = [
    'Chilled Water Rate (L/sec)',
    'Cooling Water Temperature (C)',
    'Building Load (RT)',
    'Chiller Energy Consumption (kWh)',
    'Outside Temperature (F)',
    'Dew Point (F)',
    'Humidity (%)',
    'Wind Speed (mph)',
    'Pressure (in)',
]
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Make the parsed timestamp column the index; in place, so later cells see the re-indexed frame.
df.set_index('Local Time (Timezone : GMT+8h)', inplace=True)
# Additive decomposition with a seasonal period of 24 samples.
# NOTE(review): 24 samples is a daily cycle only if rows are hourly — confirm the sampling interval.
result = seasonal_decompose(df['Building Load (RT)'], model='additive', period=24)

# Plot the decomposition result.
result.plot()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Predictors and target for the building-load models.
feature_columns = [
    'Chilled Water Rate (L/sec)',
    'Cooling Water Temperature (C)',
    'Outside Temperature (F)',
    'Dew Point (F)',
    'Humidity (%)',
    'Wind Speed (mph)',
    'Pressure (in)',
]
X = df[feature_columns]
y = df['Building Load (RT)']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import statsmodels.api as sm

# Fit a multi-feature OLS on the training rows and plot residuals against fitted values.
X = sm.add_constant(X_train)
model = sm.OLS(y_train, X).fit()
fitted_values = model.predict(X)
residuals = y_train - fitted_values

plt.figure(figsize=(10, 6))
plt.scatter(fitted_values, residuals)
plt.axhline(0, color='red', linestyle='--')  # zero-residual reference line
plt.title('Residual Plot')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()


In [None]:
from pandas.plotting import lag_plot
# Lag plot of the building-load series (lag_plot's default lag is used).
lag_plot(df['Building Load (RT)'])
plt.title('Lag Plot of Building Load')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Rank the predictors of building load by random-forest feature importance.
X = df[['Chilled Water Rate (L/sec)', 'Cooling Water Temperature (C)', 'Outside Temperature (F)', 'Dew Point (F)', 'Humidity (%)', 'Wind Speed (mph)', 'Pressure (in)']]
y = df['Building Load (RT)']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well as the split: unseeded, the ranking (and the top
# features selected from it further down) can change from run to run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

importances = model.feature_importances_

# feature_importances_ has one score per fitted column, so the table is built
# directly; the former length check could never fail, and its else-branch would
# have left importance_df undefined for the cells that use it.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)
print(importance_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bars, one per feature, in the order stored in importance_df.
plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances from RandomForestRegressor')
plt.show()


In [None]:
# Display the current state of the frame.
df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Keep only the three highest-ranked features from the importance table.
top_features = importance_df['Feature'].head(3).tolist()
X_top = X[top_features]

# Same split parameters as before, applied to the reduced feature set.
X_train_top, X_test_top, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, random_state=42
)

# Fit gradient boosting on the reduced set and score it on the hold-out rows.
model_top = GradientBoostingRegressor().fit(X_train_top, y_train)
y_pred = model_top.predict(X_test_top)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error with Top Features: {mse}')


In [None]:
# Switch the index to the 'Local Time' column in place; the index that was set before is replaced.
df.set_index('Local Time', inplace=True)

In [None]:
# Sort rows by the index in place.
df.sort_index(inplace=True)


In [None]:
# Building load plotted against the frame's index.
axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(14, 7))
plt.plot(df.index, df['Building Load (RT)'], color='blue', label='Building Load (RT)')
plt.title('Building Load Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Date', rotation=45, **axis_label_font)  # axis label drawn at 45 degrees
plt.ylabel('Building Load (RT)', rotation=90, **axis_label_font)  # axis label drawn vertically
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Chiller energy consumption plotted against the frame's index.
axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}
energy_col = 'Chiller Energy Consumption (kWh)'

plt.figure(figsize=(14, 7))
plt.plot(df.index, df[energy_col], color='red', label='Chiller Energy Consumption (kWh)')
plt.title('Chiller Energy Consumption Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Date', rotation=45, **axis_label_font)  # axis label drawn at 45 degrees
plt.ylabel('Chiller Energy Consumption (kWh)', rotation=90, **axis_label_font)  # axis label drawn vertically
plt.xticks(rotation=45)  # tilt the tick labels
plt.grid(True)
plt.legend()
plt.show()


In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation (left) and partial autocorrelation (right) of building load, 50 lags each.
load_no_nan = df['Building Load (RT)'].dropna()
axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(14, 7))

acf_axes = plt.subplot(121)
plot_acf(load_no_nan, lags=50, ax=acf_axes)
plt.title('ACF of Building Load (RT)', fontsize=16, fontweight='bold')
plt.xlabel('Lags', rotation=45, **axis_label_font)
plt.ylabel('ACF', rotation=90, **axis_label_font)

pacf_axes = plt.subplot(122)
plot_pacf(load_no_nan, lags=50, ax=pacf_axes)
plt.title('PACF of Building Load (RT)', fontsize=16, fontweight='bold')
plt.xlabel('Lags', rotation=45, **axis_label_font)
plt.ylabel('PACF', rotation=90, **axis_label_font)

plt.tight_layout()
plt.show()


In [None]:
# Rolling mean and standard deviation of building load over a 24-row window.
load_series = df['Building Load (RT)']
load_window = load_series.rolling(window=24)
rolling_mean = load_window.mean()
rolling_std = load_window.std()

axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(14, 7))
plt.plot(df.index, load_series, color='blue', label='Building Load (RT)')
plt.plot(df.index, rolling_mean, color='orange', label='Rolling Mean (24 hours)')
plt.plot(df.index, rolling_std, color='green', label='Rolling Std Dev (24 hours)')
plt.title('Rolling Mean and Standard Deviation of Building Load (RT)', fontsize=16, fontweight='bold')
plt.xlabel('Date', rotation=45, **axis_label_font)
plt.ylabel('Building Load (RT)', rotation=90, **axis_label_font)
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Mean building load for every (day-of-month, hour) pair.
heatmap_data = df.pivot_table(
    values='Building Load (RT)', index='Day', columns='Hour', aggfunc='mean'
)

axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

sns.heatmap(heatmap_data, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Heatmap of Building Load (RT) by Hour and Day', fontsize=16, fontweight='bold')
plt.xlabel('Hour', rotation=45, **axis_label_font)
plt.ylabel('Day', rotation=90, **axis_label_font)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Pairwise correlations between the measurement columns.
measurement_columns = [
    'Chilled Water Rate (L/sec)',
    'Cooling Water Temperature (C)',
    'Building Load (RT)',
    'Chiller Energy Consumption (kWh)',
    'Outside Temperature (F)',
    'Dew Point (F)',
    'Humidity (%)',
    'Wind Speed (mph)',
    'Pressure (in)',
]
correlation_matrix = df[measurement_columns].corr()

axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap of Features', fontsize=16, fontweight='bold')
plt.xlabel('Feature', rotation=45, **axis_label_font)
plt.ylabel('Feature', rotation=90, **axis_label_font)
plt.xticks(rotation=45)  # tilt the tick labels
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Single source of truth for the cluster count (used by the model and the plot loop).
N_CLUSTERS = 3

# Prepare data for clustering: the building-load column alone, standardised.
X = df[['Building Load (RT)']].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit KMeans
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to DataFrame
df['Cluster'] = clusters

# Plot clusters as points. A line plot per cluster would join observations that
# are not adjacent in time, drawing segments across the gaps where the series
# belongs to another cluster.
plt.figure(figsize=(14, 7))
for cluster in range(N_CLUSTERS):
    cluster_data = df[df['Cluster'] == cluster]
    plt.scatter(cluster_data.index, cluster_data['Building Load (RT)'], s=4, label=f'Cluster {cluster}')

plt.xlabel('Date', fontsize=14, fontweight='bold', rotation=45)  # Rotate x-axis label
plt.ylabel('Building Load (RT)', fontsize=14, fontweight='bold', rotation=90)  # Rotate y-axis label
plt.title('Time Series Clusters of Building Load (RT)', fontsize=16, fontweight='bold')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)  # Rotate x-axis ticks
plt.show()


In [None]:
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore

# Flag roughly 1% of the standardised load values as anomalies (label -1).
iso_forest = IsolationForest(contamination=0.01, random_state=0)
outliers = iso_forest.fit_predict(X_scaled)
anomaly_mask = outliers == -1

load_series = df['Building Load (RT)']
axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(14, 7))
plt.plot(df.index, load_series, color='blue', label='Building Load (RT)')
plt.scatter(df.index[anomaly_mask], load_series[anomaly_mask], color='red', label='Anomalies')
plt.title('Anomaly Detection in Building Load (RT)', fontsize=16, fontweight='bold')
plt.xlabel('Date', rotation=45, **axis_label_font)
plt.ylabel('Building Load (RT)', rotation=90, **axis_label_font)
plt.xticks(rotation=45)  # tilt the tick labels
plt.grid(True)
plt.legend()
plt.show()


In [None]:
from statsmodels.tsa.stattools import ccf

# Cross-correlation of building load with chilled water rate, one value per lag.
cross_corr = ccf(df['Building Load (RT)'], df['Chilled Water Rate (L/sec)'])

plt.figure(figsize=(14, 7))
# `use_line_collection` is not passed: newer Matplotlib releases removed the
# keyword, so supplying it raises a TypeError.
plt.stem(range(len(cross_corr)), cross_corr)
plt.xlabel('Lag', fontsize=14, fontweight='bold')
plt.ylabel('Cross-Correlation', fontsize=14, fontweight='bold')
plt.title('Cross-Correlation between Building Load (RT) and Chilled Water Rate (L/sec)', fontsize=16, fontweight='bold')
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the building-load column into a 2-D array for the LSTM.
# NOTE(review): this column was already min-max scaled in the feature-engineering cell, and the
# scaler here is fit on the whole series before the train/test split — confirm both are intended.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[['Building Load (RT)']])

def create_sequences(data, seq_length):
    """Build sliding-window samples for next-step prediction.

    Each sample is `seq_length` consecutive items of `data`; its target is the
    item immediately after the window.

    Returns:
        A pair of NumPy arrays `(windows, targets)`. Both are empty when
        `data` has no more than `seq_length` items.
    """
    n_samples = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_samples)]
    targets = [data[start + seq_length] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

# Window length: each sample holds 24 consecutive values and predicts the next one.
SEQ_LENGTH = 24
X, y = create_sequences(scaled_data, SEQ_LENGTH)

# Ordered 80/20 split: the first 80% of windows train, the rest test.
split = int(0.8 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# One LSTM layer followed by a single-unit dense output.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(SEQ_LENGTH, 1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Undo the min-max scaling on both predictions and targets.
predictions = scaler.inverse_transform(model.predict(X_test))
y_test = scaler.inverse_transform(y_test.reshape(-1, 1))

# Index entries matching the test targets: skip the first window plus the training part.
forecast_index = df.index[SEQ_LENGTH + split:]
axis_label_font = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(14, 7))
plt.plot(forecast_index, y_test, color='blue', label='Actual')
plt.plot(forecast_index, predictions, color='red', label='Predicted')
plt.title('LSTM Forecast vs Actual', fontsize=16, fontweight='bold')
plt.xlabel('Date', rotation=45, **axis_label_font)
plt.ylabel('Building Load (RT)', rotation=90, **axis_label_font)
plt.xticks(rotation=45)  # tilt the tick labels
plt.grid(True)
plt.legend()
plt.show()
