### Preprocess Climate Data

In [33]:
import pandas as pd

# Load the raw climate table and parse its 'date' column.
climate_df = pd.read_csv('data/climate_data.csv')
climate_df['date'] = pd.to_datetime(climate_df['date'])
# Index by date while also keeping 'date' available as a regular column.
climate_df = climate_df.set_index('date', drop=False)

# Columns removed before any further processing.
hypothetical_redundant_cols = ["sunrise", "sunset"]
climate_df = climate_df.drop(hypothetical_redundant_cols, axis=1)

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Columns to min-max scale: every float64/int64 column except the grid
# coordinates, which are kept unscaled because they are used as group keys later.
numeric_cols = climate_df.select_dtypes(include=['float64', 'int64']).columns.to_list()
numeric_cols.remove('lat')
numeric_cols.remove('lon')
# Everything that is passed through untouched (non-numeric columns + lat/lon).
non_numeric_cols = climate_df.select_dtypes(exclude=['float64', 'int64']).columns.to_list() + ['lat', 'lon']

# Cast to float so integer columns can hold the scaled values.
# (The former self-assignment of `non_numeric_cols` was a no-op and has been removed.)
climate_df[numeric_cols] = climate_df[numeric_cols].astype(float)

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(climate_df[numeric_cols])
scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=climate_df.index)

# Re-attach the pass-through columns and replace the date index with a plain RangeIndex.
climate_data = pd.concat([scaled_df, climate_df[non_numeric_cols]], axis=1).reset_index(drop=True)

### Preprocess Energy Data

In [37]:
# Load the raw energy-consumption table and parse its 'Date' column.
energy_df = pd.read_csv('data/energy_consumption_data.csv')
energy_df['Date'] = pd.to_datetime(energy_df['Date'])
# Index by date while also keeping 'Date' available as a regular column.
energy_df = energy_df.set_index('Date', drop=False)

# Columns removed before any further processing.
redundant_cols = ["Category", "Variable", "Unit", "Value"]
energy_df = energy_df.drop(redundant_cols, axis=1)

In [38]:
# Split the columns into those that get min-max scaled and those passed through.
numeric_cols = energy_df.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = energy_df.select_dtypes(exclude=['float64', 'int64']).columns
# Cast to float so integer columns can hold the scaled values.
energy_df[numeric_cols] = energy_df[numeric_cols].astype(float)

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(energy_df[numeric_cols])
scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=energy_df.index)

# Scaled columns first, pass-through columns after; the date index is discarded.
passthrough = energy_df[non_numeric_cols]
energy_data = pd.concat([scaled_df, passthrough], axis=1).reset_index(drop=True)

### State-Lat-Lon Map

In [46]:
# Lookup from state name to a (lat, lon) tuple. The first element is written to
# energy_data['lat'] and the second to energy_data['lon'] further down.
# Several states share the same tuple (e.g. 'Alabama' and 'Arkansas').
# NOTE(review): most values are multiples of 10, so these are presumably the
# cells of the climate dataset's coarse lat/lon grid -- confirm. The latitudes
# for 'Puerto Rico' (18) and 'Washington, D.C.' (38) do not follow that
# pattern; verify they match rows that actually exist in the climate data.
state_to_grid = {
    'Alabama': (30, -90),
    'Alaska': (50, -120),
    'Arizona': (30, -110),
    'Arkansas': (30, -90),
    'California': (30, -120),
    'Colorado': (40, -110),
    'Connecticut': (40, -70),
    'Delaware': (40, -80),
    'Florida': (30, -80),
    'Georgia': (30, -80),
    'Hawaii': (30, -120),
    'Idaho': (40, -120),
    'Illinois': (40, -90),
    'Indiana': (40, -90),
    'Iowa': (40, -100),
    'Kansas': (40, -100),
    'Kentucky': (40, -90),
    'Louisiana': (30, -90),
    'Maine': (40, -70),
    'Maryland': (40, -80),
    'Massachusetts': (40, -70),
    'Michigan': (40, -80),
    'Minnesota': (50, -100),
    'Mississippi': (30, -90),
    'Missouri': (40, -90),
    'Montana': (50, -110),
    'Nebraska': (40, -100),
    'Nevada': (40, -120),
    'New Hampshire': (40, -70),
    'New Jersey': (40, -80),
    'New Mexico': (30, -110),
    'New York': (40, -80),
    'North Carolina': (40, -80),
    'North Dakota': (50, -100),
    'Ohio': (40, -80),
    'Oklahoma': (30, -100),
    'Oregon': (40, -120),
    'Pennsylvania': (40, -80),
    'Rhode Island': (40, -70),
    'South Carolina': (30, -80),
    'South Dakota': (50, -100),
    'Tennessee': (30, -90),
    'Texas': (30, -100),
    'Utah': (40, -110),
    'Vermont': (40, -70),
    'Virginia': (40, -80),
    'Washington': (50, -120),
    'West Virginia': (40, -80),
    'Wisconsin': (40, -90),
    'Wyoming': (40, -110),
    'Puerto Rico': (18, -70),
    'Washington, D.C.': (38, -80),
}

In [None]:
# Drop the national aggregate row. `.copy()` makes the filtered frame an
# independent object, so the column assignments below do not trigger
# SettingWithCopyWarning or write into a view of the unfiltered frame.
energy_data = energy_data[energy_data['State'] != 'US Total'].copy()

# Fail early, naming the offending states, if any is missing from the lookup.
unknown_states = sorted(set(energy_data['State']) - set(state_to_grid))
if unknown_states:
    raise KeyError(f"States missing from state_to_grid: {unknown_states}")

# Attach the (lat, lon) tuple of each state as two separate columns.
energy_data['lat'] = energy_data['State'].map(lambda state: state_to_grid[state][0])
energy_data['lon'] = energy_data['State'].map(lambda state: state_to_grid[state][1])

# Ensure both date columns are datetime64 (a no-op when they already are).
energy_data['Date'] = pd.to_datetime(energy_data['Date'])
climate_data['date'] = pd.to_datetime(climate_data['date'])

In [57]:
energy_data

Unnamed: 0,Consumption,State,Date,lat,lon
0,0.026894,Alabama,2020-01-01,30,-90
1,0.024904,Alabama,2020-02-01,30,-90
2,0.024720,Alabama,2020-03-01,30,-90
3,0.021396,Alabama,2020-04-01,30,-90
4,0.025344,Alabama,2020-05-01,30,-90
...,...,...,...,...,...
1903,0.010041,Wyoming,2022-08-01,40,-110
1904,0.009388,Wyoming,2022-09-01,40,-110
1905,0.009670,Wyoming,2022-10-01,40,-110
1906,0.009286,Wyoming,2022-11-01,40,-110


## Pre-train on Climate Data

### Time-Series Sequences

In [None]:
import numpy as np

def lagged_climate_input(data, seq_length=28):
    """Build sliding windows of climate features for every (lat, lon) group.

    Rows are grouped by ``lat``/``lon`` and ordered by ``date``; the
    ``date``, ``lat`` and ``lon`` columns are excluded from the features.

    Args:
        data: DataFrame with ``date``, ``lat`` and ``lon`` columns plus the
            feature columns.
        seq_length: Number of consecutive rows in each window.

    Returns:
        Array of shape ``(n_windows, seq_length, n_features)``. A group with
        ``n`` rows contributes ``n - seq_length + 1`` windows; groups shorter
        than ``seq_length`` contribute none. When no window can be built the
        result is an empty array that still has three dimensions.
    """
    feature_cols = [col for col in data.columns if col not in ("date", "lat", "lon")]
    windows = []
    for _, group in data.groupby(["lat", "lon"]):
        values = group.sort_values("date")[feature_cols].values
        # "+ 1" keeps the final complete window: these windows are only
        # reconstructed, so no look-ahead row has to be reserved.
        for start in range(len(values) - seq_length + 1):
            windows.append(values[start:start + seq_length])
    if not windows:
        # Keep the rank so callers can still read `.shape[2]`.
        return np.empty((0, seq_length, len(feature_cols)))
    return np.array(windows)

climate_X = lagged_climate_input(climate_data, 14)

### Model

In [None]:
import torch
import torch.nn as nn

class ClimatePreTrainer(nn.Module):
    """Bidirectional LSTM autoencoder that reconstructs its input sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden width of each LSTM direction.
        num_layers: Number of stacked layers in the encoder and in the decoder.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=1):
        super().__init__()
        # Both LSTMs are bidirectional, so each emits 2 * hidden_size features per step.
        bi_width = hidden_size * 2
        lstm_options = dict(num_layers=num_layers, batch_first=True, bidirectional=True)
        self.encoder = nn.LSTM(input_size, hidden_size, **lstm_options)
        self.decoder = nn.LSTM(bi_width, hidden_size, **lstm_options)
        # Project the decoder output back to the original feature width.
        self.fc = nn.Linear(bi_width, input_size)

    def forward(self, x):
        """Return the reconstruction of ``x``, one output vector per input time step."""
        hidden_seq = self.encoder(x)[0]
        hidden_seq = self.decoder(hidden_seq)[0]
        return self.fc(hidden_seq)
                

In [None]:
import pickle
from torch.utils.data import DataLoader, TensorDataset

# Hyper-parameters of the reconstruction pre-training run.
num_epochs = 1
batch_size = 32
lr = 0.001

# Wrap the windowed climate array in a shuffled mini-batch loader.
climate_X_tensor = torch.tensor(climate_X, dtype=torch.float32)
train_dataset = TensorDataset(climate_X_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
climate_pretrained_model = ClimatePreTrainer(input_size=climate_X.shape[2]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(climate_pretrained_model.parameters(), lr=lr)

climate_pretrained_model.train()
for epoch in range(num_epochs):
    # The dataset holds a single tensor, so every batch is a one-element sequence.
    for (batch,) in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        outputs = climate_pretrained_model(batch)
        # Autoencoder objective: the input is its own target.
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
    # The value printed is the loss of the last mini-batch of the epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Persist the whole model object (not just its state_dict).
with open('climate_pretrained_climate_pretrained_model.pkl', 'wb') as f:
    pickle.dump(climate_pretrained_model, f)


## Fine-tune on the Downstream Task

### Merge Data Sources

In [None]:
# Collapse every date to the first day of its calendar month (year + month).
# The previous code stored the integer month number and then called
# `.dt.to_timestamp()` on it, which raises AttributeError because `.dt` is
# only available on datetime-like values.
climate_df["month"] = climate_df["date"].dt.to_period("M").dt.to_timestamp()
climate_df = climate_df.drop(columns=["date"])

# One row per (lat, lon, month) holding the mean of every remaining column.
# NOTE(review): `.mean()` raises on non-numeric columns in pandas >= 2.0 --
# confirm none are left in climate_df at this point.
# NOTE(review): climate_df holds the unscaled values, whereas the encoder was
# pre-trained on the min-max scaled `climate_data` -- confirm this is intended.
climate_monthly = climate_df.groupby(["lat", "lon", "month"]).mean().reset_index()


In [None]:
# Use `energy_data` rather than `energy_df`: the 'lat'/'lon' merge keys were
# added to `energy_data` only, so merging `energy_df` raised KeyError.
# 'month' is the first day of the calendar month (year + month), so readings
# from different years are not collapsed onto the same key.
energy_monthly = energy_data.assign(
    month=energy_data["Date"].dt.to_period("M").dt.to_timestamp()
).drop(columns=["Date"])

merged_df = pd.merge(
    energy_monthly,
    climate_monthly,
    how="left",
    on=["lat", "lon", "month"],
    suffixes=("", "_climate"),
    # Each energy row must match at most one climate row.
    validate="many_to_one",
)
# 'month' is intentionally kept: lagged_merged_data() sorts each group by it.
# NOTE(review): merged_df still contains the non-numeric 'State' column, and
# several states share one (lat, lon) cell, so grouping by lat/lon interleaves
# their rows -- decide how states should be separated before building tensors.

In [None]:
def lagged_merged_data(data, seq_length=28):
    """Build (window, next-row) training pairs for every (lat, lon) group.

    Rows are grouped by ``lat``/``lon`` and ordered by ``month``. The
    ``lat``, ``lon`` and ``month`` columns are excluded from the features;
    ``month`` only serves as the sort key, mirroring how
    ``lagged_climate_input`` treats ``date``.

    Args:
        data: DataFrame with ``lat``, ``lon`` and ``month`` columns plus the
            feature columns.
        seq_length: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_windows, seq_length, n_features)`` and ``y`` has shape
        ``(n_windows, n_features)``; ``y[i]`` is the row that immediately
        follows window ``X[i]``. Both arrays keep their rank when empty.
    """
    feature_cols = [col for col in data.columns if col not in ("lat", "lon", "month")]
    X = []
    y = []
    for _, group in data.groupby(["lat", "lon"]):
        values = group.sort_values("month")[feature_cols].values
        # No "+ 1" here: every window needs a following row to act as target.
        for start in range(len(values) - seq_length):
            X.append(values[start:start + seq_length])
            y.append(values[start + seq_length])
    if not X:
        n_features = len(feature_cols)
        return np.empty((0, seq_length, n_features)), np.empty((0, n_features))
    return np.array(X), np.array(y)

merged_X, targt_y = lagged_merged_data(merged_df, 14)

In [None]:
class EnergyPrediction(nn.Module):
    """Regression head on top of a pre-trained sequence encoder.

    Args:
        encoder: Pre-trained model whose ``.encoder`` attribute (an LSTM) is
            reused. The module is shared, not copied, so training this model
            also updates the pre-trained encoder's weights.
        input_size: Accepted for interface compatibility; not used, because
            the input width is fixed by the reused encoder.
        hidden_size: Width of the hidden layer of the regression head.
        num_layers: Accepted for interface compatibility; not used. It was
            previously (mis)used as the output width of the head.
    """

    def __init__(self, encoder, input_size, hidden_size=64, num_layers=1):
        super().__init__()
        self.encoder = encoder.encoder
        # Size the head from the encoder itself, so a pre-trained encoder with
        # a different hidden width still fits. Falls back to the previous
        # assumption (bidirectional, `hidden_size` units) if the attributes
        # are missing.
        directions = 2 if getattr(self.encoder, "bidirectional", True) else 1
        encoder_width = getattr(self.encoder, "hidden_size", hidden_size) * directions
        self.regressor = nn.Sequential(
            nn.Linear(encoder_width, hidden_size),
            nn.ReLU(),
            # Single regression output per sequence.
            nn.Linear(hidden_size, 1)
        )

    def forward(self, x):
        """Encode ``x`` and regress from the encoder output at the last time step.

        Returns a tensor of shape ``(batch, 1)``.
        """
        encoded_x, _ = self.encoder(x)
        x = self.regressor(encoded_x[:, -1, :])
        return x


In [None]:
# Hyper-parameters of the fine-tuning run.
num_epochs = 1
batch_size = 32
lr = 0.001

train_dataset = TensorDataset(torch.tensor(merged_X, dtype=torch.float32), torch.tensor(targt_y, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

energy_model = EnergyPrediction(climate_pretrained_model, input_size=merged_X.shape[2]).to(device)

# Phase 1: train only the regression head. The following cell sets
# `requires_grad = True` on the encoder, which only has an effect if the
# encoder is frozen here first; without this both phases were identical.
# The encoder module is shared with `climate_pretrained_model`, so the flag
# changes there too.
for param in energy_model.encoder.parameters():
    param.requires_grad = False

criterion = nn.MSELoss()
# Built over *all* parameters so the same optimizer keeps working once the
# encoder is made trainable again; parameters without gradients are skipped.
optimizer = torch.optim.Adam(energy_model.parameters(), lr=lr)
energy_model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, targets = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        outputs = energy_model(inputs)
        # NOTE(review): `targets` holds one value per feature column while the
        # model head emits one value per sample -- confirm which column is the
        # prediction target and select it before computing the loss.
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    # The value printed is the loss of the last mini-batch of the epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

In [None]:
# Mark every encoder parameter as trainable for this second pass.
energy_model.encoder.requires_grad_(True)

# Second pass over the data, reusing the existing optimizer, criterion and loader.
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = energy_model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    # The value printed is the loss of the last mini-batch of the epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")