<a href="https://colab.research.google.com/github/killerbeelsl/weather_temperature_prediction_with_transformer/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [156]:
df = pd.read_csv('output_merged_no_filter_5000.csv')
df = df.sort_values(by=['ID', 'YEAR', 'MONTH'])

In [199]:
df = df[:5000]

In [200]:
# Remove rows where 'VALUE' is -9999
df = df[df['VALUE'] != -9999]

In [212]:
data = df.copy()

In [213]:
data["VALUE"] = data["VALUE"].apply(lambda x: x/100)

In [214]:
data

Unnamed: 0,ID,YEAR,MONTH,VALUE,DMFLAG,QCFLAG,LATITUDE,LONGITUDE,STNELEV,NAME
0,ACW00011604,1961,1,-0.78,,,1.000000,0.553626,0.003992,SAVE
1,ACW00011604,1961,2,2.47,,,1.000000,0.553626,0.003992,SAVE
2,ACW00011604,1961,3,4.83,,,1.000000,0.553626,0.003992,SAVE
3,ACW00011604,1961,4,7.84,,,1.000000,0.553626,0.003992,SAVE
4,ACW00011604,1961,5,11.39,,,1.000000,0.553626,0.003992,SAVE
...,...,...,...,...,...,...,...,...,...,...
4995,ASN00014507,1961,1,28.93,E,,0.364042,0.931293,0.004191,ALYANGULA_POLICE
4996,ASN00014507,1961,2,29.25,E,,0.364042,0.931293,0.004191,ALYANGULA_POLICE
4997,ASN00014507,1961,3,28.71,E,,0.364042,0.931293,0.004191,ALYANGULA_POLICE
4998,ASN00014507,1961,4,27.48,E,,0.364042,0.931293,0.004191,ALYANGULA_POLICE


In [215]:
# 1. Scale numerical features
scaler = MinMaxScaler()
numerical_features = ['LATITUDE', 'LONGITUDE', 'STNELEV']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

unchanged_features = ['YEAR', 'MONTH']


In [223]:
value_scaler = MinMaxScaler()
value_transformed = value_scaler.fit_transform(data['VALUE'].values.reshape(-1, 1)).reshape(-1)

In [224]:
# 2. One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # sparse=False for array output
categorical_features = ['DMFLAG', 'QCFLAG']  # Include 'ID' for station embeddings
encoded_features = encoder.fit_transform(data[categorical_features])

In [225]:
data = data.drop(columns=['VALUE'])

In [226]:
X = np.concatenate([data[unchanged_features].values,encoded_features,
                   data[numerical_features].values], axis=1)
y = value_transformed  # Target variable

In [227]:
y

array([0.2284, 0.2934, 0.3406, ..., 0.8182, 0.7936, 0.7402])

In [228]:
num_features = X[1].shape[0]

In [229]:
num_features

10

In [230]:
# --- Sequence creation ---
sequence_length = 12  # Example sequence length (12 months)

X_sequences = []
y_sequences = []
for i in range(sequence_length, len(X)):
    X_sequences.append(X[i - sequence_length:i])
    y_sequences.append(y[i])

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

In [231]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences, y_sequences, test_size=0.2, random_state=42
)

# Further split training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.1, random_state=42
)

In [232]:
batch_size = 64
num_epochs = 10

In [233]:
# --- Create PyTorch Datasets and DataLoaders ---
train_data = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
val_data = TensorDataset(torch.Tensor(X_val), torch.Tensor(y_val))
test_data = TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))


train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)


In [234]:
# --- Define the Transformer model ---
class TransformerModel(nn.Module):
    def __init__(self, num_features, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(num_features, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer,
 num_layers)
        self.decoder = nn.Linear(d_model, 1)  # Output layer for temperature prediction

    def forward(self, src):
        src = self.embedding(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output[:, -1, :])  # Take the output from the last time step
        return output


In [235]:
# --- Model instantiation and training ---
model = TransformerModel(num_features=X_train.shape[2], d_model=512, nhead=8, num_layers=6, dim_feedforward=2048)

In [236]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [237]:
# --- Validation loop ---
def validate(model, val_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No need to calculate gradients during validation
        for src, target in val_loader:
            output = model(src)
            loss = criterion(output, target)
            total_loss += loss.item()
    return total_loss / len(val_loader)

In [238]:
# --- Evaluation ---
def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_targets = []
    with torch.no_grad():
        for src, target in test_loader:
            output = model(src)
            loss = criterion(output, target)
            total_loss += loss.item()
            all_predictions.extend(output.tolist())  # Store predictions
            all_targets.extend(target.tolist())  # Store true values

    # Calculate evaluation metrics (e.g., RMSE)
    rmse = np.sqrt(np.mean((np.array(all_predictions) - np.array(all_targets))**2))
    return total_loss / len(test_loader), rmse

In [239]:
# --- Training loop with validation ---
for epoch in range(num_epochs):
    # Training loop
  for src, target in train_loader:
      optimizer.zero_grad()
      output = model(src)
      loss = criterion(output, target)
      loss.backward()
      optimizer.step()
  val_loss = validate(model, val_loader, criterion)
  print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Validation Loss: 10.6859
Epoch [2/10], Validation Loss: 0.0247
Epoch [3/10], Validation Loss: 0.0249
Epoch [4/10], Validation Loss: 0.0258
Epoch [5/10], Validation Loss: 0.0275
Epoch [6/10], Validation Loss: 0.0238
Epoch [7/10], Validation Loss: 0.0273
Epoch [8/10], Validation Loss: 0.0275
Epoch [9/10], Validation Loss: 0.0246
Epoch [10/10], Validation Loss: 0.0282


In [240]:
# --- Final evaluation ---
test_loss, rmse = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}, RMSE: {rmse:.4f}')

Test Loss: 0.0262, RMSE: 0.1614


  return F.mse_loss(input, target, reduction=self.reduction)


In [243]:
original_min = value_scaler.data_min_[0]
original_max = value_scaler.data_max_[0]

In [244]:
original_min, original_max

(-12.2, 37.8)

In [245]:
original_rmse = rmse * (original_max - original_min) + original_min

In [246]:
original_rmse

-4.130144317361564