<a href="https://colab.research.google.com/github/lingxiaoW/Ph.D._Advising_LX/blob/main/Hoang/Code/Time-series%20Prediction/createcustomdataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Make Google Drive available inside the Colab runtime at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Define the dataset**

In [18]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader, random_split

#Define the dataset
class TimeSeriesDataset(Dataset):
  """
  Sliding-window dataset built from one or more CSV time-series files.

  Each CSV is read with pandas, the selected feature column is z-score
  normalized independently per file, and every run of `window_size`
  consecutive values becomes one input sequence whose target is the value
  that immediately follows it.

  Attributes:
  - file_paths (list): Paths passed to the constructor.
  - window_size (int): Number of timesteps in each input sequence.
  - feature_col (str): Name of the CSV column used as the time series.
  - data (list): (input_sequence, target_value) tuples from all usable files.
  - mean, std: Normalization statistics of the LAST file that was
    successfully normalized (None if no file was usable). Because every
    file is normalized with its own statistics, these values can only
    de-normalize samples that came from that last file.
  """

  def __init__(self, file_paths, window_size, feature_col="temperature"):
    self.file_paths = file_paths
    self.window_size = window_size
    self.feature_col = feature_col
    self.data, self.mean, self.std = self._load_data_from_files(file_paths, window_size, feature_col)

  def _load_data_from_files(self, file_paths, window_size, feature_col='temperature'):
    """
    Load time series data from CSV files, normalize it, and create backward sequences.

    Files that cannot be read, lack `feature_col`, or cannot be normalized
    (zero or non-finite standard deviation) are reported with a printed
    message and skipped; they contribute no samples and do not affect the
    returned statistics.

    Parameters:
    - file_paths (list): List of paths to CSV files. Each file must contain a
      'date' column (it is parsed by pandas) and the `feature_col` column.
    - window_size (int): Number of timesteps in each input sequence.
    - feature_col (str): Name of the column in the CSV to use as the time series feature.

    Returns:
    - data (list): List of tuples containing (input_sequence, target_value),
      where input_sequence holds the `window_size` values preceding the target.
    - mean (float): Mean used for normalization of the last usable file (None if none).
    - std (float): Standard deviation used for normalization of the last usable file (None if none).

    Raises:
    - ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
      raise ValueError(f"window_size must be a positive integer, got {window_size}.")

    data = []
    mean = None
    std = None

    for file_path in file_paths:
      # Load CSV file
      try:
        df = pd.read_csv(file_path, parse_dates=['date'])
      except FileNotFoundError:
        print(f"File not found: {file_path}")
        continue # Skip to the next file if not found
      except Exception as e:
        # Anything else (missing 'date' column, malformed CSV, ...) is a
        # different problem than a missing file, so report the real cause.
        print(f"Failed to load {file_path}: {e}")
        continue # Skip to the next file if it cannot be read
      print('Data loaded successfully!')
      print(df.head())
      print('Columns in dataset:', df.columns.tolist())

      # Extract the time series data
      if feature_col not in df.columns:
        print(f"Feature column '{feature_col}' not found in the dataset in file {file_path}.")
        continue # Skip to the next file if feature column is not found
      time_series = df[feature_col].values.astype(np.float32)

      # Normalize the data. Statistics are kept in per-file locals so that a
      # skipped file never overwrites the values returned to the caller.
      file_mean = np.mean(time_series)
      file_std = np.std(time_series)
      if not np.isfinite(file_std):
        # Happens for an empty column or one containing NaN/inf; normalizing
        # would turn every sample of this file into NaN.
        print(f'Standard deviation is not finite (empty column or missing values). Cannot normalize the data for file {file_path}.')
        continue # Skip to the next file if std is not finite
      if file_std == 0:
        print(f'Standard deviation is zero. Cannot normalize the data for file {file_path}.')
        continue # Skip to the next file if std is zero
      time_series = (time_series - file_mean) / file_std
      mean, std = file_mean, file_std
      print(f'Normalized with mean={mean:.2f}, std={std:.2f} for file {file_path}')

      # Create time series sequences: the window covers the `window_size`
      # values strictly BEFORE index i, so the target never appears in its
      # own input sequence.
      for i in range(window_size, len(time_series)):
        input_seq = time_series[i - window_size:i]
        target_value = time_series[i]
        data.append((input_seq, target_value))
    return data, mean, std

  def __len__(self):
    """Return the number of (input_sequence, target_value) samples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return sample `idx` as a pair of float32 tensors (input_sequence, target_value)."""
    input_seq, target_value = self.data[idx]
    return torch.tensor(input_seq, dtype=torch.float32), torch.tensor(target_value, dtype=torch.float32)

In [20]:
# Parameters
file_path = ['/content/daily_temperature.csv']
window_size = 4
feature_col = 'temperature'
train_fraction = 0.8  # share of samples used for training; the rest is the test set
batch_size = 32
seed = 42  # fixed so the split and the shuffling order are the same on every run

# Create PyTorch dataset
dataset = TimeSeriesDataset(file_path, window_size, feature_col)

# Split dataset for training and testing
train_size = int(train_fraction * len(dataset))
test_size = len(dataset) - train_size
if train_size == 0:
  # Every file was skipped or is too short; fail here with a clear message
  # instead of hitting an obscure sampler error when the loader is built.
  raise ValueError(
      f"Dataset has {len(dataset)} sample(s), which leaves no training data. "
      f"Check the paths in {file_path} and the messages printed while loading."
  )

# NOTE(review): windows of neighbouring timesteps overlap, so a random split
# puts near-identical samples in both sets. Consider a chronological split if
# the test score should reflect forecasting on unseen future data.
rng = torch.Generator().manual_seed(seed)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=rng)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=rng)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Data loaded successfully!
        date  temperature
0 2021-01-01         30.5
1 2021-01-02         31.0
2 2021-01-03         32.1
3 2021-01-04         31.8
4 2021-01-05         33.0
Columns in dataset: ['date', 'temperature']
Normalized with mean=35.58, std=2.80 for file /content/daily_temperature.csv
