In [1]:
import pickle, os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math

In [4]:
# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())

True


In [5]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the chosen device.
device

device(type='cuda')

## 1. data preparation

In [6]:
# Read one CSV per year (2015-2022). Each frame keeps its own data_<year>
# name because the following cells refer to the frames individually.
yearly_csv_paths = [
    'data/2015.csv',
    'data/2016.csv',
    'data/2017.csv',
    'data/2018.csv',
    'data/2019.csv',
    'data/2020.csv',
    'data/2021.csv',
    'data/2022.csv',
]
(
    data_2015,
    data_2016,
    data_2017,
    data_2018,
    data_2019,
    data_2020,
    data_2021,
    data_2022,
) = [pd.read_csv(csv_path) for csv_path in yearly_csv_paths]

In [7]:
# Split the yearly frames chronologically:
# 2015-2019 -> train, 2020-2021 -> validation, 2022 -> test.
train_years = [data_2015, data_2016, data_2017, data_2018, data_2019]
valid_years = [data_2020, data_2021]
train_df = pd.concat(train_years, axis=0, ignore_index=True)
valid_df = pd.concat(valid_years, axis=0, ignore_index=True)
# The test split is the 2022 frame itself (same object, not a copy).
test_df = data_2022

In [8]:
# Find the calendar days on which any monitored flag column is marked 'M'
# more than three times, separately for the train, validation and test frames.
flab = ['Temp Flag', 'Rel Hum Flag', 'Dew Point Temp Flag', 'Precip. Amount Flag', 'Stn Press Flag', 'Wind Spd Flag']


def _days_over_missing_limit(frame, flag_columns):
    """Return the set of (Year, Month, Day) keys with more than 3 'M' flags.

    A day is included as soon as a single column in `flag_columns` carries
    the value 'M' on more than three of that day's rows.
    """
    day_keys = set()
    for flag in flag_columns:
        flagged_rows = frame[frame[flag] == 'M']
        per_day = flagged_rows.groupby(['Year', 'Month', 'Day'])[flag].count()
        # Iterating the filtered index yields one (Year, Month, Day) tuple per day.
        day_keys.update(per_day[per_day > 3].keys())
    return day_keys


print("----------------------tain--------------------------")
date_t = _days_over_missing_limit(train_df, flab)
print(date_t)
print("----------------------vali--------------------------")
date_v = _days_over_missing_limit(valid_df, flab)
print(date_v)
print("----------------------test--------------------------")
date_s = _days_over_missing_limit(test_df, flab)
print(date_s)

----------------------tain--------------------------
{(2019, 6, 5), (2019, 6, 2), (2019, 5, 24), (2015, 12, 10), (2015, 4, 6), (2015, 12, 13), (2015, 12, 7), (2015, 11, 5), (2019, 6, 1), (2019, 6, 4), (2019, 6, 19), (2019, 5, 17), (2015, 12, 6), (2015, 12, 12), (2019, 5, 23), (2015, 12, 9), (2015, 11, 4), (2019, 6, 3), (2019, 6, 18), (2016, 12, 19), (2015, 12, 14)}
----------------------vali--------------------------
{(2021, 7, 21), (2020, 1, 14), (2021, 7, 30), (2021, 7, 20), (2021, 2, 9)}
----------------------test--------------------------
{(2022, 5, 13), (2022, 7, 10), (2022, 7, 13), (2022, 5, 28), (2022, 7, 19), (2022, 5, 22), (2022, 5, 25), (2022, 5, 31), (2022, 5, 12), (2022, 7, 9), (2022, 5, 9), (2022, 7, 6), (2022, 5, 21), (2022, 7, 12), (2022, 5, 24), (2022, 7, 21), (2022, 5, 27), (2022, 7, 18), (2022, 5, 30), (2022, 7, 5), (2022, 5, 11), (2022, 7, 8), (2022, 7, 14), (2022, 5, 20), (2022, 5, 26), (2022, 5, 23), (2022, 7, 20), (2022, 5, 29), (2022, 5, 10), (2022, 7, 7)}


In [9]:
# Remove every day listed in date_t / date_v / date_s (days with more than
# 3 missing readings) from the train, validation and test frames.
def _drop_flagged_days(frame, days):
    """Return a copy of `frame` without the rows that fall on any day in `days`.

    Parameters
    ----------
    frame : DataFrame with 'Year', 'Month' and 'Day' columns.
    days : iterable of (year, month, day) tuples to remove.

    Returns
    -------
    DataFrame with the matching rows removed, rows that are entirely NaN
    dropped, and the index renumbered from 0.
    """
    # Build a single mask instead of re-filtering (and copying) the frame
    # once per flagged day.
    drop_mask = np.zeros(len(frame), dtype=bool)
    for year, month, day in days:
        same_day = (frame['Year'] == year) & (frame['Month'] == month) & (frame['Day'] == day)
        drop_mask |= same_day.to_numpy()
    kept = frame[~drop_mask]
    # dropna returns a new frame; the result has to be kept, otherwise the
    # call has no effect.
    kept = kept.dropna(how='all')
    return kept.reset_index(drop=True)


train_df = _drop_flagged_days(train_df, date_t)
valid_df = _drop_flagged_days(valid_df, date_v)
test_df = _drop_flagged_days(test_df, date_s)

In [24]:
# Replace every reading whose flag is 'M' with the mean of the previous and
# the next row's value. flab[k] is the flag column for value column vlab[k].
flab = ['Temp Flag', 'Dew Point Temp Flag', 'Rel Hum Flag', 'Precip. Amount Flag', 'Wind Spd Flag', 'Stn Press Flag', 'Visibility Flag']
vlab = ['Temp (°C)', "Dew Point Temp (°C)", "Rel Hum (%)", "Precip. Amount (mm)", "Wind Spd (km/h)", "Stn Press (kPa)", "Visibility (km)"]


def _fill_flagged_with_neighbor_mean(frame, flag_columns, value_columns):
    """Overwrite flagged values in `frame` in place with the mean of their neighbours.

    For each (flag column, value column) pair, every row whose flag equals
    'M' gets the average of the value in the rows labelled `index - 1` and
    `index + 1`. Rows are processed in index order, so a value filled earlier
    can serve as the previous neighbour of the next flagged row.

    When a neighbour label does not exist (the flagged row is the first or
    the last row), only the existing neighbour is used instead of raising a
    KeyError. A neighbour that exists but is NaN still produces NaN, exactly
    as the plain two-point average would.

    Assumes the frame has an integer index (the frames are re-indexed with
    reset_index(drop=True) beforehand in this notebook).
    """
    for flag_col, value_col in zip(flag_columns, value_columns):
        # Snapshot the flagged labels first; the fills below do not change flags.
        flagged_labels = frame.index[frame[flag_col] == 'M']
        for label in flagged_labels:
            neighbours = [
                frame.loc[near, value_col]
                for near in (label - 1, label + 1)
                if near in frame.index
            ]
            if neighbours:
                frame.loc[label, value_col] = sum(neighbours) / len(neighbours)


_fill_flagged_with_neighbor_mean(train_df, flab, vlab)
# Same replacement for the validation set.
_fill_flagged_with_neighbor_mean(valid_df, flab, vlab)
# Same replacement for the test set.
_fill_flagged_with_neighbor_mean(test_df, flab, vlab)

In [35]:
# Keep only the numeric feature columns of each split and discard every row
# that still contains a missing value in one of them.
labx = ["Temp (°C)", "Dew Point Temp (°C)", "Rel Hum (%)", "Precip. Amount (mm)", "Wind Spd (km/h)", "Stn Press (kPa)", "Visibility (km)"]

# Selecting with a column list returns a new frame in labx order; dropna then
# returns the filtered result without touching the source frames.
train_data = train_df[labx].dropna()
valid_data = valid_df[labx].dropna()
test_data = test_df[labx].dropna()

In [33]:
# Turn the categorical "Weather" column into integer ids and export the
# train / validation / test tables (features + Weather) to CSV.
lab_weather = ["Temp (°C)", "Dew Point Temp (°C)", "Rel Hum (%)", "Precip. Amount (mm)", "Wind Spd (km/h)", "Stn Press (kPa)", "Visibility (km)", "Weather"]

# Integer id for each Weather label. Id 0 is not listed here: it is the value
# written for rows whose Weather is missing (see fillna(0) in the helper).
replacement_dict = {
    'Snow':1,
    'Rain,Fog':2,
    'Fog':3,
    'Haze':4,
    'Rain':5,
    'Freezing Rain,Fog':6,
    'Freezing Rain,Snow':7,
    'Snow,Blowing Snow':8,
    'Thunderstorms,Rain,Fog':9,
    'Thunderstorms,Rain':10,
    'Thunderstorms,Heavy Rain,Fog':11,
    'Moderate Rain,Fog':12,
    'Heavy Rain':13,
    'Thunderstorms':14,
    'Heavy Rain,Fog':15,
    'Moderate Rain':16,
    'Rain,Snow':17,
    'Freezing Rain':18,
    'Thunderstorms,Fog':19,
    'Thunderstorms,Moderate Rain':20,
    'Thunderstorms,Haze':21,
    'Thunderstorms,Heavy Rain':22,
    'Moderate Snow':23,
    'Heavy Snow':24,
    'Thunderstorms,Moderate Rain,Fog':25,
    'Haze,Blowing Snow':26
}


def _prepare_weather_frame(source_df, columns):
    """Copy `columns` from `source_df` and summarise its Weather labels.

    Prints the number of distinct non-null Weather labels and the array of
    unique values (NaN included), then replaces missing Weather with 0 in the
    copy.

    Returns
    -------
    (frame, unique_count, unique_values, labels) where `labels` is the list
    of distinct Weather labels with NaN removed.
    """
    frame = source_df[columns].copy()
    unique_count = frame["Weather"].nunique()
    print("Number of unique values:", unique_count)
    unique_values = frame["Weather"].unique()
    print("Unique values:", unique_values)
    # unique() lists values in order of first appearance, so NaN is not
    # guaranteed to be the first entry. Drop it explicitly instead of
    # popping index 0, which could discard a real label and keep NaN.
    labels = list(frame["Weather"].dropna().unique())
    frame["Weather"] = frame["Weather"].fillna(0)
    return frame, unique_count, unique_values, labels


temp1, unique_count, unique_values, weather = _prepare_weather_frame(train_df, lab_weather)

temp2, unique_count1, unique_values1, weather1 = _prepare_weather_frame(valid_df, lab_weather)
# Number of validation labels that also occur in the training data.
print(len(set(weather) & set(weather1)))

temp3, unique_count2, unique_values2, weather2 = _prepare_weather_frame(test_df, lab_weather)
# Number of test labels that also occur in the training data.
print(len(set(weather) & set(weather2)))


# NOTE(review): replace() leaves any label that is not a key of
# replacement_dict unchanged, so a label unseen in training would stay a
# string in the exported column - confirm this cannot happen for valid/test.
temp1["Weather"] = temp1["Weather"].replace(replacement_dict)
temp2["Weather"] = temp2["Weather"].replace(replacement_dict)
temp3["Weather"] = temp3["Weather"].replace(replacement_dict)

# Weather no longer contains NaN at this point, so dropna only removes rows
# with a missing value in one of the other selected columns.
train_data1 = temp1.dropna()
valid_data1 = temp2.dropna()
test_data1 = temp3.dropna()

train_data1.to_csv('train_data1.csv', index=False)
valid_data1.to_csv('valid_data1.csv', index=False)
test_data1.to_csv('test_data1.csv', index=False)

Number of unique values: 26
Unique values: [nan 'Snow' 'Rain,Fog' 'Fog' 'Haze' 'Rain' 'Freezing Rain,Fog'
 'Freezing Rain,Snow' 'Snow,Blowing Snow' 'Thunderstorms,Rain,Fog'
 'Thunderstorms,Rain' 'Thunderstorms,Heavy Rain,Fog' 'Moderate Rain,Fog'
 'Heavy Rain' 'Thunderstorms' 'Heavy Rain,Fog' 'Moderate Rain' 'Rain,Snow'
 'Freezing Rain' 'Thunderstorms,Fog' 'Thunderstorms,Moderate Rain'
 'Thunderstorms,Haze' 'Thunderstorms,Heavy Rain' 'Moderate Snow'
 'Heavy Snow' 'Thunderstorms,Moderate Rain,Fog' 'Haze,Blowing Snow']
Number of unique values: 20
Unique values: [nan 'Snow' 'Rain' 'Rain,Snow' 'Fog' 'Rain,Fog' 'Heavy Rain,Fog'
 'Moderate Rain,Fog' 'Moderate Rain' 'Freezing Rain,Snow' 'Haze'
 'Thunderstorms,Moderate Rain' 'Thunderstorms,Heavy Rain' 'Thunderstorms'
 'Thunderstorms,Rain,Fog' 'Thunderstorms,Rain' 'Thunderstorms,Fog'
 'Moderate Snow' 'Heavy Snow' 'Thunderstorms,Moderate Rain,Fog'
 'Thunderstorms,Heavy Rain,Fog']
20
Number of unique values: 16
Unique values: ['Fog' nan 'Rain,Fog'