In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm  
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [2]:
# Absolute path to a project data directory.
# NOTE(review): the read_csv calls visible in this notebook load from relative
# "data/..." paths and never reference base_dir — confirm whether it is still
# needed, or whether those paths should be built from it.
base_dir = "/projects/dsci410_510/data_kclair/"

In [10]:
# Load the historical corn price chart export. The first 15 lines of the file
# are skipped before the header row is read, and the columns are renamed so
# the date column is called `time` like the other datasets in this notebook.
# Note the source header for the price column has a leading space (' value').
corn = (
    pd.read_csv("data/_corn-prices-historical-chart-data.csv", skiprows=15)
    .rename(columns={'date': 'time', ' value': 'corn_price'})
)
corn.head()

Unnamed: 0,time,corn_price
0,1959-07-01,1.177
1,1959-07-02,1.176
2,1959-07-06,1.171
3,1959-07-07,1.171
4,1959-07-08,1.17


In [11]:
# Load the historical wheat price chart export. The first 15 lines of the file
# are skipped before the header row is read, and the columns are renamed so
# the date column is called `time` like the other datasets in this notebook.
# Note the source header for the price column has a leading space (' value').
wheat = (
    pd.read_csv("data/_wheat-prices-historical-chart-data.csv", skiprows=15)
    .rename(columns={'date': 'time', ' value': 'wheat_price'})
)
wheat.head()

Unnamed: 0,time,wheat_price
0,1959-07-01,1.964
1,1959-07-02,1.971
2,1959-07-06,1.97
3,1959-07-07,1.971
4,1959-07-08,1.964


In [12]:
# Parse the date column and keep only the 2000-2010 (inclusive) study window
# for both price series.
#
# `.assign` builds a new frame instead of writing a column into the existing
# one. After the first run `corn`/`wheat` are filtered slices, so an in-place
# `corn['time'] = ...` on re-run can raise SettingWithCopyWarning; `.assign`
# keeps this cell safe to re-run.
corn = corn.assign(time=pd.to_datetime(corn['time']))
corn = corn[corn['time'].dt.year.between(2000, 2010)]

wheat = wheat.assign(time=pd.to_datetime(wheat['time']))
wheat = wheat[wheat['time'].dt.year.between(2000, 2010)]

# Inner join on the date: only dates present in both price series are kept.
wc_merged = pd.merge(wheat, corn, on="time", how="inner")

In [13]:
# Load the TXx indicator table and parse both timestamp columns.
max_day_temp = pd.read_csv("data/_max_day_temp.csv").assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    time_bounds=lambda frame: pd.to_datetime(frame['time_bounds']),
)

# Keep 2000-2010 (inclusive) and drop the bounds bookkeeping columns.
max_day_temp_filtered = (
    max_day_temp
    .loc[lambda frame: frame['time'].dt.year.between(2000, 2010)]
    .drop(columns=['time_bounds', "bounds"])
)

# One row per (time, lat, lon) holding the mean TXx value.
# NOTE(review): presumably this collapses duplicate rows created by the
# dropped `bounds` column — confirm against the raw file.
max_day_temp_avg = (
    max_day_temp_filtered
    .groupby(['time', 'lat', 'lon'], as_index=False)['TXx']
    .mean()
    .rename(columns={"TXx": "max_day_temp"})
)
max_day_temp_avg

Unnamed: 0,time,lat,lon,max_day_temp
0,2000-01-05,36.75,-103.75,293.35178
1,2000-01-05,36.75,-103.25,295.86730
2,2000-01-05,36.75,-102.75,297.86926
3,2000-01-05,36.75,-102.25,298.21940
4,2000-01-05,36.75,-101.75,298.43073
...,...,...,...,...
465295,2010-12-25,48.75,-82.75,272.55080
465296,2010-12-25,48.75,-82.25,272.43840
465297,2010-12-25,48.75,-81.75,272.19614
465298,2010-12-25,48.75,-81.25,272.18515


In [14]:
# Load the BEDD indicator table and parse both timestamp columns.
bio_degree_days = pd.read_csv("data/_bio_degree_days.csv").assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    time_bounds=lambda frame: pd.to_datetime(frame['time_bounds']),
)

# Keep 2000-2010 (inclusive) and drop the bookkeeping columns.
bio_degree_days_filtered = (
    bio_degree_days
    .loc[lambda frame: frame['time'].dt.year.between(2000, 2010)]
    .drop(columns=['bounds', "time_bounds", "threshold"])
)

# One row per (time, lat, lon) holding the mean BEDD value.
# NOTE(review): presumably this collapses duplicate rows created by the
# dropped `bounds`/`threshold` columns — confirm against the raw file.
bio_eff_days_avg = (
    bio_degree_days_filtered
    .groupby(['time', 'lat', 'lon'], as_index=False)['BEDD']
    .mean()
    .rename(columns={"BEDD": "effective_degree_days"})
)
bio_eff_days_avg

Unnamed: 0,time,lat,lon,effective_degree_days
0,2000-01-05,36.75,-103.75,6.794754
1,2000-01-05,36.75,-103.25,10.660324
2,2000-01-05,36.75,-102.75,12.471822
3,2000-01-05,36.75,-102.25,13.270727
4,2000-01-05,36.75,-101.75,14.187780
...,...,...,...,...
465295,2010-12-25,48.75,-82.75,0.000000
465296,2010-12-25,48.75,-82.25,0.000000
465297,2010-12-25,48.75,-81.75,0.000000
465298,2010-12-25,48.75,-81.25,0.000000


In [15]:
# Load the ID (ice days) indicator table and parse both timestamp columns.
ice_days = pd.read_csv("data/_ice_days.csv").assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    time_bounds=lambda frame: pd.to_datetime(frame['time_bounds']),
)

# Keep 2000-2010 (inclusive) and drop the bounds bookkeeping columns.
ice_days_filtered = (
    ice_days
    .loc[lambda frame: frame['time'].dt.year.between(2000, 2010)]
    .drop(columns=['bounds', "time_bounds"])
)

# One row per (time, lat, lon) holding the mean ID value.
# NOTE(review): presumably this collapses duplicate rows created by the
# dropped `bounds` column — confirm against the raw file.
ice_days_avg = (
    ice_days_filtered
    .groupby(['time', 'lat', 'lon'], as_index=False)['ID']
    .mean()
    .rename(columns={"ID": "ice_days"})
)
ice_days_avg

Unnamed: 0,time,lat,lon,ice_days
0,2000-01-05,36.75,-103.75,1.0
1,2000-01-05,36.75,-103.25,1.0
2,2000-01-05,36.75,-102.75,1.0
3,2000-01-05,36.75,-102.25,1.0
4,2000-01-05,36.75,-101.75,2.0
...,...,...,...,...
465295,2010-12-25,48.75,-82.75,11.0
465296,2010-12-25,48.75,-82.25,11.0
465297,2010-12-25,48.75,-81.75,11.0
465298,2010-12-25,48.75,-81.25,11.0


In [16]:
# Load the R10mm indicator table and parse both timestamp columns.
heavy_rain_days = pd.read_csv("data/_more_than_10mm_rain_days.csv").assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    time_bounds=lambda frame: pd.to_datetime(frame['time_bounds']),
)

# Keep 2000-2010 (inclusive) and drop the bounds bookkeeping columns.
heavy_rain_days_filtered = (
    heavy_rain_days
    .loc[lambda frame: frame['time'].dt.year.between(2000, 2010)]
    .drop(columns=['bounds', "time_bounds"])
)

# One row per (time, lat, lon) holding the mean R10mm value.
# NOTE(review): presumably this collapses duplicate rows created by the
# dropped `bounds` column — confirm against the raw file.
heavy_rain_days_avg = (
    heavy_rain_days_filtered
    .groupby(['time', 'lat', 'lon'], as_index=False)['R10mm']
    .mean()
    .rename(columns={"R10mm": "heavy_rain_days"})
)
heavy_rain_days_avg

Unnamed: 0,time,lat,lon,heavy_rain_days
0,2000-01-05,36.75,-103.75,0.0
1,2000-01-05,36.75,-103.25,0.0
2,2000-01-05,36.75,-102.75,0.0
3,2000-01-05,36.75,-102.25,0.0
4,2000-01-05,36.75,-101.75,0.0
...,...,...,...,...
465295,2010-12-25,48.75,-82.75,0.0
465296,2010-12-25,48.75,-82.25,0.0
465297,2010-12-25,48.75,-81.75,0.0
465298,2010-12-25,48.75,-81.25,0.0


In [17]:
# Combine the four climate indicator tables into one wide frame keyed by
# (time, lat, lon). Outer joins keep a grid cell/date even when one of the
# indicators has no row for it (the missing values become NaN).
merged_agriculture = (
    max_day_temp_avg
    .merge(bio_eff_days_avg, on=['time', 'lat', 'lon'], how='outer')
    .merge(ice_days_avg, on=['time', 'lat', 'lon'], how='outer')
    .merge(heavy_rain_days_avg, on=['time', 'lat', 'lon'], how='outer')
)

merged_agriculture.head()

Unnamed: 0,time,lat,lon,max_day_temp,effective_degree_days,ice_days,heavy_rain_days
0,2000-01-05,36.75,-103.75,293.35178,6.794754,1.0,0.0
1,2000-01-05,36.75,-103.25,295.8673,10.660324,1.0,0.0
2,2000-01-05,36.75,-102.75,297.86926,12.471822,1.0,0.0
3,2000-01-05,36.75,-102.25,298.2194,13.270727,1.0,0.0
4,2000-01-05,36.75,-101.75,298.43073,14.18778,2.0,0.0


In [18]:
# Attach the wheat/corn prices to every climate grid row that shares the same
# date. The join is inner, so climate dates without a price quote (and price
# dates without climate rows) are dropped.
final_merged_data = pd.merge(
    wc_merged,
    merged_agriculture,
    on=['time'],
    how='inner',
)

final_merged_data

Unnamed: 0,time,wheat_price,corn_price,lat,lon,max_day_temp,effective_degree_days,ice_days,heavy_rain_days
0,2000-01-05,2.4975,2.0300,36.75,-103.75,293.35178,6.794754,1.0,0.0
1,2000-01-05,2.4975,2.0300,36.75,-103.25,295.86730,10.660324,1.0,0.0
2,2000-01-05,2.4975,2.0300,36.75,-102.75,297.86926,12.471822,1.0,0.0
3,2000-01-05,2.4975,2.0300,36.75,-102.25,298.21940,13.270727,1.0,0.0
4,2000-01-05,2.4975,2.0300,36.75,-101.75,298.43073,14.187780,2.0,0.0
...,...,...,...,...,...,...,...,...,...
311370,2010-12-15,7.6475,5.8425,48.75,-82.75,274.35205,0.000000,9.0,0.0
311371,2010-12-15,7.6475,5.8425,48.75,-82.25,274.23660,0.000000,9.0,0.0
311372,2010-12-15,7.6475,5.8425,48.75,-81.75,274.08870,0.000000,9.0,0.0
311373,2010-12-15,7.6475,5.8425,48.75,-81.25,274.41687,0.000000,9.0,0.0


In [19]:
def create_sequences(data, seq_length, output_size):
    """Slice a series into sliding (input window, forecast window) pairs.

    Window ``i`` uses ``data[i : i + seq_length]`` as the model input and the
    immediately following ``output_size`` rows as the target.

    Args:
        data: Sliceable sequence with a length (e.g. an ``(n, 1)`` array of
            scaled prices), ordered in time.
        seq_length: Number of consecutive rows in each input window.
        output_size: Number of consecutive rows in each target window.

    Returns:
        Tuple ``(sequences, targets)`` of numpy arrays. For an ``(n, f)``
        input they have shapes ``(n_windows, seq_length, f)`` and
        ``(n_windows, output_size, f)``, where
        ``n_windows = n - seq_length - output_size + 1``. Both arrays are
        empty when ``data`` is too short to hold a single window.
    """
    # The +1 keeps the last window, whose target ends exactly on the final
    # row; without it the most recent sample is silently discarded.
    n_windows = len(data) - seq_length - output_size + 1

    sequences = []
    targets = []
    for start in range(max(n_windows, 0)):
        boundary = start + seq_length
        sequences.append(data[start:boundary])
        targets.append(data[boundary:boundary + output_size])
    return np.array(sequences), np.array(targets)

seq_length = 30
output_size = 10

# Single-feature column vector of raw corn prices, shape (n_rows, 1).
corn_prices = corn["corn_price"].values.reshape(-1, 1)

# Window the unscaled prices first, only to learn how many samples exist, so
# the chronological split sizes are known before any scaling statistics are
# computed.
X_raw, _ = create_sequences(corn_prices, seq_length, output_size)

# The split is chronological (80/10/10) because this is time-series data:
# shuffling would destroy the temporal dependencies the model should learn
# and would leak future prices into training.
train_size = int(len(X_raw) * 0.8)
val_size = int(len(X_raw) * 0.1)

# Fit the scaler ONLY on rows that training windows can see. The last
# training window starts at row train_size - 1 and spans
# seq_length + output_size rows. Fitting on the full series would leak the
# validation/test min and max into the training data.
train_rows_end = train_size + seq_length + output_size - 1
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(corn_prices[:train_rows_end])

# Validation/test values may fall slightly outside [0, 1]; that is expected
# when the scaler has not seen them.
real_corn_scaled = scaler.transform(corn_prices)

# X has shape (n_samples, seq_length, 1) and y has shape
# (n_samples, output_size, 1) because the input is a single-column array.
X, y = create_sequences(real_corn_scaled, seq_length, output_size)

train_end = train_size
val_end = train_size + val_size

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32)
X_val, y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32)
X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)
