In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
path = "data/MERRA2_3dasm_temperature_50mb_48x24.csv"
fmt = "%Y-%m-%d"

dfT = pd.read_csv(path)
dfT = dfT[
    (dfT.lat >= -60) & 
    (dfT.lat <= 60) &
    (dfT.year >= 1986) & 
    (dfT.year < 1994)
]
dfT["date"] = dfT.date.apply(lambda x: datetime.strptime(x, fmt))
dfT.head()

Unnamed: 0,lon,lat,T,date,lev,month,year,day
83136,-176.5625,56.5,229.257093,1986-01-01,50,1,1986,0
83137,-169.0625,56.5,229.00878,1986-01-01,50,1,1986,0
83138,-161.5625,56.5,228.60915,1986-01-01,50,1,1986,0
83139,-154.0625,56.5,227.929692,1986-01-01,50,1,1986,0
83140,-146.5625,56.5,226.916581,1986-01-01,50,1,1986,0


In [3]:
dfT.tail()

Unnamed: 0,lon,lat,T,date,lev,month,year,day
193339,145.9375,-56.0,225.518711,1993-12-01,50,12,1993,0
193340,153.4375,-56.0,224.930258,1993-12-01,50,12,1993,0
193341,160.9375,-56.0,224.498621,1993-12-01,50,12,1993,0
193342,168.4375,-56.0,224.309027,1993-12-01,50,12,1993,0
193343,175.9375,-56.0,224.356599,1993-12-01,50,12,1993,0


In [4]:
path = "data/TOTEXTTAUall_48x24.csv"
dfAOD = pd.read_csv(path)

dfAOD = dfAOD[
    (dfAOD.lat >= -60) & 
    (dfAOD.lat <= 60) &
    (dfAOD.year >= 1986) & 
    (dfAOD.year < 1994)
]


dfAOD["date"] = dfAOD.date.apply(lambda x: datetime.strptime(x, fmt))

dfAOD.head()

Unnamed: 0,lon,lat,TOTEXTTAU,year,month,day,date
83136,-176.5625,56.5,0.125594,1986,1,1,1986-01-01
83137,-169.0625,56.5,0.111849,1986,1,1,1986-01-01
83138,-161.5625,56.5,0.099602,1986,1,1,1986-01-01
83139,-154.0625,56.5,0.100677,1986,1,1,1986-01-01
83140,-146.5625,56.5,0.135423,1986,1,1,1986-01-01


In [5]:
dfT.shape==dfAOD.shape

False

In [6]:
dfT.set_index(["lat", "lon", "date"], inplace=True)
df = dfAOD.join(dfT[["T"]], on=["lat", "lon", "date"])
df.head()

Unnamed: 0,lon,lat,TOTEXTTAU,year,month,day,date,T
83136,-176.5625,56.5,0.125594,1986,1,1,1986-01-01,229.257093
83137,-169.0625,56.5,0.111849,1986,1,1,1986-01-01,229.00878
83138,-161.5625,56.5,0.099602,1986,1,1,1986-01-01,228.60915
83139,-154.0625,56.5,0.100677,1986,1,1,1986-01-01,227.929692
83140,-146.5625,56.5,0.135423,1986,1,1,1986-01-01,226.916581


In [7]:
df.isna().any()

lon          False
lat          False
TOTEXTTAU    False
year         False
month        False
day          False
date         False
T            False
dtype: bool

In [8]:
month_offset = df.groupby("date").day.count().iloc[0]

for i in range(1,30):
    df[f"offset_{i}"] = df["T"].shift(periods=month_offset * i)
    df[f"aod_offset_{i}"] = df["TOTEXTTAU"].shift(periods=month_offset * i)
    
offset_cols = ["T"] + [
    f"offset_{i}" for i in range(1,30)
]

aod_offset_cols = offset_cols + ["TOTEXTTAU"] + [
    f"aod_offset_{i}" for i in range(1,30)
]

df = df[~df.offset_29.isna()].reset_index(drop=True)

In [9]:
df.isna().sum()

lon              0
lat              0
TOTEXTTAU        0
year             0
month            0
                ..
aod_offset_27    0
offset_28        0
aod_offset_28    0
offset_29        0
aod_offset_29    0
Length: 66, dtype: int64

In [10]:
df["Train_A"] = (df.date <= datetime.strptime("1991-01-01", fmt)) & ~(df.offset_29.isna())
df["Test_A"] = (df.date >= datetime.strptime("1991-01-01", fmt)) & ~(df.offset_29.isna())
df["Train_B"] = (df.date <= datetime.strptime("1991-12-01", fmt)) & ~(df.offset_29.isna())
df["Test_B"] = (df.date >= datetime.strptime("1991-12-01", fmt)) & ~(df.offset_29.isna())

In [11]:
df.Train_A.value_counts()

False    26880
True     24576
Name: Train_A, dtype: int64

In [12]:
df.Train_B.value_counts()

True     33024
False    18432
Name: Train_B, dtype: int64

In [13]:
df.Test_A.value_counts()

True     27648
False    23808
Name: Test_A, dtype: int64

In [14]:
df.Test_B.value_counts()

False    32256
True     19200
Name: Test_B, dtype: int64

In [16]:
df.to_csv("data/df_trains_ctx.csv", index=False)

In [18]:
# Rerun with surface temperature data

path = "data/MERRA2_tavg1_2d_slv_monthly_48x24.csv"
fmt = "%Y-%m-%d"

dfT = pd.read_csv(path)
dfT = dfT[
    (dfT.lat >= -60) & 
    (dfT.lat <= 60) &
    (dfT.year >= 1986) & 
    (dfT.year < 1994)
]

dfT["date"] = dfT.date.apply(lambda x: datetime.strptime(x, fmt))
dfT.head()

Unnamed: 0,lon,lat,T,date,month,year,day
192,-176.5625,56.5,268.824308,1986-01-01,1,1986,1
193,-169.0625,56.5,265.699616,1986-01-01,1,1986,1
194,-161.5625,56.5,270.925306,1986-01-01,1,1986,1
195,-154.0625,56.5,276.069343,1986-01-01,1,1986,1
196,-146.5625,56.5,277.743452,1986-01-01,1,1986,1


In [19]:
dfT.set_index(["lat", "lon", "date"], inplace=True)
df = dfAOD.join(dfT[["T"]], on=["lat", "lon", "date"])
df.head()

Unnamed: 0,lon,lat,TOTEXTTAU,year,month,day,date,T
83136,-176.5625,56.5,0.125594,1986,1,1,1986-01-01,268.824308
83137,-169.0625,56.5,0.111849,1986,1,1,1986-01-01,265.699616
83138,-161.5625,56.5,0.099602,1986,1,1,1986-01-01,270.925306
83139,-154.0625,56.5,0.100677,1986,1,1,1986-01-01,276.069343
83140,-146.5625,56.5,0.135423,1986,1,1,1986-01-01,277.743452


In [20]:
df.isna().any()

lon          False
lat          False
TOTEXTTAU    False
year         False
month        False
day          False
date         False
T            False
dtype: bool

In [21]:
month_offset = df.groupby("date").day.count().iloc[0]

for i in range(1,30):
    df[f"offset_{i}"] = df["T"].shift(periods=month_offset * i)
    df[f"aod_offset_{i}"] = df["TOTEXTTAU"].shift(periods=month_offset * i)
    
offset_cols = ["T"] + [
    f"offset_{i}" for i in range(1,30)
]

aod_offset_cols = offset_cols + ["TOTEXTTAU"] + [
    f"aod_offset_{i}" for i in range(1,30)
]

df = df[~df.offset_29.isna()].reset_index(drop=True)

In [22]:
df["Train_A"] = (df.date <= datetime.strptime("1991-01-01", fmt)) & ~(df.offset_29.isna())
df["Test_A"] = (df.date >= datetime.strptime("1991-01-01", fmt)) & ~(df.offset_29.isna())
df["Train_B"] = (df.date <= datetime.strptime("1991-12-01", fmt)) & ~(df.offset_29.isna())
df["Test_B"] = (df.date >= datetime.strptime("1991-12-01", fmt)) & ~(df.offset_29.isna())

In [23]:
df.to_csv("data/df_surface_trains_ctx.csv", index=False)