In [1]:
!pip install utils

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install eofs

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import os
import datetime as dt  # Python standard library datetime  module
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import netCDF4 as nc
import xarray as xr
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV
from eofs.xarray import Eof
import utils
# from utils import data_path

In [12]:
#getting X and Y for training data
X = xr.concat([xr.open_dataset('inputs_historical.nc'), xr.open_dataset('inputs_ssp585.nc'),xr.open_dataset( 'inputs_ssp126.nc'),xr.open_dataset('inputs_ssp370.nc')], dim='time').compute()
Y = xr.concat([xr.open_dataset('outputs_historical.nc').mean(dim="member"), xr.open_dataset('outputs_ssp585.nc').mean(dim="member"),xr.open_dataset('outputs_ssp126.nc').mean(dim="member"),xr.open_dataset('outputs_ssp370.nc').mean(dim="member")], dim='time').compute()
# X["time"]=np.arange(1, 424 + 165 + 165)
# Y["time"]=np.arange(1, 424 + 165 + 165)

In [13]:
X

In [14]:
Y

In [15]:
#getting dataframe of co2
co2_df = X['CO2'].to_dataframe().reset_index()
co2_df['CO2'] = co2_df['CO2']/2400 #noramlize co2 by the max of 2400
co2_df = co2_df.set_index('time')
co2_df

Unnamed: 0_level_0,CO2
time,Unnamed: 1_level_1
1850,0.000078
1851,0.000157
1852,0.000239
1853,0.000325
1854,0.000425
...,...
2096,2.686027
2097,2.718787
2098,2.751748
2099,2.784911


In [16]:
#getting dataframe of ch4
ch4_df = X['CH4'].to_dataframe().reset_index()
ch4_df['CH4'] = ch4_df['CH4']/0.6 #normalize ch4 by the max of 0.6
ch4_df = ch4_df.set_index('time')
ch4_df

Unnamed: 0_level_0,CH4
time,Unnamed: 1_level_1
1850,0.052177
1851,0.052903
1852,0.053630
1853,0.054356
1854,0.055082
...,...
2096,1.221996
2097,1.228833
2098,1.235670
2099,1.242506


In [17]:
#create eof solver for black carbon
bc_solver = Eof(X['BC'])
bc_eofs = bc_solver.eofsAsCorrelation(neofs=5)
bc_pcs = bc_solver.pcs(npcs=5, pcscaling=1)

In [18]:
bc_pcs.to_dataframe().pivot_table(values='pcs',index=['time'],columns=['mode'])

mode,0,1,2,3,4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1850,-1.070785,-0.107626,-0.571793,-1.272226,-0.051825
1851,-1.070892,-0.104134,-0.577851,-1.246206,-0.069739
1852,-1.073213,-0.117359,-0.553025,-1.242697,-0.067722
1853,-1.075500,-0.130609,-0.528296,-1.238282,-0.065884
1854,-1.087790,-0.214882,-0.368441,-1.335067,0.028653
...,...,...,...,...,...
2096,-0.175352,1.027209,-0.057526,-0.330666,-1.113688
2097,-0.190487,1.020000,-0.080205,-0.347073,-1.144254
2098,-0.205622,1.012791,-0.102884,-0.363480,-1.174820
2099,-0.220757,1.005581,-0.125563,-0.379887,-1.205386


In [19]:
#getting bc dataframe
bc_df = bc_pcs.to_dataframe()
pivot_bc = bc_df.pivot_table(values='pcs',index=['time'],columns=['mode'])
bc_df = pd.DataFrame(data = [pivot_bc[0].to_numpy(),pivot_bc[1].to_numpy(),pivot_bc[2].to_numpy(),pivot_bc[3].to_numpy(),pivot_bc[4].to_numpy()]).T
bc_df = bc_df.rename(columns = {0: 'BC_0', 1: 'BC_1',2:'BC_2',3:'BC_3',4:'BC_4'})
bc_df

Unnamed: 0,BC_0,BC_1,BC_2,BC_3,BC_4
0,-1.070785,-0.107626,-0.571793,-1.272226,-0.051825
1,-1.070892,-0.104134,-0.577851,-1.246206,-0.069739
2,-1.073213,-0.117359,-0.553025,-1.242697,-0.067722
3,-1.075500,-0.130609,-0.528296,-1.238282,-0.065884
4,-1.087790,-0.214882,-0.368441,-1.335067,0.028653
...,...,...,...,...,...
246,-0.175352,1.027209,-0.057526,-0.330666,-1.113688
247,-0.190487,1.020000,-0.080205,-0.347073,-1.144254
248,-0.205622,1.012791,-0.102884,-0.363480,-1.174820
249,-0.220757,1.005581,-0.125563,-0.379887,-1.205386


In [20]:
#create eof solver for so2
so2_solver = Eof(X['SO2'])
so2_eofs = so2_solver.eofsAsCorrelation(neofs=5)
so2_pcs = so2_solver.pcs(npcs=5, pcscaling=1)

In [21]:
so2_pcs.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,pcs
time,mode,Unnamed: 2_level_1
1850,0,-0.348054
1850,1,-1.280982
1850,2,-1.068731
1850,3,-0.392418
1850,4,-0.362433
...,...,...
2100,0,0.805325
2100,1,0.103484
2100,2,-0.026458
2100,3,-0.501266


In [22]:
#getting so2 dataframe
so2_df = so2_pcs.to_dataframe()
pivot_so2 = so2_df.pivot_table(values='pcs',index=['time'],columns=['mode'])
so2_df = pd.DataFrame(data =                       [pivot_so2[0].to_numpy(),pivot_so2[1].to_numpy(),pivot_so2[2].to_numpy(),pivot_so2[3].to_numpy(),pivot_so2[4].to_numpy()]).T
so2_df = so2_df.rename(columns = {0: 'SO2_0', 1: 'SO2_1',2:'SO2_2',3:'SO2_3',4:'SO2_4'})
so2_df

Unnamed: 0,SO2_0,SO2_1,SO2_2,SO2_3,SO2_4
0,-0.348054,-1.280982,-1.068731,-0.392418,-0.362433
1,-0.344520,-1.280697,-1.061362,-0.404348,-0.379437
2,-0.346847,-1.278414,-1.057419,-0.402204,-0.379591
3,-0.348326,-1.276769,-1.054936,-0.401100,-0.380175
4,-0.375174,-1.265988,-1.070907,-0.339490,-0.304372
...,...,...,...,...,...
246,0.241937,-0.629333,-0.281001,-0.499295,-0.653799
247,0.231459,-0.640898,-0.313820,-0.519133,-0.646353
248,0.220982,-0.652464,-0.346640,-0.538970,-0.638907
249,0.210504,-0.664029,-0.379459,-0.558808,-0.631460


In [23]:
#concat co2 and ch4 data
historical_input = pd.concat([co2_df,ch4_df], axis=1).reset_index()
historical_input

Unnamed: 0,time,CO2,CH4
0,1850,0.000078,0.052177
1,1851,0.000157,0.052903
2,1852,0.000239,0.053630
3,1853,0.000325,0.054356
4,1854,0.000425,0.055082
...,...,...,...
418,2096,2.686027,1.221996
419,2097,2.718787,1.228833
420,2098,2.751748,1.235670
421,2099,2.784911,1.242506


In [24]:
#concat all X variables
X_input = pd.concat([historical_input,so2_df,bc_df],axis=1)
X_input = X_input.drop(columns = 'time')
X_input

Unnamed: 0,CO2,CH4,SO2_0,SO2_1,SO2_2,SO2_3,SO2_4,BC_0,BC_1,BC_2,BC_3,BC_4
0,0.000078,0.052177,-0.348054,-1.280982,-1.068731,-0.392418,-0.362433,-1.070785,-0.107626,-0.571793,-1.272226,-0.051825
1,0.000157,0.052903,-0.344520,-1.280697,-1.061362,-0.404348,-0.379437,-1.070892,-0.104134,-0.577851,-1.246206,-0.069739
2,0.000239,0.053630,-0.346847,-1.278414,-1.057419,-0.402204,-0.379591,-1.073213,-0.117359,-0.553025,-1.242697,-0.067722
3,0.000325,0.054356,-0.348326,-1.276769,-1.054936,-0.401100,-0.380175,-1.075500,-0.130609,-0.528296,-1.238282,-0.065884
4,0.000425,0.055082,-0.375174,-1.265988,-1.070907,-0.339490,-0.304372,-1.087790,-0.214882,-0.368441,-1.335067,0.028653
...,...,...,...,...,...,...,...,...,...,...,...,...
418,2.686027,1.221996,,,,,,,,,,
419,2.718787,1.228833,,,,,,,,,,
420,2.751748,1.235670,,,,,,,,,,
421,2.784911,1.242506,,,,,,,,,,


In [25]:
#convert precipitation values to mm/day
Y["pr"] *= 86400
Y["pr90"] *= 86400

In [None]:
#stack y 
y_tas=Y["tas"].stack(dim=["lat", "lon"])
y_pr=Y["pr"].stack(dim=["lat", "lon"])
y_pr90=Y["pr90"].stack(dim=["lat", "lon"])
y_dtr=Y["diurnal_temperature_range"].stack(dim=["lat", "lon"])

In [None]:
#convert y variables into dataframes
df_y_input_tas = pd.DataFrame(y_tas.to_pandas()).reset_index().drop(columns='time')
df_y_input_pr = pd.DataFrame(y_pr.to_pandas()).reset_index().drop(columns='time')
df_y_input_pr90 = pd.DataFrame(y_pr90.to_pandas()).reset_index().drop(columns='time')
df_y_input_dtr = pd.DataFrame(y_dtr.to_pandas()).reset_index().drop(columns='time')

In [None]:
#concat x and y variables into one dataframe
Xy_train_tas= pd.concat([X_input, df_y_input_tas], axis=1)
Xy_train_pr= pd.concat([X_input, df_y_input_pr], axis=1)
Xy_train_pr90 = pd.concat([X_input, df_y_input_pr90], axis=1)
Xy_train_dtr = pd.concat([X_input, df_y_input_dtr], axis=1)

In [None]:
# convert training data by variables into csv for future use
# Xy_train_tas.to_csv('Xy_train_tas.csv')
# Xy_train_pr.to_csv('Xy_train_pr.csv')
# Xy_train_pr90.to_csv('Xy_train_pr90.csv')
# Xy_train_dtr.to_csv('Xy_train_dtr.csv')

In [None]:
#separate training x and y
n_inp=X_input.shape[1]
n_iout=Xy_train_tas.shape[1]

X_train_tas=Xy_train_tas.iloc[:,0:n_inp]
y_train_tas=Xy_train_tas.iloc[:,n_inp:n_iout]

X_train_pr=Xy_train_pr.iloc[:,0:n_inp]
y_train_pr=Xy_train_pr.iloc[:,n_inp:n_iout]

X_train_pr90=Xy_train_pr90.iloc[:,0:n_inp]
y_train_pr90=Xy_train_pr90.iloc[:,n_inp:n_iout]

X_train_dtr=Xy_train_dtr.iloc[:,0:n_inp]
y_train_dtr=Xy_train_dtr.iloc[:,n_inp:n_iout]


In [None]:
#getting test data
test_Y = xr.open_dataset('outputs_ssp245.nc').compute()
test_X = xr.open_dataset('inputs_ssp245.nc').compute()

tas_truth = test_Y["tas"].mean('member')
pr_truth = test_Y["pr"].mean('member') * 86400
pr90_truth = test_Y["pr90"].mean('member') * 86400
dtr_truth = test_Y["diurnal_temperature_range"].mean('member')

In [None]:
#normalize test data
test_inputs = pd.DataFrame({
    "CO2": test_X["CO2"].data/2400,
    "CH4": test_X["CH4"].data/0.6
}, index=test_X["CO2"].coords['time'].data)


In [None]:
#concat test data into one dataframe
test_inputs=pd.concat([test_inputs, 
                       so2_solver.projectField(test_X["SO2"], neofs=5, eofscaling=1).to_dataframe().unstack('mode').rename(columns={i:f"SO2_{i}" for i in range(5)}),
                       bc_solver.projectField(test_X["BC"], neofs=5, eofscaling=1).to_dataframe().unstack('mode').rename(columns={i:f"BC_{i}" for i in range(5)})
                       ], axis=1)

In [None]:
test_inputs.columns = ['CO2',         'CH4',       'SO2_0',       'SO2_1',
             'SO2_2',       'SO2_3',       'SO2_4',        'BC_0',
              'BC_1',        'BC_2', 'BC_3','BC_4']

In [None]:
test_inputs

In [None]:
#convert test_data into csv for future use
# test_inputs.to_csv('test_data.csv')