In [15]:
import xarray as xr
import numpy  as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import xgboost as xgb
import pickle

In [21]:
# Monthly dataset with every row containing a missing value removed.
ff = pd.read_pickle("/glade/work/hpzhang/LCC_pred/data/data_monthly.pkl").dropna()

# Modelling table: drop the columns that are not used downstream, then keep
# only the rows whose index year falls in 2003-2018 (np.arange end is exclusive).
df = (
    ff.drop(columns=['lat', 'lon', 'lsm', 'T1000', 'T700', 'EIS', 'ECTEI', 'ELF', 'SST'])
      .loc[lambda frame: frame.index.year.isin(np.arange(2003, 2019))]
)
len(df)

5401767

In [54]:
def _print_xgb_errors(header, y_true, y_hat):
    """Print ``header`` followed by the MSE and MAE of ``y_hat`` against ``y_true``."""
    print(header)
    print('Mean squared error using XGBoost: ', mean_squared_error(y_true, y_hat))
    print('Mean absolute error using XGBoost: ', mean_absolute_error(y_true, y_hat))


def data_prep(df_in, months):
    """Train an XGBoost regressor on one set of months and predict the held-out years.

    Rows of ``df_in`` whose index month is in ``months`` are split by index
    year: 2003-2014 for training and 2015-2018 for testing. The first column
    is used as the regression target and all remaining columns as features.
    Features are standardised with statistics fitted on the training rows
    only. The test index, training/test MSE and MAE, and the model's feature
    importances (sorted, largest first) are printed.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Table whose index exposes ``.month`` and ``.year`` (a DatetimeIndex),
        with the target in column 0 and the features in the other columns.
    months : list of int
        Calendar months to keep, e.g. ``[12, 1, 2]``.

    Returns
    -------
    numpy.ndarray
        Model predictions for the test rows, in the row order of the test subset.

    Raises
    ------
    ValueError
        If the selected months leave no training rows or no test rows.
    """
    from sklearn.preprocessing import StandardScaler

    data = df_in[df_in.index.month.isin(months)]

    # Split by year; np.arange excludes its end value.
    index_train = data.index.year.isin(np.arange(2003, 2015))
    index_test = data.index.year.isin(np.arange(2015, 2019))
    data_train = data[index_train]
    data_test = data[index_test]
    if len(data_train) == 0 or len(data_test) == 0:
        raise ValueError(
            "No rows left for months {}: {} training rows, {} test rows".format(
                list(months), len(data_train), len(data_test)))
    print(data_test.index)

    # Column 0 is the target, every other column is a feature.
    X_train, y_train = data_train.iloc[:, 1:], data_train.iloc[:, 0]
    X_test, y_test = data_test.iloc[:, 1:], data_test.iloc[:, 0]

    # Fit the scaler on the training features only so that no test-set
    # statistics leak into training, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # NOTE(review): 'gpu_hist' / gpu_id need a GPU-enabled XGBoost build; newer
    # XGBoost releases reportedly deprecate them in favour of
    # tree_method='hist', device='cuda' -- confirm against the installed version.
    model = xgb.XGBRegressor(learning_rate=0.1,
                             max_depth=5,
                             n_estimators=200,
                             tree_method='gpu_hist', gpu_id=0)
    model.fit(X_train_scaled, y_train)

    ### save model ###
    #file_name = "/glade/work/hpzhang/LCC_pred/data/xgb_reg.pkl"
    #pickle.dump(model, open(file_name, "wb"))

    # Error on the data the model was fitted to.
    y_train_XGB = model.predict(X_train_scaled)
    _print_xgb_errors('Predict training set', y_train, y_train_XGB)

    # Error on the held-out years.
    y_pred_XGB = model.predict(X_test_scaled)
    _print_xgb_errors('Predict Test set', y_test, y_pred_XGB)

    # Feature ranking: importances labelled with the feature column names.
    feature_list = list(data.columns[1:])
    feature_imp_XGB = pd.Series(model.feature_importances_, index=feature_list).sort_values(ascending=False)
    print(feature_imp_XGB)

    return y_pred_XGB


# Calendar-month numbers for each season; each list is passed as `months` to data_prep.
DJF, MAM, JJA, SON = (
    [12, 1, 2],   # December-February
    [3, 4, 5],    # March-May
    [6, 7, 8],    # June-August
    [9, 10, 11],  # September-November
)


In [55]:
# Fit one model per season on `df` and keep its test-set predictions.
# Each call also prints the test index, error metrics and feature ranking.
y_pred_XGB_DJF = data_prep(df_in=df, months=DJF)
y_pred_XGB_MAM = data_prep(df_in=df, months=MAM)
y_pred_XGB_JJA = data_prep(df_in=df, months=JJA)
y_pred_XGB_SON = data_prep(df_in=df, months=SON)

DatetimeIndex(['2015-01-15', '2015-01-15', '2015-01-15', '2015-01-15',
               '2015-01-15', '2015-01-15', '2015-01-15', '2015-01-15',
               '2015-01-15', '2015-01-15',
               ...
               '2018-12-15', '2018-12-15', '2018-12-15', '2018-12-15',
               '2018-12-15', '2018-12-15', '2018-12-15', '2018-12-15',
               '2018-12-15', '2018-12-15'],
              dtype='datetime64[ns]', name='time', length=340004, freq=None)
Predict training set
Mean squared error using Random Forest:  0.0037101444
Mean absolute error Using Random Forest:  0.047233995
Predict Test set
Mean squared error using Random Forest:  0.0040342365
Mean absolute error Using Random Forest:  0.048993822
TH1000      0.517228
LTS         0.121151
OMEGA500    0.086752
dQ          0.044499
RH850       0.033326
TH850       0.032944
Q1000       0.022319
U1000       0.020945
SH          0.018697
PWV         0.018201
RH1000      0.017270
Tadv        0.015001
Q700        0.013975
Q850  

In [56]:
### output ###

# Write each season's test-set predictions to <out_dir>/y_pred_XGB_<season>.npy.
var = 'XGB_diff_season'
out_dir = "/glade/work/hpzhang/LCC_pred/data/{}".format(var)

# os.makedirs creates missing parents like `mkdir -p`, but raises on failure
# instead of returning an exit status that goes unchecked.
os.makedirs(out_dir, exist_ok=True)

for season_name, season_pred in (("DJF", y_pred_XGB_DJF),
                                 ("MAM", y_pred_XGB_MAM),
                                 ("JJA", y_pred_XGB_JJA),
                                 ("SON", y_pred_XGB_SON)):
    np.save(os.path.join(out_dir, "y_pred_XGB_{}.npy".format(season_name)),
            np.asarray(season_pred))



In [57]:
# Sanity check: number of DJF predictions returned by data_prep (one per DJF test row).
len(y_pred_XGB_DJF)

340004