# Data Loading & Preprocessing
***

In [4]:
import numpy as np
import pandas as pd
import xarray as xr


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import ShuffleSplit
from sklearn import metrics

<br>

### a) Loading & Normalization

**pred:** contains the predictor data used as model inputs  <br>
**labels:** summer rainfall over the Sahel, which serves as validation data

In [5]:
# Read the predictor dataset into a DataFrame. The context manager closes the
# NetCDF file handle once the values have been copied into pandas.
with xr.open_dataset('data/da_pred_all.nc') as pred_ds:
    pred = pred_ds.to_dataframe()

# Standardize every predictor column with StandardScaler, keeping the original
# column names and index.
pred_unit = pd.DataFrame(
    data = StandardScaler().fit_transform(pred),
    columns = pred.columns,
    index = pred.index
)


# Load validation data (Summer Rainfall over Sahel):
# skip the 8 header lines, take columns 7-9 of every row, scale by 0.01 and
# average those three columns per row.
# NOTE(review): columns 7:10 presumably hold the summer months and 0.01 is
# presumably a unit conversion; confirm against the header of the text file.
labels = np.mean(
    np.loadtxt("data/da_o_sahelprecip19012017.txt", skiprows=8)[:, 7:10] * 0.01,
    axis=1,
)


In [9]:
# Sanity check of the standardization: per-column population standard deviation
# (ddof=0, the convention StandardScaler uses) and per-column mean.
# The DataFrame methods are called directly because np.std / np.mean on a
# DataFrame forward axis=None, which newer pandas versions interpret (or are
# deprecating toward interpreting) as a reduction over the whole frame, giving
# one scalar instead of one value per column.
print(pred_unit.std(ddof=0))
print(pred_unit.mean())

siod_e        1.0
siod_w        1.0
sst_med       1.0
tsa           1.0
tna           1.0
sst_mdr       1.0
sata_lnh      1.0
sata_lsh      1.0
sata_onh      1.0
sata_osh      1.0
slp_darwin    1.0
slp_tahiti    1.0
amo           1.0
nao           1.0
pdo           1.0
np            1.0
nino12        1.0
nino3         1.0
nino34        1.0
nino4         1.0
dtype: float64
siod_e        5.963890e-16
siod_w       -7.287618e-16
sst_med      -6.604404e-16
tsa           1.100734e-16
tna           1.966139e-15
sst_mdr      -1.685262e-15
sata_lnh     -4.554761e-17
sata_lsh      3.036507e-17
sata_onh     -2.656944e-17
sata_osh      1.005843e-16
slp_darwin    7.521524e-14
slp_tahiti   -6.993456e-15
amo          -5.503670e-17
nao          -3.510962e-17
pdo          -1.043799e-17
np           -3.759576e-14
nino12       -3.316435e-16
nino3        -1.274289e-14
nino34        1.280078e-15
nino4         4.605053e-15
dtype: float64


In [6]:
# Preview the first five rows of the standardized predictors
pred_unit.head(n=5)

Unnamed: 0_level_0,siod_e,siod_w,sst_med,tsa,tna,sst_mdr,sata_lnh,sata_lsh,sata_onh,sata_osh,slp_darwin,slp_tahiti,amo,nao,pdo,np,nino12,nino3,nino34,nino4
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-1.100027,-1.152764,-0.74553,-0.595366,0.388372,0.608415,-0.123443,-0.732091,-0.497808,-0.737797,0.074807,1.634819,0.923204,0.917456,-0.193321,1.938388,-0.950168,-0.595561,-0.214314,-0.07927
1902,0.088643,0.340415,-1.507314,-0.954566,-0.346586,-0.173588,-1.289978,-0.20181,-1.175314,-0.987096,1.443896,2.682485,-0.620146,-1.17259,0.819716,-0.162154,0.991321,0.969845,1.099218,1.070532
1903,-0.900789,0.669332,-2.243639,-2.186294,-0.10197,0.283583,-1.333183,-1.076056,-1.415719,-1.333946,-0.071881,1.535042,-0.45829,-1.03041,-0.186187,0.530864,-0.371251,0.000784,0.524139,0.842095
1904,-0.949568,-1.056219,-0.079925,-1.975498,-2.214111,-1.894743,-1.135674,-1.133384,-1.863746,-1.778347,-0.903114,1.235708,-1.872482,1.447076,-0.892459,0.756497,-0.307712,-0.234313,-0.475713,-0.741738
1905,-0.03435,-0.632249,-0.718895,-1.684676,-1.334312,-1.014906,-1.314666,-0.595938,-1.284589,-0.954579,0.759351,-2.655622,-0.499163,-1.289888,0.545055,-0.326007,1.22783,1.497381,1.439037,1.032459


<br>

### b) PCA

### **Question:** Should the PCs be normalized again? --> See: [Data loading & preprocessing](https://keras.io/getting_started/intro_to_keras_for_engineers/#data-loading-amp-preprocessing) 
The raw PCA output has non-unit variance (the cell below re-standardizes it for now)...\
Or does another normalization wipe out the information carried by the PCs?
Badr did not mention anything like that.

In [10]:
# Scikit-learn PCA on the standardized predictors; PCA() without arguments
# keeps all components.
pca = PCA()
principalComponents = pca.fit_transform(pred_unit)


# Create Pandas DF from PCs.
# Column names PC1..PCk are derived from the actual number of components
# instead of a hard-coded 20, so the cell keeps working if the set of
# predictors changes.
col = [f'PC{i}' for i in range(1, principalComponents.shape[1] + 1)]

pred_pc_raw = pd.DataFrame(
    data = principalComponents,
    columns = col,
    index = pred_unit.index
)

# Re-standardize the principal components to zero mean / unit variance.
# A separate name is used for the unscaled scores so that `pred_pc` is not
# rebuilt from itself.
pred_pc = pd.DataFrame(
    data = StandardScaler().fit_transform(pred_pc_raw),
    columns = pred_pc_raw.columns,
    index = pred_pc_raw.index
)

pred_pc.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-0.559909,-0.428988,-1.229027,-1.123983,1.443562,-0.639119,-2.027823,0.286405,0.338136,1.205316,0.146156,-1.320392,-0.192548,1.936219,-1.308915,0.245816,-0.056096,0.894325,1.415203,0.141363
1902,-0.186229,1.465476,-0.286784,1.955787,2.317371,0.267679,-2.265856,-0.297191,0.249276,1.134709,-0.201523,0.906435,-0.847902,0.524257,2.052709,-0.626695,-0.218014,0.01864,0.155975,-0.257408
1903,-0.841694,0.939467,-0.937924,1.243579,1.248253,-1.379562,-2.158236,0.566613,-0.578406,1.315825,-2.446049,1.118266,-0.463878,0.605398,0.508777,0.550893,-0.280664,0.183667,0.963069,0.090541
1904,-1.845083,0.338553,0.828001,-0.690346,1.324063,1.099307,-0.505069,-0.892902,-0.850675,-0.416318,-0.958225,0.346329,-1.07565,0.598971,0.011722,-1.387453,0.651789,-0.363297,1.812307,0.83125
1905,-0.457849,2.306684,0.502853,0.393567,-0.922199,-1.544284,1.30542,-1.530165,0.81534,-0.566052,-0.512668,-0.472273,0.436924,-0.458513,0.988631,-1.166436,1.427537,-0.265735,1.244387,1.318074


In [11]:
# Sanity check of the re-standardized PCs: per-column population standard
# deviation (ddof=0, the convention StandardScaler uses) and per-column mean.
# The DataFrame methods are called directly because np.std / np.mean on a
# DataFrame forward axis=None, which newer pandas versions interpret (or are
# deprecating toward interpreting) as a reduction over the whole frame, giving
# one scalar instead of one value per column.
print(pred_pc.std(ddof=0))
print(pred_pc.mean())

PC1     1.0
PC2     1.0
PC3     1.0
PC4     1.0
PC5     1.0
PC6     1.0
PC7     1.0
PC8     1.0
PC9     1.0
PC10    1.0
PC11    1.0
PC12    1.0
PC13    1.0
PC14    1.0
PC15    1.0
PC16    1.0
PC17    1.0
PC18    1.0
PC19    1.0
PC20    1.0
dtype: float64
PC1     1.973730e-16
PC2     3.510962e-17
PC3     5.930679e-17
PC4    -2.846726e-18
PC5     4.744543e-19
PC6    -3.795634e-17
PC7     3.795634e-18
PC8    -2.324826e-17
PC9     6.642360e-17
PC10   -1.470808e-17
PC11   -2.656944e-17
PC12   -1.138690e-17
PC13    1.550872e-17
PC14   -1.518254e-17
PC15    1.138690e-17
PC16    1.423363e-17
PC17    2.182490e-17
PC18   -3.705562e-17
PC19   -1.613145e-17
PC20   -1.802926e-17
dtype: float64


<br>

### c) Select PCs for input and turn into numpy array

In [5]:
# Columns of `pred_pc` that are used as model inputs. Defaults to every PC,
# so the resulting array is the same as converting the whole frame.
# An earlier candidate subset was:
#   ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC13']
selected_pcs = list(pred_pc.columns)

# Select the chosen PCs and convert to a plain NumPy array (rows = samples).
features = pred_pc.loc[:, selected_pcs].to_numpy()
features.shape

(117, 20)