# Purposes

1) Use classification to predict simulation outcomes from the input parameters

2) Use sensitivity analysis and feature selection to determine the causes of simulation crashes

# Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests
from numpy.linalg import eig
from sklearn.decomposition import PCA

# Marker output confirming that the import cell ran to completion.
print("Done")

Done


# Read Data

## Overview of the dataset
Column 1: Latin hypercube study ID (study 1 to study 3)

Column 2: simulation ID (run 1 to run 180)

Columns 3-20: values of 18 climate model parameters scaled in the interval [0, 1]

Column 21: simulation outcome (0 = failure, 1 = success)

In [None]:
# Optional, Colab-only nicety: render DataFrames as interactive tables.
# Guarded so the notebook still runs top-to-bottom outside Google Colab,
# where the `google.colab` package is not installed.
try:
    from google.colab import data_table
except ImportError:
    print("google.colab not available; using the default DataFrame display.")
else:
    data_table.enable_dataframe_formatter()

In [None]:
# Load the simulation-failure dataset described in the overview above.
url = "https://raw.githubusercontent.com/lephanthutra/Observational-Data-Analytics/main/pop_failures.dat"
# Fields are separated by runs of two or more whitespace characters, or by commas.
# The pattern is a raw string so that `\s` reaches the regex engine verbatim
# instead of being treated as an (invalid) Python string escape.
data = pd.read_csv(url, sep=r'\s\s+|,', engine='python')
data



Unnamed: 0,Study,Run,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,ah_bolus,...,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl,outcome
0,1,1,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.448620,0.307522,0.858310,0.796997,0.869893,0
1,1,2,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.616870,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1
2,1,3,0.997600,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1
3,1,4,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1
4,1,5,0.406250,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.376660,0.280098,0.132283,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3,176,0.657136,0.489375,0.133713,0.411950,0.087780,0.356289,0.480204,0.029678,...,0.280546,0.384117,0.885948,0.768482,0.459479,0.334482,0.573002,0.610183,0.737706,1
536,3,177,0.915894,0.842720,0.518947,0.090622,0.336981,0.893576,0.978703,0.674868,...,0.798108,0.353546,0.044796,0.990900,0.347027,0.512499,0.810549,0.593332,0.142565,0
537,3,178,0.478600,0.941185,0.769245,0.950776,0.189406,0.112743,0.745645,0.527096,...,0.193103,0.829563,0.101506,0.548878,0.381966,0.198811,0.867108,0.461632,0.652817,1
538,3,179,0.007793,0.779287,0.867468,0.704820,0.983282,0.420303,0.710612,0.174746,...,0.761134,0.436714,0.690132,0.825133,0.981656,0.113193,0.364799,0.201469,0.536535,1


In [None]:
# Summarise the loaded table: number of rows, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Study                 540 non-null    int64  
 1   Run                   540 non-null    int64  
 2   vconst_corr           540 non-null    float64
 3   vconst_2              540 non-null    float64
 4   vconst_3              540 non-null    float64
 5   vconst_4              540 non-null    float64
 6   vconst_5              540 non-null    float64
 7   vconst_7              540 non-null    float64
 8   ah_corr               540 non-null    float64
 9   ah_bolus              540 non-null    float64
 10  slm_corr              540 non-null    float64
 11  efficiency_factor     540 non-null    float64
 12  tidal_mix_max         540 non-null    float64
 13  vertical_decay_scale  540 non-null    float64
 14  convect_corr          540 non-null    float64
 15  bckgrnd_vdc1          5

## Take features

In [None]:
# Keep only the 18 model-parameter columns: drop the two ID columns and the
# outcome label. Dropping by name (instead of slicing by position with
# iloc[:, 2:20]) makes this cell idempotent: re-running it is a no-op rather
# than slicing the already-reduced table a second time and losing features.
data = data.drop(columns=["Study", "Run", "outcome"], errors="ignore")
data

Unnamed: 0,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,ah_bolus,slm_corr,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl
0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,0.474370,0.245675,0.104226,0.869091,0.997518,0.448620,0.307522,0.858310,0.796997,0.869893
1,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,0.296618,0.616870,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256
2,0.997600,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,0.815667,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858
3,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,0.010015,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987
4,0.406250,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,0.358534,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.376660,0.280098,0.132283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,0.657136,0.489375,0.133713,0.411950,0.087780,0.356289,0.480204,0.029678,0.400102,0.280546,0.384117,0.885948,0.768482,0.459479,0.334482,0.573002,0.610183,0.737706
536,0.915894,0.842720,0.518947,0.090622,0.336981,0.893576,0.978703,0.674868,0.263398,0.798108,0.353546,0.044796,0.990900,0.347027,0.512499,0.810549,0.593332,0.142565
537,0.478600,0.941185,0.769245,0.950776,0.189406,0.112743,0.745645,0.527096,0.870987,0.193103,0.829563,0.101506,0.548878,0.381966,0.198811,0.867108,0.461632,0.652817
538,0.007793,0.779287,0.867468,0.704820,0.983282,0.420303,0.710612,0.174746,0.267685,0.761134,0.436714,0.690132,0.825133,0.981656,0.113193,0.364799,0.201469,0.536535


In [None]:
# Confirm the reduced table: only the float-valued feature columns should remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   vconst_corr           540 non-null    float64
 1   vconst_2              540 non-null    float64
 2   vconst_3              540 non-null    float64
 3   vconst_4              540 non-null    float64
 4   vconst_5              540 non-null    float64
 5   vconst_7              540 non-null    float64
 6   ah_corr               540 non-null    float64
 7   ah_bolus              540 non-null    float64
 8   slm_corr              540 non-null    float64
 9   efficiency_factor     540 non-null    float64
 10  tidal_mix_max         540 non-null    float64
 11  vertical_decay_scale  540 non-null    float64
 12  convect_corr          540 non-null    float64
 13  bckgrnd_vdc1          540 non-null    float64
 14  bckgrnd_vdc_ban       540 non-null    float64
 15  bckgrnd_vdc_eq        5

# Find the variances of all random variables

In [None]:
# Per-column variance of the feature columns.
print(data.var())

vconst_corr             0.083486
vconst_2                0.083476
vconst_3                0.083560
vconst_4                0.083517
vconst_5                0.083421
vconst_7                0.083436
ah_corr                 0.083527
ah_bolus                0.083469
slm_corr                0.083440
efficiency_factor       0.083501
tidal_mix_max           0.083594
vertical_decay_scale    0.083529
convect_corr            0.083418
bckgrnd_vdc1            0.083491
bckgrnd_vdc_ban         0.083477
bckgrnd_vdc_eq          0.083413
bckgrnd_vdc_psim        0.083484
Prandtl                 0.083529
dtype: float64


# Find the covariance between pairs of random variables. 

In [None]:
# Covariance matrix of the features. The table is transposed so that each
# feature is a row of the input handed to np.cov; the result is a plain NumPy
# array (see the output), so the column labels are not carried over.
cov_matrix = np.cov(data.T)
cov_matrix

array([[ 8.34857426e-02,  3.37182453e-04,  7.79370095e-04,
        -1.52759197e-03,  1.57560856e-03,  1.28822143e-04,
         3.10140305e-04, -1.06308268e-03,  1.95004524e-04,
         8.86476013e-04, -1.18672254e-03, -7.50870304e-04,
        -2.48727601e-04, -1.78044114e-04, -1.75200201e-04,
         1.33292076e-03, -1.38844336e-03, -1.22475126e-04],
       [ 3.37182453e-04,  8.34760543e-02, -3.81230709e-05,
        -5.13067098e-05, -6.91929682e-04, -2.03457607e-03,
        -4.32713144e-04,  3.48841434e-04, -1.15670304e-03,
        -9.24372682e-04,  1.64614996e-03,  1.35518820e-04,
         2.17668253e-04, -1.22858509e-03,  3.66116880e-04,
         5.00547769e-04,  3.50800753e-04,  7.63300552e-04],
       [ 7.79370095e-04, -3.81230709e-05,  8.35596343e-02,
         8.26966510e-04,  5.25050823e-04, -1.32474043e-04,
         1.66592117e-03,  3.67613813e-04, -6.42559768e-04,
         5.93066538e-04, -7.87998398e-04, -2.06372468e-03,
        -1.72298390e-03, -3.56164290e-04, -4.35164520e

# Find the correlation between pairs of random variables. 

In [None]:
# Pairwise correlation between the feature columns, kept as a labelled table.
corr_matrix = data.corr()
corr_matrix

Unnamed: 0,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,ah_bolus,slm_corr,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl
vconst_corr,1.0,0.004039,0.009331,-0.018294,0.01888,0.001544,0.003714,-0.012735,0.002336,0.010617,-0.014205,-0.008992,-0.00298,-0.002133,-0.002099,0.015973,-0.016631,-0.001467
vconst_2,0.004039,1.0,-0.000456,-0.000614,-0.008292,-0.024379,-0.005182,0.004179,-0.01386,-0.011072,0.019706,0.001623,0.002608,-0.014716,0.004386,0.005999,0.004202,0.009141
vconst_3,0.009331,-0.000456,1.0,0.009899,0.006289,-0.001587,0.019941,0.004402,-0.007695,0.0071,-0.009428,-0.024702,-0.020637,-0.004264,-0.00521,-0.000559,0.004771,-0.001334
vconst_4,-0.018294,-0.000614,0.009899,1.0,0.020504,0.021931,0.001805,-0.002334,-0.001731,-0.004753,0.01832,-0.010004,-0.006762,0.020442,-0.00108,-0.009262,-0.017147,0.005053
vconst_5,0.01888,-0.008292,0.006289,0.020504,1.0,0.005887,-0.003047,0.012453,0.003634,0.001077,0.021354,-0.016312,0.02138,0.009894,-0.019179,-0.020752,-0.009324,0.012265
vconst_7,0.001544,-0.024379,-0.001587,0.021931,0.005887,1.0,-0.01677,-0.021644,0.001244,0.015121,7.5e-05,0.015288,0.007036,-0.003641,-0.007897,-0.006576,0.013203,0.008412
ah_corr,0.003714,-0.005182,0.019941,0.001805,-0.003047,-0.01677,1.0,-0.035498,-0.005119,0.009604,-0.006832,0.016503,0.002921,0.012447,-0.003368,0.007051,0.002443,-0.002381
ah_bolus,-0.012735,0.004179,0.004402,-0.002334,0.012453,-0.021644,-0.035498,1.0,-0.009403,0.01226,0.012005,-0.003947,-0.019307,-0.010642,0.004866,0.032398,0.000259,0.007055
slm_corr,0.002336,-0.01386,-0.007695,-0.001731,0.003634,0.001244,-0.005119,-0.009403,1.0,0.00876,0.002575,0.002272,0.002633,-0.003043,0.006023,-0.008447,-0.002301,0.014281
efficiency_factor,0.010617,-0.011072,0.0071,-0.004753,0.001077,0.015121,0.009604,0.01226,0.00876,1.0,-0.017926,0.018009,0.011925,-0.034026,0.003393,0.009925,-0.005241,-0.004465


# Make a scatter plot of the data and see relationships between the plots and the covariances and correlations. 


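=> The scatter plots show no visible structure, in line with the near-zero covariances and correlations above: no feature is redundant with another, so we conclude that all features are important.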

In [None]:
# Grid of pairwise scatter plots, one panel per pair of feature columns.
sns.pairplot(data)

# Extract important features. 

## Find eigenvalues and eigenvectors of the covariance matrix. 

Why do we need to use eigenvalues and eigenvectors?

How do we apply them to extract important features?

In [None]:
# The covariance matrix is symmetric, so use the symmetric solver: it guarantees
# real eigenvalues and orthonormal eigenvectors (stored as columns).
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# eigh returns the eigenvalues in ascending order. Reorder to descending so that
# index 0 is the direction of largest variance, and permute the eigenvector
# columns with the same index so each column stays paired with its eigenvalue.
descending = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending]
eigenvectors = eigenvectors[:, descending]

In [None]:
# Show the eigenvalues of the covariance matrix (one per feature dimension).
print("Eigenvalues:\n",eigenvalues)

Eigenvalues:
 [0.09010916 0.08984972 0.07697012 0.07727164 0.07820514 0.07893949
 0.07970403 0.08819722 0.08748292 0.0810872  0.0816109  0.08657826
 0.08332811 0.08382111 0.08537507 0.08505451 0.08449341 0.08468789]


In [None]:
# Show the eigenvector matrix; the checks below use column i as the eigenvector paired with eigenvalues[i].
print("Eigenvectors:\n",eigenvectors)

Eigenvectors:
 [[ 2.02647015e-02  1.80957021e-01  1.42492976e-01 -4.20526304e-01
   9.61757372e-02  7.03147177e-02 -7.69913514e-03  2.28728929e-01
   3.60310933e-01  2.20400909e-02 -5.71123947e-02  2.00143721e-01
   5.53031727e-01  2.60918704e-01 -2.55769227e-01  1.58491641e-01
   1.78241038e-01  1.82395315e-01]
 [ 2.21376253e-01 -1.86863396e-01  1.59647045e-01  2.18683799e-01
  -1.09408985e-02 -3.35529079e-01 -2.19419953e-01 -1.53478459e-01
  -7.53706969e-02 -2.78099602e-01  2.36815868e-01  3.89230038e-01
   3.06617763e-01 -2.20022696e-01 -1.86199080e-01 -1.34769897e-01
   2.59444563e-01 -3.24335892e-01]
 [ 4.96022969e-02 -7.47453244e-02 -1.45802852e-01  3.57357697e-04
  -8.85505216e-02  2.34815798e-01 -5.48919233e-01  4.25602429e-01
   3.39590440e-01  1.85243866e-01  1.01212864e-01 -7.30327174e-02
  -9.16862256e-02 -1.25719045e-01 -5.75145384e-02 -4.49494403e-01
  -7.50005564e-02 -1.57670857e-01]
 [-3.44538619e-01 -2.98780583e-01 -4.23895796e-02 -3.02742682e-01
   3.29390022e-01  1.2

### Check whether the eigenvalues and eigenvectors were computed correctly

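For an eigenpair $(\lambda, v)$ of the covariance matrix $\Sigma$, the defining equation is $\Sigma v = \lambda v$. If LHS ($\Sigma v$) = RHS ($\lambda v$) => accurate results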

In [None]:
# LHS of the eigen-equation: the covariance matrix applied to the first
# eigenvector (column 0 of `eigenvectors`).
print(cov_matrix @ eigenvectors[:, 0])


[ 0.00182604  0.01994803  0.00446962 -0.03104608 -0.03551357 -0.02802545
 -0.00547792  0.03327667 -0.01225321  0.00631157 -0.01088107  0.00097184
 -0.02675952 -0.01361222  0.0188947   0.04085926  0.01769812 -0.00043035]


In [None]:
# RHS of the eigen-equation: the first eigenvalue times the first eigenvector
# (column 0 of `eigenvectors`).
print(eigenvalues[0]*eigenvectors[:, 0])

# Do not rely on eyeballing the two printouts: verify cov_matrix @ v == lambda * v
# numerically for every eigenpair. `eigenvectors * eigenvalues` scales column i
# by eigenvalue i through broadcasting.
assert np.allclose(cov_matrix @ eigenvectors, eigenvectors * eigenvalues), \
    "eigen-decomposition does not satisfy cov_matrix @ v = lambda * v"

[ 0.00182604  0.01994803  0.00446962 -0.03104608 -0.03551357 -0.02802545
 -0.00547792  0.03327667 -0.01225321  0.00631157 -0.01088107  0.00097184
 -0.02675952 -0.01361222  0.0188947   0.04085926  0.01769812 -0.00043035]


## Apply PCA to transform the data matrix.

In [None]:
# feature extraction: fit a 9-component PCA on the feature table.
# NOTE(review): 9 is a hard-coded choice with no stated rationale; confirm the intended number of components.
pca = PCA(n_components=9)
# `fit` is what later cells read the fitted attributes from; presumably it is the
# same object as `pca` (scikit-learn estimators conventionally return self from fit); confirm.
fit = pca.fit(data)

In [None]:
# Share of the variance attributed to each retained component; per the output
# each is roughly 6%, i.e. no single component dominates.
a = fit.explained_variance_ratio_
a

array([0.05996221, 0.05978956, 0.05868992, 0.05821461, 0.05761261,
       0.05681196, 0.05659864, 0.05635468, 0.05622527])

In [None]:
# Principal axes of the fitted PCA: one row per retained component, one entry per feature.
b = fit.components_
b

array([[-0.0202647 , -0.22137625, -0.0496023 ,  0.34453862,  0.39411726,
         0.31101666,  0.06079206, -0.36929289,  0.13598183, -0.07004365,
         0.12075431, -0.01078512,  0.29696778,  0.1510637 , -0.20968677,
        -0.45344182, -0.19640756,  0.00477587],
       [-0.18095702,  0.1868634 ,  0.07474532,  0.29878058,  0.19221507,
        -0.18253796, -0.22812477,  0.26012361, -0.11217554, -0.43650164,
         0.40688867, -0.38378717, -0.21594639,  0.2533088 ,  0.12636245,
        -0.04897788,  0.0463362 ,  0.01511392],
       [ 0.22872893, -0.15347846,  0.42560243,  0.05296198, -0.0423085 ,
        -0.1234507 ,  0.4474616 , -0.2677057 , -0.15609099, -0.15729062,
        -0.30351916, -0.24605002, -0.23567003,  0.37851784, -0.1376637 ,
         0.13671551,  0.03469471, -0.1087899 ],
       [ 0.36031093, -0.0753707 ,  0.33959044,  0.06630907,  0.41816582,
         0.03271364, -0.20517648,  0.38654112,  0.00506655,  0.3488494 ,
        -0.07757337, -0.22999969, -0.08577611, -0.261

In [None]:
# summarize components
# The values printed under "Explained Variance" are the variance *ratios*
# (fit.explained_variance_ratio_), not absolute variances.
print("Explained Variance:\n", fit.explained_variance_ratio_)
# Principal axes, printed after a blank line.
print("\n",fit.components_)

Explained Variance:
 [0.05996221 0.05978956 0.05868992 0.05821461 0.05761261 0.05681196
 0.05659864 0.05635468 0.05622527]

 [[-0.0202647  -0.22137625 -0.0496023   0.34453862  0.39411726  0.31101666
   0.06079206 -0.36929289  0.13598183 -0.07004365  0.12075431 -0.01078512
   0.29696778  0.1510637  -0.20968677 -0.45344182 -0.19640756  0.00477587]
 [-0.18095702  0.1868634   0.07474532  0.29878058  0.19221507 -0.18253796
  -0.22812477  0.26012361 -0.11217554 -0.43650164  0.40688867 -0.38378717
  -0.21594639  0.2533088   0.12636245 -0.04897788  0.0463362   0.01511392]
 [ 0.22872893 -0.15347846  0.42560243  0.05296198 -0.0423085  -0.1234507
   0.4474616  -0.2677057  -0.15609099 -0.15729062 -0.30351916 -0.24605002
  -0.23567003  0.37851784 -0.1376637   0.13671551  0.03469471 -0.1087899 ]
 [ 0.36031093 -0.0753707   0.33959044  0.06630907  0.41816582  0.03271364
  -0.20517648  0.38654112  0.00506655  0.3488494  -0.07757337 -0.22999969
  -0.08577611 -0.26141845 -0.23519765  0.07913176 -0.238098

quantify and predict the probability of failure