# PCA Custom MLLeafPipe

In [8]:
import pandas as pd

In [9]:
def get_features_from_simulations():
    """
    Reads the simulation parameter map
    
    
    Returns:
    - c3.Dataset X: set of features
    """
    import pandas as pd
    
    # fetch simulation parameters
    parameters = c3.SimulationModelParameters.fetch().objs
    parameters = parameters.toJson()
    df = pd.DataFrame(parameters)
    simulations = pd.DataFrame(df['id'])
    X = df[df.columns[5:]]
    
    
    # cast into c3 Datasets
    X = c3.Dataset.fromPython(pythonData=X)
    
    return X

In [10]:
ds = get_features_from_simulations()

In [11]:
# define technique (hyper parameters)
pca_technique = c3.PrincipalComponentAnalysisTechnique(nComponents=20)

# build pipe
pca_pipe = c3.PrincipalComponentAnalysisPipe(technique=pca_technique)

# train model
trained_pca = pca_pipe.train(input=ds)

In [12]:
# transform data
pca_ds = trained_pca.process(input=ds)
pca_ds = c3.Dataset.toPandas(dataset=pca_ds)
pca_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.577729,-0.401115,0.396200,0.143843,0.186485,-0.059667,-0.492221,-0.178147,-0.348530,-0.131756,-0.193005,0.236349,0.085616,0.150702,0.284488,-0.024449,0.140391,0.146070,0.480833,0.088239
1,0.028757,-0.006830,0.021373,0.027038,-0.033451,-0.020538,-0.026111,0.006091,0.017431,0.042260,-0.021624,-0.006103,-0.033630,-0.005466,0.010637,0.003991,-0.000991,0.032794,0.001236,0.022484
2,-0.167081,0.279945,0.091416,0.306782,-0.210461,-0.437936,-0.164745,0.024634,-0.433461,0.478883,-0.222793,-0.251216,0.194517,-0.282956,-0.309872,-0.146254,0.450746,-0.443197,-0.747815,-0.443123
3,0.007985,0.236197,-0.163756,0.551707,-0.039802,-0.644095,-0.799279,0.034994,-0.061856,0.505336,-0.302650,0.588677,-0.031479,0.242692,-0.079870,0.047802,-0.049068,0.751324,0.203260,0.268843
4,-0.323451,-0.345466,-0.085678,0.319016,0.386521,0.160847,-0.536892,0.280681,0.026709,0.071952,0.335992,-0.753590,0.078402,0.091183,-0.175158,-0.124225,-0.536645,-0.097455,0.228396,0.020304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,-0.413250,0.091873,-0.178845,0.585575,0.706350,-0.750478,0.693568,0.214998,0.265329,-0.071015,-0.809572,-0.150165,-0.162903,0.128144,-0.217003,0.260750,-0.065818,0.203368,-0.092861,0.040512
217,0.761347,0.292965,-0.326160,-0.005250,0.036186,0.409310,0.286414,0.330012,-0.168501,0.167132,-0.093913,-0.049234,-0.589292,0.147854,0.391311,-0.436773,0.137121,-0.058063,0.511787,0.085223
218,-0.475350,-0.747525,0.497035,-0.111652,0.174455,-0.901829,-0.038252,-0.427437,-0.124865,-0.316776,0.348506,-0.124286,0.334559,0.092295,-0.266721,-0.039805,-0.041739,-0.575784,0.495440,0.159892
219,-0.277001,0.822553,-0.483951,0.415579,0.016724,0.081436,0.331014,-0.382131,0.774942,0.334005,-0.037689,0.388581,0.160879,-0.951359,0.132437,-0.383526,0.146264,0.481257,-0.211906,-0.174453


# Scikit-Learn

In [13]:
from sklearn.decomposition import PCA

In [14]:
#get data
ds = get_features_from_simulations()
ds = c3.Dataset.toPandas(dataset=ds)

In [15]:
# fit model
pca = PCA(n_components=20)
pca.fit(ds)

PCA(n_components=20)

In [16]:
# print singular values
pca.singular_values_

array([6.82007004, 6.62041065, 6.50005385, 6.30891814, 6.19404677,
       6.06812862, 5.95981408, 5.91592315, 5.84186373, 5.71189546,
       5.64046569, 5.53279817, 5.45154954, 5.40859582, 5.33537576,
       5.2859376 , 5.19641485, 5.12958128, 5.11144777, 4.95510099])

In [17]:
# transform data
pca_data = pca.transform(ds)
pca_data = pd.DataFrame(pca_data)
pca_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.577729,-0.401115,0.396200,0.143843,0.186485,-0.059667,-0.492221,-0.178147,-0.348530,-0.131756,-0.193005,0.236349,0.085616,0.150702,0.284488,-0.024449,0.140391,0.146070,0.480833,0.088239
1,0.028757,-0.006830,0.021373,0.027038,-0.033451,-0.020538,-0.026111,0.006091,0.017431,0.042260,-0.021624,-0.006103,-0.033630,-0.005466,0.010637,0.003991,-0.000991,0.032794,0.001236,0.022484
2,-0.167081,0.279945,0.091416,0.306782,-0.210461,-0.437936,-0.164745,0.024634,-0.433461,0.478883,-0.222793,-0.251216,0.194517,-0.282956,-0.309872,-0.146254,0.450746,-0.443197,-0.747815,-0.443123
3,0.007985,0.236197,-0.163756,0.551707,-0.039802,-0.644095,-0.799279,0.034994,-0.061856,0.505336,-0.302650,0.588677,-0.031479,0.242692,-0.079870,0.047802,-0.049068,0.751324,0.203260,0.268843
4,-0.323451,-0.345466,-0.085678,0.319016,0.386521,0.160847,-0.536892,0.280681,0.026709,0.071952,0.335992,-0.753590,0.078402,0.091183,-0.175158,-0.124225,-0.536645,-0.097455,0.228396,0.020304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,-0.413250,0.091873,-0.178845,0.585575,0.706350,-0.750478,0.693568,0.214998,0.265329,-0.071015,-0.809572,-0.150165,-0.162903,0.128144,-0.217003,0.260750,-0.065818,0.203368,-0.092861,0.040512
217,0.761347,0.292965,-0.326160,-0.005250,0.036186,0.409310,0.286414,0.330012,-0.168501,0.167132,-0.093913,-0.049234,-0.589292,0.147854,0.391311,-0.436773,0.137121,-0.058063,0.511787,0.085223
218,-0.475350,-0.747525,0.497035,-0.111652,0.174455,-0.901829,-0.038252,-0.427437,-0.124865,-0.316776,0.348506,-0.124286,0.334559,0.092295,-0.266721,-0.039805,-0.041739,-0.575784,0.495440,0.159892
219,-0.277001,0.822553,-0.483951,0.415579,0.016724,0.081436,0.331014,-0.382131,0.774942,0.334005,-0.037689,0.388581,0.160879,-0.951359,0.132437,-0.383526,0.146264,0.481257,-0.211906,-0.174453


# Comparing

In [18]:
diff = pca_data - pca_ds

In [19]:
diff.sum()

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
0     0.0
1     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
dtype: float64