# Principal Components Analysis

## Preprocess

In [1]:
# load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# from statsmodels.multivariate.pca import PCA

import warnings

# Suppress every warning category for the rest of the session. This keeps the
# notebook output clean, but it also hides deprecation and numerical warnings
# raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Load the pickled DataFrame that every later cell operates on.
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only point
# adl_path at a file from a trusted source.
adl_path = "../data/data_adl.pkl"
df = pd.read_pickle(adl_path)

## Function Definition

In [3]:
def pca_result(df, n=1):
    """Fit a PCA on ``df`` and collect the fitted model with its diagnostics.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to decompose. Its ``columns`` label the rows of the loadings
        table.
    n : optional
        Forwarded unchanged as ``PCA(n_components=n)``. Defaults to 1.

    Returns
    -------
    dict
        ``"pca"``: the fitted ``PCA`` estimator.
        ``"pca_components"``: DataFrame of ``components_`` transposed, indexed
        by ``df.columns`` with columns ``Component1``, ``Component2``, ...
        ``"explained_variance"``: sum of ``explained_variance_ratio_``.
        ``"data_dimension_reduction"``: output of ``fit_transform(df)``.
        ``"data_reconstruction"``: inverse-transformed scores, rounded to
        zero decimals with non-positive entries set to 0.
        ``"data_reconstruction_error"``: mean over rows of the summed squared
        difference between that reconstruction and ``df``.
    """
    model = PCA(n_components=n)
    scores = model.fit_transform(df)

    # Map the scores back to the original feature space, round to whole
    # numbers, then overwrite anything at or below zero with 0.
    rebuilt = model.inverse_transform(scores).round(0)
    rebuilt[rebuilt <= 0] = 0

    # Squared error summed per row, then averaged across rows. The error is
    # measured against the rounded and clipped reconstruction.
    residual = rebuilt - df
    row_sse = np.sum(np.square(residual), axis=1)
    mean_sse = np.mean(row_sse)

    # One labelled column per fitted component, one row per input column.
    n_fitted = model.components_.shape[0]
    labels = [f"Component{k}" for k in range(1, n_fitted + 1)]
    loadings = pd.DataFrame(model.components_.T, columns=labels, index=df.columns)

    return {
        "pca": model,
        "pca_components": loadings,
        "explained_variance": model.explained_variance_ratio_.sum(),
        "data_dimension_reduction": scores,
        "data_reconstruction": rebuilt,
        "data_reconstruction_error": mean_sse,
    }

## PCA

In [4]:
# Fit the 1-, 2- and 3-component models on the same data, in that order.
pca1, pca2, pca3 = (pca_result(df, k) for k in (1, 2, 3))

In [10]:
# Display the loaded DataFrame (the table below is this cell's output).
df

Unnamed: 0,H2a,H2b,H2c,H2d,H2e,H2f,H2g,H2h,H2i,H2j
0,0,1,0,3,0,1,0,0,0,1
1,5,5,6,6,5,5,5,5,5,5
2,0,1,1,3,1,2,0,2,2,2
3,0,0,1,3,0,0,0,1,2,1
4,0,1,1,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
25389,0,1,1,2,0,0,0,0,1,1
25390,0,0,0,1,0,0,0,0,0,0
25391,3,4,2,2,3,3,2,3,2,4
25392,3,3,3,3,2,2,1,3,1,2


In [16]:
# Project the raw data onto the loadings of each fitted model.
# NOTE(review): this multiplies the *uncentered* data by the loadings, whereas
# sklearn's PCA.transform subtracts the fitted mean first. The results will
# therefore differ from each model's "data_dimension_reduction" by a constant
# per-component offset -- confirm that this is intended.
df_pca1, df_pca2, df_pca3 = (
    df.to_numpy() @ fitted["pca_components"].to_numpy()
    for fitted in (pca1, pca2, pca3)
)

In [17]:
# Write each projected array to its own .npy file.
for out_path, scores in (
    ("../npy/data_adl_pca1.npy", df_pca1),
    ("../npy/data_adl_pca2.npy", df_pca2),
    ("../npy/data_adl_pca3.npy", df_pca3),
):
    with open(out_path, "wb") as f:
        np.save(f, scores)

In [5]:
# Let PCA pick the number of components itself (n_components="mle"), then
# show how many it kept. fit() returns the estimator, so the call is chained.
pca = PCA(n_components="mle").fit(df)
pca.n_components_

9