In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import PCA

In [None]:
# Read the raw model-input CSV into the working frame used by the cells below.
df = pd.read_csv(
    filepath_or_buffer='../data/modelinput/information_governance_full_dataset_raw.csv',
)

In [None]:
# Indicators fed into the relational PCA: dataframe column name -> row label shown
# in the report table. The keys are used to select columns from the dataframe, so
# they keep the dataset's own spelling ('ambiquity'); only the display label is corrected.
rela_cols = {
    'pp_ambiquity_score_standardized_reverse': 'Ambiguity (standardized & inversed)',
    'pp_gunning_fog_index_standardized_reverse': 'Readability (standardized & inversed)',
    'pp_contact_option': 'Contact option',
    'pp_existence_of_a_transparency_report': 'Transparency report',
}
# Indicators fed into the formal PCA: dataframe column name -> row label shown in
# the report table. Keys are used to select columns from the dataframe and are left
# untouched; the 'Third party sharing' display label had a typo ('Third part').
form_cols = {
    'dummy_pp_legislation_complied_with_standard_reverse': 'Legislation applied (inversed)',
    'pp_third_party_sharing_reverse': 'Third party sharing (inversed)',
    'pp_existence_of_a_data_protection_officer': 'Data Protection Officer',
    'pp_iso_type': 'ISO security system',
}

def conduct_pca(dataf, cols, name, n_pcas):
    """Run a PCA on selected indicator columns and build a report table.

    A composite score is written to ``dataf[f'pca_{name}']`` (the frame is
    modified in place): each retained component's scores are weighted by that
    component's explained-variance ratio and the weighted scores are summed.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding the indicator columns; receives the composite column.
    cols : dict
        Maps column names in ``dataf`` to the row labels used in the report.
    name : str
        Suffix of the composite column, stored as ``pca_<name>``.
    n_pcas : int
        Number of components, passed to ``PCA(n_components=...)``.

    Returns
    -------
    report_table : pandas.DataFrame
        Rounded loadings (one row per entry of ``cols``) followed by an
        ``Eigenvalue`` row and a ``Percentage of variance explained`` row,
        with one ``PC<k>`` column per retained component.
    loadings : numpy.ndarray
        Unrounded loadings (components scaled by the square root of their
        explained variance), one row per entry of ``cols``.
    """
    pca = PCA(n_components=n_pcas)

    # list(cols) yields the keys in dict order, so loading rows line up with
    # the labels taken from cols.values() further down.
    components = pca.fit_transform(dataf[list(cols)])
    variance_ratio = pca.explained_variance_ratio_

    # Variance-weighted sum over every retained component. The previous
    # version spelled out exactly four terms, which raised IndexError for
    # n_pcas < 4 and silently ignored components beyond the fourth.
    dataf[f'pca_{name}'] = components @ variance_ratio

    print('Explained variance by component: %s' % variance_ratio)
    print(f'Explained variance by component 1: {round(variance_ratio[0]*100, 3)}%')
    print(f'Total variance explained by all components: {round(sum(variance_ratio)*100, 3)}%')

    # Label columns from the fitted result so the tables always match the
    # number of components actually returned.
    pc_labels = [f'PC{n}' for n in range(1, components.shape[1] + 1)]

    df_eigv = pd.DataFrame([["{:.2f}".format(x) for x in pca.explained_variance_]],
                           columns=pc_labels,
                           index=['Eigenvalue'])

    df_expl = pd.DataFrame([["{:.2f}".format(x*100) + "%" for x in variance_ratio]],
                           columns=pc_labels,
                           index=['Percentage of variance explained'])

    # Loadings: component directions scaled by the standard deviation along each component.
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    df_loadings = pd.DataFrame(loadings.round(3),
                               columns=pc_labels,
                               index=[f'  {c}' for c in cols.values()])

    report_table = pd.concat([df_loadings, df_eigv, df_expl])

    return report_table, loadings

In [None]:
# PCA over the relational indicators; the call also adds the
# 'pca_composite_relational_ig_practises' column to df. The raw loadings are discarded.
df_report, _ = conduct_pca(
    dataf=df,
    cols=rela_cols,
    name='composite_relational_ig_practises',
    n_pcas=4,
)

In [None]:
# Display the report table returned by conduct_pca for the relational indicators (rela_cols).
df_report

In [None]:
# PCA over the formal indicators; the call also adds the
# 'pca_composite_formal_ig_practises' column to df. The raw loadings are discarded.
df_report, _ = conduct_pca(
    dataf=df,
    cols=form_cols,
    name='composite_formal_ig_practises',
    n_pcas=4,
)

In [None]:
# Display the report table returned by conduct_pca for the formal indicators (form_cols).
df_report

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Plot defaults for the rest of the notebook: ggplot style sheet first, then the
# figure size (width, height in inches) so the size is applied on top of the style.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": [12, 8]})