In [45]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
from scipy.stats import norm
from scipy.stats import t

# Locate the project root (two directory levels above the working directory)
# and derive every data/report location from it.
current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))

PROCESSED_DATA_PATH, RAW_DATA_PATH = (
    os.path.join(project_root, "data", stage) for stage in ("processed", "raw")
)
OUTPUT_PATH = os.path.join(project_root, "output")
REPORTS_TABLES_PATH = os.path.join(project_root, "reports", "tables")

In [46]:
# Read the raw HPM workbook and keep only rows without any missing value.
raw_workbook = RAW_DATA_PATH + "/HPM data_environmental performance.xlsx"
data = pd.read_excel(raw_workbook).dropna()

In [47]:
# Remove every column whose name begins with 'OUTCMX'.
outcmx_columns = [name for name in data.columns if name.startswith('OUTCMX')]
data = data.drop(columns=outcmx_columns)

In [49]:
# Build the analysis frame without the three columns that are excluded from
# the factor analysis. `drop` returns a new frame, so `data` is left intact
# (previously `df` was an alias of `data` and the in-place drop altered both),
# and the cell can be re-run without raising KeyError on already-removed columns.
df = data.drop(columns=['COMPANY CODE', 'INDUSTRY', 'ACCTGX51'])

In [50]:
# Restrict the analysis frame to its float/int columns.
numeric_cols = df.select_dtypes(include=[float, int]).columns
df_numeric = df.loc[:, numeric_cols]

In [152]:
# Number of latent factors to extract.
n_factors = 4

# Fit an exploratory factor model with varimax rotation on the numeric items.
fa = FactorAnalyzer(rotation="varimax", n_factors=n_factors)
fa.fit(df_numeric)

# Loadings: correlation coefficients between the observed variables (rows)
# and the latent common factors (columns).
loadings = fa.loadings_
factor_labels = [f'Factor {k}' for k in range(1, n_factors + 1)]
loadings_df = pd.DataFrame(loadings, index=df_numeric.columns, columns=factor_labels)

# Variance summary of the extracted factors, as reported by the fitted model.
fa_variance = fa.get_factor_variance()


In [153]:
# Label the loading matrix: one row per item, one column per extracted factor.
loadings_df = pd.DataFrame(
    loadings,
    columns=[f'Factor {i+1}' for i in range(loadings.shape[1])],
    index=df_numeric.columns,
)

# Assemble the per-item EFA summary column by column. The number of
# 'Factor<N> Loading' columns follows the loading matrix, so the table stays
# correct when n_factors changes (it used to be hard-coded to four factors).
# A dedicated name is used so the `data` DataFrame is not overwritten by a dict.
efa_summary = {'Item': list(df_numeric.columns)}
for factor_idx in range(loadings.shape[1]):
    efa_summary[f'Factor{factor_idx + 1} Loading'] = loadings_df.iloc[:, factor_idx].to_numpy()

# Communalities and uniquenesses are positional, in the same order as
# df_numeric.columns, so they are fetched once and attached whole.
efa_summary['Communalities'] = fa.get_communalities()
efa_summary['Uniqueness'] = fa.get_uniquenesses()

# Convert the collected columns to a DataFrame
results_df = pd.DataFrame(efa_summary)

In [154]:
# Show the per-item loadings, communalities and uniquenesses assembled above.
results_df

Unnamed: 0,Item,Factor1 Loading,Factor2 Loading,Factor3 Loading,Factor4 Loading,Communalities,Uniqueness
0,ENVRTX21,0.466959,0.269219,0.096065,-0.028572,0.300575,0.699425
1,ENVRTX37,0.035086,0.611979,0.099528,0.060900,0.389364,0.610636
2,ENVRTX02,0.528713,0.315107,0.075925,0.190885,0.421031,0.578969
3,ENVRTX22,0.522113,0.284784,0.040687,0.195948,0.393755,0.606245
4,ENVRTX39,0.505989,0.384795,0.139262,0.113055,0.436268,0.563732
...,...,...,...,...,...,...,...
66,SCHEDN02,0.046864,0.044748,0.590540,0.055828,0.356053,0.643947
67,SCHEDR03,0.101961,-0.205958,-0.165573,0.092640,0.088811,0.911189
68,SETUPN01,0.173492,0.041694,0.483328,0.033342,0.266556,0.733444
69,SETUPN02,-0.038365,0.128977,0.490261,0.185677,0.292938,0.707062


In [70]:
# NOTE(review): the saved output of this cell lists seven factor columns, whereas
# the factor-analysis cell sets n_factors=4 -- presumably left over from an
# earlier 7-factor run; re-run or remove it so the display matches the model.
results_df

Unnamed: 0,Item,Factor1 Loading,Factor2 Loading,Factor3 Loading,Factor4 Loading,Factor5 Loading,Factor6 Loading,Factor7 Loading,Communalities,Uniqueness
0,ENVRTX21,0.532621,0.150151,-0.004066,0.134042,-0.151002,0.199095,-0.021661,0.387124,0.612876
1,ENVRTX37,0.140435,0.602282,0.060092,0.084481,0.034121,0.059155,-0.057759,0.401213,0.598787
2,ENVRTX02,0.586812,0.234817,0.202250,0.165902,0.013889,-0.173368,-0.005486,0.498194,0.501806
3,ENVRTX22,0.601928,0.140007,0.230969,0.095113,-0.153372,0.134791,-0.067327,0.490537,0.509463
4,ENVRTX39,0.598789,0.255328,0.144016,0.146331,0.003950,0.075800,-0.052536,0.474416,0.525584
...,...,...,...,...,...,...,...,...,...,...
66,SCHEDN02,0.014485,0.092668,0.033279,0.598005,0.150265,0.100850,0.051707,0.402938,0.597062
67,SCHEDR03,0.067118,-0.210256,0.088075,-0.051575,-0.142441,-0.153667,-0.004559,0.103053,0.896947
68,SETUPN01,0.166175,0.049954,0.018739,0.528237,0.122216,0.003138,0.005951,0.324477,0.675523
69,SETUPN02,-0.042762,0.182598,0.160871,0.568166,0.043584,0.054853,-0.040493,0.390410,0.609590


In [71]:
# Name the export after the number of factor-loading columns actually present
# in results_df, so the file label cannot drift from the table it contains
# (a hard-coded "7" would mislabel, and overwrite, a different factor solution).
n_saved_factors = sum(str(col).endswith(' Loading') for col in results_df.columns)
csv_path = os.path.join(PROCESSED_DATA_PATH, f"{n_saved_factors}_factor_efa.csv")

# Save the results to a CSV file (without the positional index)
results_df.to_csv(csv_path, index=False)

In [150]:
def dataframe_to_latex(df, file_name, label="tab:my_label", caption="My Caption"):
    """
    Convert a pandas DataFrame to a LaTeX table with specific column modifications.
    The function formats numeric columns, highlights certain values, and saves the table to a file.
    LaTeX packages like 'colortbl' should be included in the preamble of your LaTeX document.

    Parameters:
    - df: pandas DataFrame to convert.
    - file_name: Name of the file to save the LaTeX code.
    - label: Label for the table in LaTeX.
    - caption: Caption for the table in LaTeX.
    """
    # Format and highlight factor loading columns
    factor_loading_cols = [col for col in df.columns if 'Factor' in col and 'Loading' in col]
    for col in factor_loading_cols:
        df[col] = df[col].astype(float).round(2)
        df[col] = df.apply(lambda x: "\\cellcolor{yellow}" + str(x[col]) if x[col] > 0.50 else str(x[col]), axis=1)

    # Define the column format with adjusted widths
    num_columns = len(df.columns)
    column_format = 'l' + 'l' * (num_columns - 1)  # Adjust this based on your DataFrame structure
    df = df.rename(columns={'Item': 'HPM Code', 'Communalities': 'Communality', 'Uniqueness': 'Uniqueness'})
    df = df.rename(columns={'Factor1 Loading': 'Factor 1', 'Factor2 Loading': 'Factor 2', 'Factor3 Loading': 'Factor 3', 'Factor4 Loading': 'Factor 4', 'Factor5 Loading': 'Factor 5', 'Factor6 Loading': 'Factor 6', 'Factor7 Loading': 'Factor 7'})
    # Start the landscape page and set the font size to small for the table
    latex_code = "\\begin{landscape}\n\\small\n"
    latex_code += df.to_latex(index=False, longtable=True, caption=caption, label=label, 
                              column_format=column_format, header=True, escape=False)
    latex_code += "\n\\end{landscape}"

    # Save to file
    with open(file_name, 'w') as file:
        file.write(latex_code)

# Use the function with your DataFrame
# Example: dataframe_to_latex(your_dataframe, 'your_file_name.tex')


In [151]:
# Write the LaTeX table into the report tables folder, reusing the CSV's base
# name so both artefacts always describe the same factor solution.
table_stem = os.path.splitext(os.path.basename(csv_path))[0]
latex_file_name = os.path.join(REPORTS_TABLES_PATH, table_stem + '.tex')

# Reload the saved EFA table from disk and render it.
df = pd.read_csv(csv_path)
dataframe_to_latex(df, latex_file_name, label="tab:EFA", caption="Exploratory Factor Analysis - Highlighting loadings > 0.5")