In [1]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
from scipy.stats import norm
from scipy.stats import t

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, "data", "processed")
RAW_DATA_PATH = os.path.join(project_root, "data", "raw")
OUTPUT_PATH = os.path.join(project_root, "output")
REPORTS_TABLES_PATH = os.path.join(project_root, "reports", "tables")

In [2]:
data = pd.read_excel(RAW_DATA_PATH + "/HPM data_environmental performance.xlsx")
data = data.dropna()

In [3]:
for column in data.columns:
    if column.startswith('OUTCMX'):
        # drop column
        data = data.drop(column, axis=1)

In [4]:
df = data 

In [5]:
numeric_cols = df.select_dtypes(include=[float, int]).columns
df_numeric = df[numeric_cols]

In [6]:
n_factors=3

# Perform factor analysis using the number of factors you decided on
fa = FactorAnalyzer(n_factors=n_factors, rotation="varimax")
fa.fit(df_numeric)

# Check the loadings (correlation coefficients between observed variables and latent common factors)
loadings = fa.loadings_

# Create DataFrame for loadings
loadings_df = pd.DataFrame(loadings, columns=[f'Factor {i+1}' for i in range(n_factors)],
                           index=df_numeric.columns)
# print(loadings)

# Get variance of each factors
fa_variance = fa.get_factor_variance()
# print(fa_variance)


In [7]:
loadings_df = pd.DataFrame(loadings, columns=[f'Factor {i+1}' for i in range(loadings.shape[1])],
                            index=df_numeric.columns)

# Create a DataFrame to store the results similar to the one you used for CFA
data = {
    'Item': [],
    'Factor1 Loading': [],
    'Factor2 Loading': [],  
    'Factor3 Loading': [],  
    'Communalities': [],
    'Uniqueness': []
}

for item in df_numeric.columns:
    item_loadings = loadings_df.loc[item].values  # Loadings for this item
    item_comm = fa.get_communalities()[df_numeric.columns.get_loc(item)]  # Communality for this item
    item_uniq = fa.get_uniquenesses()[df_numeric.columns.get_loc(item)]  # Uniqueness for this item

    data['Item'].append(item)
    data['Factor1 Loading'].append(item_loadings[0])
    data['Factor2 Loading'].append(item_loadings[1])  
    data['Factor3 Loading'].append(item_loadings[2]) 
    data['Communalities'].append(item_comm)
    data['Uniqueness'].append(item_uniq)

# Convert the data to a DataFrame
results_df = pd.DataFrame(data)

In [8]:
results_df

Unnamed: 0,Item,Factor1 Loading,Factor2 Loading,Factor3 Loading,Communalities,Uniqueness
0,COMPANY CODE,0.033973,0.130765,0.098606,0.027977,0.972023
1,INDUSTRY,0.037026,-0.056594,0.084929,0.011787,0.988213
2,ENVRTX21,0.539807,0.096528,-0.026116,0.301391,0.698609
3,ENVRTX37,0.349367,0.178554,0.172325,0.183635,0.816365
4,ENVRTX02,0.609755,0.077487,0.195079,0.415861,0.584139
...,...,...,...,...,...,...
69,SCHEDN02,0.051838,0.584383,0.047019,0.346401,0.653599
70,SCHEDR03,-0.024440,-0.201190,0.045655,0.043159,0.956841
71,SETUPN01,0.158355,0.477082,0.015459,0.252923,0.747077
72,SETUPN02,0.020886,0.503967,0.203888,0.295989,0.704011


In [9]:
# Load the JSON file containing the shorter descriptions
with open(PROCESSED_DATA_PATH + '/codes.json', 'r') as f:
    codes = json.load(f)

# Create a mapping of original codes to descriptive names
code_to_name = {item['original_code']: item['original_description'] for item in codes}

results_df['descriptive_name'] = results_df['Item'].map(code_to_name)

In [10]:
results_df

Unnamed: 0,Item,Factor1 Loading,Factor2 Loading,Factor3 Loading,Communalities,Uniqueness,descriptive_name
0,COMPANY CODE,0.033973,0.130765,0.098606,0.027977,0.972023,
1,INDUSTRY,0.037026,-0.056594,0.084929,0.011787,0.988213,
2,ENVRTX21,0.539807,0.096528,-0.026116,0.301391,0.698609,Environmentally preferable packaging for the p...
3,ENVRTX37,0.349367,0.178554,0.172325,0.183635,0.816365,Using a third party to monitor working conditi...
4,ENVRTX02,0.609755,0.077487,0.195079,0.415861,0.584139,Water efficiency
...,...,...,...,...,...,...,...
69,SCHEDN02,0.051838,0.584383,0.047019,0.346401,0.653599,We usually complete our daily schedule as plan...
70,SCHEDR03,-0.024440,-0.201190,0.045655,0.043159,0.956841,"We build extra slack into our daily schedule, ..."
71,SETUPN01,0.158355,0.477082,0.015459,0.252923,0.747077,We are aggressively working to lower setup tim...
72,SETUPN02,0.020886,0.503967,0.203888,0.295989,0.704011,We have low setup times of equipment in our pl...


In [11]:
# To save the results to a CSV file
results_df.to_csv(OUTPUT_PATH + '/total_efa.csv', index=False)