In [1]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import ConfirmatoryFactorAnalyzer, ModelSpecificationParser
from scipy.stats import norm

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, 'data', 'processed')
RAW_DATA_PATH = os.path.join(project_root, 'data', 'raw')
OUTPUT_PATH = os.path.join(project_root, 'output')

In [2]:
data = pd.read_excel(RAW_DATA_PATH + '/HPM data_environmental performance.xlsx')

In [3]:
environmental_practices = pd.DataFrame()
environmental_performance = pd.DataFrame()
jit_practices = pd.DataFrame()

for column in data.columns:
    if column.startswith('ENVRTX') or column.startswith('EPRACX'):
        environmental_practices[column] = data[column]

    if column.startswith('EPERFX'):
        environmental_performance[column] = data[column]

    if column.startswith('LAYOUT') or column.startswith('JITDEL') or column.startswith('KANBAN'):
        jit_practices[column] = data[column]

bundles = [jit_practices, environmental_practices, environmental_performance]

In [4]:
# drop rows with NA values
for bundle in bundles:
    bundle.dropna(inplace=True)

for bundle in bundles:
    print(bundle.shape)

(267, 10)
(243, 41)
(277, 9)


In [5]:
# Model specification
model_dict = {
    "Environmental Practices": [],
    "JIT Practices": [],
    "Environmental Performance": []
}

In [6]:
for key in data.keys():
    if key.startswith('ENVRTX') or key.startswith('EPRACX'):
        if key not in model_dict['Environmental Practices']:
            model_dict['Environmental Practices'].append(key)

    if key.startswith('LAYOUT') or key.startswith('JITDEL') or key.startswith('KANBAN'):
        if key not in model_dict['JIT Practices']:
            model_dict["JIT Practices"].append(key)

    if key.startswith('EPERFX'):
        if key not in model_dict['Environmental Performance']:
            model_dict["Environmental Performance"].append(key)

In [7]:
# Extract all column names from the model_dict
desired_columns = [col for sublist in model_dict.values() for col in sublist]
data_filtered = data[desired_columns]
data_filtered.dropna(inplace=True)

# Adjusted model specification
model_spec = ModelSpecificationParser.parse_model_specification_from_dict(data_filtered, model_dict)

# CFA model
cfa = ConfirmatoryFactorAnalyzer(model_spec)
cfa.fit(data_filtered)

# Extract the factor loadings
loadings = cfa.loadings_

# Get the standard errors for loadings and intercepts
se_all = cfa.get_standard_errors()

# Extract standard errors for loadings
se_loadings = se_all[0]

# Compute t-values
t_values = loadings / se_loadings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered.dropna(inplace=True)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          246     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.54548D+04    |proj g|=  3.29773D+02

At iterate    1    f=  1.48732D+04    |proj g|=  1.23630D+02


 This problem is unconstrained.



At iterate    2    f=  1.47037D+04    |proj g|=  1.04946D+02



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  246      3     44      2     0     0   1.049D+02   1.470D+04
  F =   14703.670714870899     

ABNORMAL_TERMINATION_IN_LNSRCH                              



 Line search cannot locate an adequate point after MAXLS
  function and gradient evaluations.
  Previous x, f and g restored.
 Possible causes: 1 error in function or gradient evaluation;
                  2 rounding error dominate computation.
  t_values = loadings / se_loadings


In [8]:
data = {
    'Bundle': [],
    'Code': [],
    'Loading': [],
    'SE': [],
    't-value': []
}

# Loop through each factor and then each item within that factor
for factor, items in model_dict.items():
    for item in items:
        item_idx = data_filtered.columns.get_loc(item)
        factor_idx = list(model_dict.keys()).index(factor)

        loading_value = loadings[item_idx][factor_idx]
        se_value = se_loadings[item_idx][factor_idx]
        t_value = t_values[item_idx][factor_idx]

        data['Bundle'].append(factor)
        data['Code'].append(item)  # Using column names as descriptions for now
        data['Loading'].append(loading_value)
        data['SE'].append(se_value)
        data['t-value'].append(t_value)

# Convert the data to a DataFrame
results_df = pd.DataFrame(data)
print(results_df)


                       Bundle       Code   Loading        SE    t-value
0     Environmental Practices   ENVRTX21  0.899477  0.066349  13.556741
1     Environmental Practices   ENVRTX37  0.950057  0.077204  12.305769
2     Environmental Practices   ENVRTX02  0.964870  0.069595  13.864168
3     Environmental Practices   ENVRTX22  0.919310  0.068068  13.505842
4     Environmental Practices   ENVRTX39  1.017784  0.073744  13.801560
5     Environmental Practices   ENVRTX23  0.887866  0.065242  13.608905
6     Environmental Practices   ENVRTX18  1.036917  0.073302  14.145885
7     Environmental Practices   ENVRTX13  0.967737  0.073823  13.108862
8     Environmental Practices   ENVRTX33  0.999124  0.077164  12.947977
9     Environmental Practices   ENVRTX03  0.895811  0.063026  14.213277
10    Environmental Practices   ENVRTX20  1.062294  0.074628  14.234455
11    Environmental Practices   ENVRTX38  1.046556  0.070912  14.758464
12    Environmental Practices   ENVRTX08  0.910765  0.064602  14

In [9]:
# Compute p-values from t-values
p_values = [2 * norm.sf(abs(t)) for t in data['t-value']]

# Add p-values to the results dataframe
results_df['p-value'] = p_values

results_df

Unnamed: 0,Bundle,Code,Loading,SE,t-value,p-value
0,Environmental Practices,ENVRTX21,0.899477,0.066349,13.556741,7.227675e-42
1,Environmental Practices,ENVRTX37,0.950057,0.077204,12.305769,8.432975e-35
2,Environmental Practices,ENVRTX02,0.96487,0.069595,13.864168,1.044303e-43
3,Environmental Practices,ENVRTX22,0.91931,0.068068,13.505842,1.4445289999999998e-41
4,Environmental Practices,ENVRTX39,1.017784,0.073744,13.80156,2.493956e-43
5,Environmental Practices,ENVRTX23,0.887866,0.065242,13.608905,3.545154e-42
6,Environmental Practices,ENVRTX18,1.036917,0.073302,14.145885,1.980103e-45
7,Environmental Practices,ENVRTX13,0.967737,0.073823,13.108862,2.9296020000000002e-39
8,Environmental Practices,ENVRTX33,0.999124,0.077164,12.947977,2.4122749999999997e-38
9,Environmental Practices,ENVRTX03,0.895811,0.063026,14.213277,7.579357e-46


In [10]:
# Assuming 'data' is a pandas DataFrame and 't-value' is a column in that DataFrame
significance_levels = []

for p in results_df['p-value']:
    if p < 0.001:
        significance_levels.append('***')
    elif p < 0.01:
        significance_levels.append('**')
    elif p < 0.05:
        significance_levels.append('*')
    else:
        significance_levels.append('')

# Add the formatted p-values and significance levels as new columns to the DataFrame
results_df['significance'] = significance_levels

# Display the DataFrame
results_df


Unnamed: 0,Bundle,Code,Loading,SE,t-value,p-value,significance
0,Environmental Practices,ENVRTX21,0.899477,0.066349,13.556741,7.227675e-42,***
1,Environmental Practices,ENVRTX37,0.950057,0.077204,12.305769,8.432975e-35,***
2,Environmental Practices,ENVRTX02,0.96487,0.069595,13.864168,1.044303e-43,***
3,Environmental Practices,ENVRTX22,0.91931,0.068068,13.505842,1.4445289999999998e-41,***
4,Environmental Practices,ENVRTX39,1.017784,0.073744,13.80156,2.493956e-43,***
5,Environmental Practices,ENVRTX23,0.887866,0.065242,13.608905,3.545154e-42,***
6,Environmental Practices,ENVRTX18,1.036917,0.073302,14.145885,1.980103e-45,***
7,Environmental Practices,ENVRTX13,0.967737,0.073823,13.108862,2.9296020000000002e-39,***
8,Environmental Practices,ENVRTX33,0.999124,0.077164,12.947977,2.4122749999999997e-38,***
9,Environmental Practices,ENVRTX03,0.895811,0.063026,14.213277,7.579357e-46,***


In [11]:
# Load the JSON file containing the shorter descriptions
with open(PROCESSED_DATA_PATH + '/codes.json', 'r') as f:
    codes = json.load(f)

# Create a mapping of original codes to descriptive names
code_to_description = {item['original_code']: item['original_description'] for item in codes}

# Assuming 'results_df' already exists and contains the DataFrame you provided
original_descriptions = [code_to_description.get(code, code) for code in results_df['Code']]

# Insert the 'Original Description' column between 'Bundle' and 'Code'
results_df.insert(loc=results_df.columns.get_loc('Code'), column='Original Description', value=original_descriptions)

# Now, 'results_df' contains the 'Original Description' column inserted between 'Bundle' and 'Code'

In [12]:
# To save the results to a CSV file
results_df.to_csv(PROCESSED_DATA_PATH + '/cfa.csv', index=False)