In [44]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import ConfirmatoryFactorAnalyzer, ModelSpecificationParser
from scipy.stats import norm

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, "data", "processed")
RAW_DATA_PATH = os.path.join(project_root, "data", "raw")
OUTPUT_PATH = os.path.join(project_root, "output")
REPORTS_TABLES_PATH = os.path.join(project_root, "reports", "tables")

In [2]:
data = pd.read_excel(RAW_DATA_PATH + "/HPM data_environmental performance.xlsx")

In [3]:
environmental_practices = pd.DataFrame()
environmental_performance = pd.DataFrame()
sustainability_outcomes = pd.DataFrame()
jit_practices = pd.DataFrame()

for column in data.columns:
    if column.startswith("ENVRTX") or column.startswith("EPRACX"):
        environmental_practices[column] = data[column]

    if column.startswith("EPERFX"):
        environmental_performance[column] = data[column]

    if column.startswith("OUTCMX"):
        sustainability_outcomes[column] = data[column]

    if (
        column.startswith("LAYOUT")
        or column.startswith("JITDEL")
        or column.startswith("KANBAN")
        or column.startswith("LINKCN")
        or column.startswith("SCHEDN")
        or column.startswith("SETUPN")
    ):
        jit_practices[column] = data[column]

bundles = [
    jit_practices,
    environmental_practices,
    environmental_performance,
    sustainability_outcomes,
]

In [4]:
# drop rows with NA values
for bundle in bundles:
    bundle.dropna(inplace=True)

for bundle in bundles:
    print(bundle.shape)

(261, 20)
(243, 41)
(277, 9)
(279, 17)


In [5]:
# Model specification
model_dict = {
    "Environmental Practices": [],
    "JIT Practices": [],
    "Environmental Performance": []
    # "Sustainability Outcomes": []
}

In [6]:
for key in data.keys():
    if key.startswith("ENVRTX") or key.startswith("EPRACX"):
        if key not in model_dict["Environmental Practices"]:
            model_dict["Environmental Practices"].append(key)

    if key.startswith("LAYOUT") or key.startswith("JITDEL") or key.startswith("KANBAN") or key.startswith("LINKCN") or key.startswith("SCHEDN") or key.startswith("SETUPN"):
        if key not in model_dict["JIT Practices"]:
            model_dict["JIT Practices"].append(key)

    if key.startswith("EPERFX"):
        if key not in model_dict["Environmental Performance"]:
            model_dict["Environmental Performance"].append(key)

    # if key.startswith('OUTCMX'):
    #     if key not in model_dict['Sustainability Outcomes']:
    #         model_dict["Sustainability Outcomes"].append(key)

In [7]:
# Extract all column names from the model_dict
desired_columns = [col for sublist in model_dict.values() for col in sublist]
data_filtered = data[desired_columns]
data_filtered.dropna(inplace=True)

# Adjusted model specification
model_spec = ModelSpecificationParser.parse_model_specification_from_dict(
    data_filtered, model_dict
)

# CFA model
cfa = ConfirmatoryFactorAnalyzer(model_spec)
cfa.fit(data_filtered)

# Extract the factor loadings
loadings = cfa.loadings_

# Get the standard errors for loadings and intercepts
se_all = cfa.get_standard_errors()

# Extract standard errors for loadings
se_loadings = se_all[0]

# Compute t-values
t_values = loadings / se_loadings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered.dropna(inplace=True)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          286     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.79539D+04    |proj g|=  3.27767D+02


 This problem is unconstrained.



At iterate    1    f=  1.73053D+04    |proj g|=  1.19774D+02

At iterate    2    f=  1.70074D+04    |proj g|=  2.40716D+02

At iterate    3    f=  1.69413D+04    |proj g|=  1.22384D+02

At iterate    4    f=  1.68929D+04    |proj g|=  1.20711D+02

At iterate    5    f=  1.68469D+04    |proj g|=  9.03921D+01

At iterate    6    f=  1.68210D+04    |proj g|=  8.23144D+01



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  286      7     49      2     0     0   8.231D+01   1.682D+04
  F =   16820.965014976580     

ABNORMAL_TERMINATION_IN_LNSRCH                              



 Line search cannot locate an adequate point after MAXLS
  function and gradient evaluations.
  Previous x, f and g restored.
 Possible causes: 1 error in function or gradient evaluation;
                  2 rounding error dominate computation.
  t_values = loadings / se_loadings


In [8]:
data = {"Bundle": [], "Code": [], "Loading": [], "SE": [], "t-value": []}

# Loop through each factor and then each item within that factor
for factor, items in model_dict.items():
    for item in items:
        item_idx = data_filtered.columns.get_loc(item)
        factor_idx = list(model_dict.keys()).index(factor)

        loading_value = loadings[item_idx][factor_idx]
        se_value = se_loadings[item_idx][factor_idx]
        t_value = t_values[item_idx][factor_idx]

        data["Bundle"].append(factor)
        data["Code"].append(item)  # Using column names as descriptions for now
        data["Loading"].append(loading_value)
        data["SE"].append(se_value)
        data["t-value"].append(t_value)

# Convert the data to a DataFrame
results_df = pd.DataFrame(data)
print(results_df)

                       Bundle      Code   Loading        SE    t-value
0     Environmental Practices  ENVRTX21  0.630869  0.056775  11.111819
1     Environmental Practices  ENVRTX37  0.801641  0.082212   9.750851
2     Environmental Practices  ENVRTX02  0.880841  0.067925  12.967847
3     Environmental Practices  ENVRTX22  0.693499  0.060621  11.439927
4     Environmental Practices  ENVRTX39  1.063097  0.078747  13.500101
..                        ...       ...       ...       ...        ...
65  Environmental Performance  EPERFX05  0.894034  0.056995  15.686117
66  Environmental Performance  EPERFX06  0.810767  0.056368  14.383432
67  Environmental Performance  EPERFX07  0.696855  0.051512  13.527908
68  Environmental Performance  EPERFX08  0.594950  0.050840  11.702306
69  Environmental Performance  EPERFX09  0.843766  0.072924  11.570442

[70 rows x 5 columns]


In [9]:
# Compute p-values from t-values
p_values = [2 * norm.sf(abs(t)) for t in data["t-value"]]

# Add p-values to the results dataframe
results_df["p-value"] = p_values

results_df

Unnamed: 0,Bundle,Code,Loading,SE,t-value,p-value
0,Environmental Practices,ENVRTX21,0.630869,0.056775,11.111819,1.098983e-28
1,Environmental Practices,ENVRTX37,0.801641,0.082212,9.750851,1.829275e-22
2,Environmental Practices,ENVRTX02,0.880841,0.067925,12.967847,1.861873e-38
3,Environmental Practices,ENVRTX22,0.693499,0.060621,11.439927,2.641090e-30
4,Environmental Practices,ENVRTX39,1.063097,0.078747,13.500101,1.561609e-41
...,...,...,...,...,...,...
65,Environmental Performance,EPERFX05,0.894034,0.056995,15.686117,1.882218e-55
66,Environmental Performance,EPERFX06,0.810767,0.056368,14.383432,6.575097e-47
67,Environmental Performance,EPERFX07,0.696855,0.051512,13.527908,1.070256e-41
68,Environmental Performance,EPERFX08,0.594950,0.050840,11.702306,1.240365e-31


In [10]:
# Assuming 'data' is a pandas DataFrame and 't-value' is a column in that DataFrame
significance_levels = []

for p in results_df["p-value"]:
    if p < 0.001:
        significance_levels.append("***")
    elif p < 0.01:
        significance_levels.append("**")
    elif p < 0.05:
        significance_levels.append("*")
    else:
        significance_levels.append("")

# Add the formatted p-values and significance levels as new columns to the DataFrame
results_df["significance"] = significance_levels

# Display the DataFrame
results_df

Unnamed: 0,Bundle,Code,Loading,SE,t-value,p-value,significance
0,Environmental Practices,ENVRTX21,0.630869,0.056775,11.111819,1.098983e-28,***
1,Environmental Practices,ENVRTX37,0.801641,0.082212,9.750851,1.829275e-22,***
2,Environmental Practices,ENVRTX02,0.880841,0.067925,12.967847,1.861873e-38,***
3,Environmental Practices,ENVRTX22,0.693499,0.060621,11.439927,2.641090e-30,***
4,Environmental Practices,ENVRTX39,1.063097,0.078747,13.500101,1.561609e-41,***
...,...,...,...,...,...,...,...
65,Environmental Performance,EPERFX05,0.894034,0.056995,15.686117,1.882218e-55,***
66,Environmental Performance,EPERFX06,0.810767,0.056368,14.383432,6.575097e-47,***
67,Environmental Performance,EPERFX07,0.696855,0.051512,13.527908,1.070256e-41,***
68,Environmental Performance,EPERFX08,0.594950,0.050840,11.702306,1.240365e-31,***


In [11]:
# Load the JSON file containing the shorter descriptions
with open(PROCESSED_DATA_PATH + "/codes.json", "r") as f:
    codes = json.load(f)

# Create a mapping of original codes to descriptive names
code_to_description = {
    item["original_code"]: item["original_description"] for item in codes
}

# Assuming 'results_df' already exists and contains the DataFrame you provided
original_descriptions = [
    code_to_description.get(code, code) for code in results_df["Code"]
]

# Insert the 'Original Description' column between 'Bundle' and 'Code'
results_df.insert(
    loc=results_df.columns.get_loc("Code"),
    column="Original Description",
    value=original_descriptions,
)

# Now, 'results_df' contains the 'Original Description' column inserted between 'Bundle' and 'Code'

In [28]:
# To save the results to a CSV file
results_df.to_csv(PROCESSED_DATA_PATH + "/cfa.csv", index=False)

In [48]:
def dataframe_to_latex(df, file_name, label="tab:my_label", caption="My Caption"):
    """
    Convert a pandas DataFrame to a LaTeX table with specific column modifications,
    save it to a file with landscape orientation, apply text wrapping to the
    'Item Description' column, show the 'Bundle' name only once for each group,
    format numeric columns to two decimal places, and remove all internal borders.

    Parameters:
    - df: pandas DataFrame to convert.
    - file_name: Name of the file to save the LaTeX code.
    - label: Label for the table in LaTeX.
    - caption: Caption for the table in LaTeX.
    """
    # Rename columns
    df.rename(columns={'Original Description': 'Item Description', 'Code': 'HPM Code'}, inplace=True)
    
    # Format 'SE' and 't-value' to two decimal places
    df['SE'] = df['SE'].round(2).astype(str)
    df['t-value'] = df['t-value'].round(2).astype(str)
    
    # Combine 'Significance' with 'Loading', rounded to two decimal places
    df['Loading'] = df['Loading'].round(2).astype(str) + df['significance'].fillna('')
    
    # Drop the 'p-value' and 'significance' columns from the dataframe
    df.drop(['p-value', 'significance'], axis=1, inplace=True)
    
    # Move 'HPM Code' column after 'Bundle'
    cols = ['Bundle', 'HPM Code', 'Item Description', 'Loading', 'SE', 't-value']  # Update this list as needed
    df = df[cols]
    
    # Replace repeated 'Bundle' names with empty string
    df['Bundle'] = df['Bundle'].where(df['Bundle'] != df['Bundle'].shift(), '')
    
    # Define the column format without internal borders
    column_format = 'l@{\hspace{6pt}}l@{\hspace{6pt}}p{11cm}@{\hspace{6pt}}l@{\hspace{6pt}}l@{\hspace{6pt}}l'
    
    # Start the landscape page and set the font size to small for the table
    latex_code = "\\begin{landscape}\n\\small\n"
    
    # Add the longtable environment with specified format
    latex_code += df.to_latex(index=False, longtable=True, caption=caption, label=label,
                              column_format=column_format, header=True, escape=False)
    
    # End the landscape page
    latex_code += "\n\\end{landscape}"
    
    # Save to file
    with open(file_name, 'w') as file:
        file.write(latex_code)


In [49]:
csv_path = PROCESSED_DATA_PATH + "/cfa.csv"
latex_file_name = REPORTS_TABLES_PATH + '/cfa.tex'
df = pd.read_csv(csv_path) 
dataframe_to_latex(df, latex_file_name, label="tab:your_label", caption="Confirmatory Factor Analysis")