# First Exploratory Data Analysis 

Import necessary dependencies

In [34]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import ConfirmatoryFactorAnalyzer, ModelSpecificationParser
from scipy.stats import norm

Set directories for project and data folders

In [15]:
current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, 'data', 'processed')
RAW_DATA_PATH = os.path.join(project_root, 'data', 'raw')
OUTPUT_PATH = os.path.join(project_root, 'output')

Load descriptions and descriptive names for scale

In [27]:
# Load the JSON file containing the shorter descriptions
with open(PROCESSED_DATA_PATH + '/codes.json', 'r') as f:
    codes = json.load(f)

# Create a mapping of original codes to descriptive names
code_to_name = {item['original_code']: item['descriptive_name'] for item in codes}

# Load the data (assuming you have a CSV file named 'data.csv')
data = pd.read_excel(RAW_DATA_PATH + '/HPM data_environmental performance.xlsx')

# Rename columns using the shorter descriptions
data.rename(columns=code_to_name, inplace=True)

# Display the first few rows of the dataset to get a feel for the data
print(data.head())

# Summary statistics
print("\nSummary Statistics:\n")
print(data.describe())

# Check for missing values
print("\nMissing Values:\n")
print(data.isnull().sum())

  COUNTRY  COMPANY CODE  INDUSTRY  Environmentally Preferable Packaging  \
0     BRA          1701         3                                   4.0   
1     BRA          1702         1                                   NaN   
2     BRA          1703         1                                   3.0   
3     BRA          1704         3                                   2.0   
4     BRA          1705         3                                   2.0   

   Third Party Monitoring of Supplier Working Conditions  Water Efficiency  \
0                                                2.0                   4.0   
1                                                NaN                   NaN   
2                                                2.0                   3.0   
3                                                4.0                   2.0   
4                                                1.0                   3.0   

   Substituting Environmentally Preferable Direct Materials  \
0                

In [17]:
len(codes)

68

In [None]:

# Visualize the distribution of numerical variables
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()

for feature in numerical_features:
    plt.figure(figsize=(8,5))
    sns.histplot(data[feature], bins=50, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

# Correlation heatmap for numerical features
plt.figure(figsize=(12,10))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Normalization (if needed)
# Here, we'll use Min-Max normalization as an example
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
data_normalized = pd.DataFrame(scaler.fit_transform(data[numerical_features]), columns=numerical_features)

# Check normalized data
print("\nNormalized Data:\n")
print(data_normalized.head())


In [28]:
# Drop rows with missing values (or you can choose other imputation methods)
df_cleaned = data.dropna(subset=['Environmentally Preferable Packaging', 'Third Party Monitoring of Supplier Working Conditions',
                              'Water Efficiency', 'Substituting Environmentally Preferable Direct Materials',
                              'Equipment Layout - Low Inventories', 'Equipment Layout - Minimized Handling',
                              'Equipment Layout - JIT Production', 'JIT Delivery by Suppliers - Timely Delivery',
                              'JIT Delivery by Suppliers - Daily Shipments', 'JIT Delivery by Suppliers - Pull System',
                              'Kanban - Supplier Containers', 'Kanban - Production Control Pull System',
                              'Kanban - Production Control Signals'])

# Assuming df_cleaned is the dataframe after dropping NaN values
numeric_cols = df_cleaned.select_dtypes(include=[float, int]).columns
df_numeric = df_cleaned[numeric_cols]

# Model specification
model_dict = {
    "Environmental Practices": ['Environmentally Preferable Packaging', 'Third Party Monitoring of Supplier Working Conditions',
                               'Water Efficiency', 'Substituting Environmentally Preferable Direct Materials'],
    "Equipment Layout": ['Equipment Layout - Low Inventories', 'Equipment Layout - Minimized Handling',
                         'Equipment Layout - JIT Production'],
    "JIT & Kanban": ['JIT Delivery by Suppliers - Timely Delivery', 'JIT Delivery by Suppliers - Daily Shipments',
                     'JIT Delivery by Suppliers - Pull System', 'Kanban - Supplier Containers',
                     'Kanban - Production Control Pull System', 'Kanban - Production Control Signals']
}

# Extract all column names from the model_dict
desired_columns = [col for sublist in model_dict.values() for col in sublist]

# Filter df_numeric to include only those columns
df_numeric_filtered = df_numeric[desired_columns]

# Ensure model_dict keys and values are within numeric_cols
filtered_model_dict = {k: [col for col in v if col in numeric_cols] for k, v in model_dict.items() if all(col in numeric_cols for col in v)}

In [29]:
# Adjusted model specification
model_spec = ModelSpecificationParser.parse_model_specification_from_dict(df_numeric_filtered, filtered_model_dict)

# CFA model
cfa = ConfirmatoryFactorAnalyzer(model_spec)
cfa.fit(df_numeric_filtered)

# Extract the factor loadings
loadings = cfa.loadings_

# Get the standard errors for loadings and intercepts
se_all = cfa.get_standard_errors()

# Extract standard errors for loadings
se_loadings = se_all[0]

# Compute t-values
t_values = loadings / se_loadings

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           58     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.87139D+03    |proj g|=  2.92324D+02

At iterate    1    f=  4.63657D+03    |proj g|=  5.75748D+01

At iterate    2    f=  4.59542D+03    |proj g|=  4.52903D+01

At iterate    3    f=  4.56131D+03    |proj g|=  5.49286D+01

At iterate    4    f=  4.55509D+03    |proj g|=  5.54188D+01

At iterate    5    f=  4.54422D+03    |proj g|=  4.03441D+01

At iterate    6    f=  4.54098D+03    |proj g|=  1.16618D+01

At iterate    7    f=  4.53896D+03    |proj g|=  2.11840D+01

At iterate    8    f=  4.53740D+03    |proj g|=  1.46292D+01

At iterate    9    f=  4.53404D+03    |proj g|=  1.44382D+01

At iterate   10    f=  4.53178D+03    |proj g|=  1.59795D+01

At iterate   11    f=  4.52995D+03    |proj g|=  1.31166D+01

At iterate   12    f=  4.52837D+03    |proj g|=  1.32089D+01

At iterate   13    f=  4.5

  t_values = loadings / se_loadings


In [31]:
data = {
    'Bundle': [],
    'Item description': [],
    'Loading': [],
    'SE': [],
    't-value': []
}

# Loop through each factor and then each item within that factor
for factor, items in model_dict.items():
    for item in items:
        item_idx = df_numeric_filtered.columns.get_loc(item)
        factor_idx = list(model_dict.keys()).index(factor)

        loading_value = loadings[item_idx][factor_idx]
        se_value = se_loadings[item_idx][factor_idx]
        t_value = t_values[item_idx][factor_idx]

        data['Bundle'].append(factor)
        data['Item description'].append(item)  # Using column names as descriptions for now
        data['Loading'].append(loading_value)
        data['SE'].append(se_value)
        data['t-value'].append(t_value)

# Convert the data to a DataFrame
results_df = pd.DataFrame(data)
print(results_df)


                     Bundle  \
0   Environmental Practices   
1   Environmental Practices   
2   Environmental Practices   
3   Environmental Practices   
4          Equipment Layout   
5          Equipment Layout   
6          Equipment Layout   
7              JIT & Kanban   
8              JIT & Kanban   
9              JIT & Kanban   
10             JIT & Kanban   
11             JIT & Kanban   
12             JIT & Kanban   

                                     Item description   Loading        SE  \
0                Environmentally Preferable Packaging  0.447683  0.059621   
1   Third Party Monitoring of Supplier Working Con...  0.540919  0.082463   
2                                    Water Efficiency  0.542843  0.065713   
3   Substituting Environmentally Preferable Direct...  0.646634  0.074210   
4                  Equipment Layout - Low Inventories  0.736921  0.052824   
5               Equipment Layout - Minimized Handling  0.757169  0.056553   
6                   Equipm

In [32]:
# To save the results to a CSV file
results_df.to_csv(OUTPUT_PATH + '/cfa_results.csv', index=False)

In [36]:
# Compute p-values from t-values
p_values = [2 * (1 - norm.cdf(abs(t))) for t in data['t-value']]  # Two-tailed test

# Add p-values to the results dataframe
results_df['p-value'] = p_values

results_df

Unnamed: 0,Bundle,Item description,Loading,SE,t-value,p-value
0,Environmental Practices,Environmentally Preferable Packaging,0.447683,0.059621,7.508862,5.973e-14
1,Environmental Practices,Third Party Monitoring of Supplier Working Con...,0.540919,0.082463,6.559526,5.397904e-11
2,Environmental Practices,Water Efficiency,0.542843,0.065713,8.260844,2.220446e-16
3,Environmental Practices,Substituting Environmentally Preferable Direct...,0.646634,0.07421,8.713526,0.0
4,Equipment Layout,Equipment Layout - Low Inventories,0.736921,0.052824,13.950627,0.0
5,Equipment Layout,Equipment Layout - Minimized Handling,0.757169,0.056553,13.3886,0.0
6,Equipment Layout,Equipment Layout - JIT Production,0.61503,0.060083,10.236411,0.0
7,JIT & Kanban,JIT Delivery by Suppliers - Timely Delivery,0.515452,0.070973,7.262661,3.794742e-13
8,JIT & Kanban,JIT Delivery by Suppliers - Daily Shipments,0.294416,0.07977,3.69083,0.0002235239
9,JIT & Kanban,JIT Delivery by Suppliers - Pull System,0.460002,0.078842,5.834441,5.397123e-09


In [38]:
significant_results = results_df[results_df['p-value'] < 0.01]
significant_results

                     Bundle  \
0   Environmental Practices   
1   Environmental Practices   
2   Environmental Practices   
3   Environmental Practices   
4          Equipment Layout   
5          Equipment Layout   
6          Equipment Layout   
7              JIT & Kanban   
8              JIT & Kanban   
9              JIT & Kanban   
10             JIT & Kanban   
11             JIT & Kanban   
12             JIT & Kanban   

                                     Item description   Loading        SE  \
0                Environmentally Preferable Packaging  0.447683  0.059621   
1   Third Party Monitoring of Supplier Working Con...  0.540919  0.082463   
2                                    Water Efficiency  0.542843  0.065713   
3   Substituting Environmentally Preferable Direct...  0.646634  0.074210   
4                  Equipment Layout - Low Inventories  0.736921  0.052824   
5               Equipment Layout - Minimized Handling  0.757169  0.056553   
6                   Equipm