In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import f_oneway

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, 'data', 'processed')
RAW_DATA_PATH = os.path.join(project_root, 'data', 'raw')
OUTPUT_PATH = os.path.join(project_root, 'output')

In [2]:
df = pd.read_csv(PROCESSED_DATA_PATH + "/dummy.csv")

In [3]:
df

Unnamed: 0,COUNTRY,COMPANY CODE,INDUSTRY,ENVRTX21,ENVRTX37,ENVRTX02,ENVRTX22,ENVRTX39,ENVRTX23,ENVRTX18,...,JITDELN01,JITDELN02,JITDELN03,KANBANN01,KANBANN02,KANBANN03,ACCTGX51,JIT,Environmental,Category
0,BRA,1704,3,2.0,4.0,2.0,2.0,3.0,4.0,4.0,...,1.0,1.0,2.0,1.0,3.0,3.0,247.0,Low,Low,Low JIT & Environmental
1,BRA,1709,3,4.0,2.0,3.0,1.0,3.0,3.0,2.0,...,3.0,4.0,2.0,1.0,2.0,2.0,350.0,High,Low,Mainly JIT
2,BRA,1713,1,3.0,2.0,3.0,1.0,2.0,4.0,2.0,...,1.0,1.0,3.0,1.0,5.0,4.0,1635.0,High,Low,Mainly JIT
3,BRA,1717,2,3.0,4.0,5.0,2.0,4.0,5.0,4.0,...,3.0,1.0,2.0,3.0,3.0,3.0,1934.0,Low,High,Mainly Environmental
4,BRA,1719,3,4.0,2.0,4.0,3.0,3.0,4.0,3.0,...,2.0,4.0,4.0,4.0,1.0,1.0,865.0,Low,Low,Low JIT & Environmental
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,USA,110,2,2.0,1.5,3.0,2.5,2.5,3.5,3.5,...,4.5,4.5,4.0,4.0,3.0,3.0,849.0,High,Low,Mainly JIT
178,USA,111,1,2.0,1.0,3.0,2.0,1.0,4.0,2.0,...,4.0,4.0,4.0,2.5,2.5,2.5,738.0,High,Low,Mainly JIT
179,SWI,1805,2,4.0,2.0,3.0,4.0,5.0,4.0,3.0,...,2.0,1.0,1.0,1.0,1.0,1.0,56.0,Low,Low,Low JIT & Environmental
180,SWI,1809,3,3.0,2.0,3.0,4.0,3.0,5.0,3.0,...,4.0,2.0,5.0,3.0,3.0,2.0,286.0,High,Low,Mainly JIT


In [4]:
# Calculate the mean of performance (based on columns starting with "EPERFX")
eperfx_columns = [col for col in df.columns if col.startswith("EPERFX")]
df['Performance_Mean'] = df[eperfx_columns].mean(axis=1)

# Calculate the frequency of each category
category_counts = df['Category'].value_counts().reset_index()
category_counts.columns = ['Category', 'Frequency']

# Calculate the percentage of each category
total_rows = len(df)
category_counts['Percentage'] = (category_counts['Frequency'] / total_rows) * 100

# Aggregate data for each category
result_df = pd.merge(category_counts, df.groupby('Category')['Performance_Mean'].mean().reset_index(), on='Category')

# Rename columns for clarity
result_df.columns = ['Category', 'Frequency', 'Percentage', 'Mean of Performance']

# Sort by mean performance in descending order
result_df = result_df.sort_values(by='Mean of Performance', ascending=False)

# Reset the index
result_df = result_df.reset_index(drop=True)

# Print or work with 'result_df' as needed
print(result_df)


                   Category  Frequency  Percentage  Mean of Performance
0  High JIT & Environmental         59   32.417582             3.924670
1      Mainly Environmental         39   21.428571             3.764957
2                Mainly JIT         43   23.626374             3.585271
3   Low JIT & Environmental         41   22.527473             3.280939


In [5]:
# Step 1: Perform ANOVA to check for overall differences
anova_result = f_oneway(df[df['Category'] == 'High JIT & Environmental']['Performance_Mean'],
                        df[df['Category'] == 'Mainly Environmental']['Performance_Mean'],
                        df[df['Category'] == 'Mainly JIT']['Performance_Mean'],
                        df[df['Category'] == 'Low JIT & Environmental']['Performance_Mean'])

# Step 2: Check if ANOVA indicates significant differences
if anova_result.pvalue < 0.05:
    # Step 3: Perform Tukey's HSD test for pairwise comparisons
    tukey_results = pairwise_tukeyhsd(df['Performance_Mean'], df['Category'], alpha=0.05)

    # Display Tukey results
    print(tukey_results.summary())
else:
    print("No significant differences among categories based on ANOVA.")


                  Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
         group1                   group2         meandiff p-adj   lower   upper  reject
---------------------------------------------------------------------------------------
High JIT & Environmental Low JIT & Environmental  -0.6437    0.0 -0.9115 -0.3759   True
High JIT & Environmental    Mainly Environmental  -0.1597 0.4255 -0.4316  0.1121  False
High JIT & Environmental              Mainly JIT  -0.3394 0.0057 -0.6035 -0.0753   True
 Low JIT & Environmental    Mainly Environmental    0.484 0.0002  0.1894  0.7786   True
 Low JIT & Environmental              Mainly JIT   0.3043 0.0334  0.0168  0.5919   True
    Mainly Environmental              Mainly JIT  -0.1797 0.3813  -0.471  0.1116  False
---------------------------------------------------------------------------------------


In [11]:
import statsmodels.api as sm

# Fit a linear regression model with "ACCTGX51" as a predictor
X = df[['ACCTGX51']]
X = sm.add_constant(X)  # Add a constant term (intercept)
y = df['Performance_Mean']

model = sm.OLS(y, X).fit()

# Print regression summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:       Performance_Mean   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     7.821
Date:                Sun, 12 Nov 2023   Prob (F-statistic):            0.00573
Time:                        23:40:58   Log-Likelihood:                -148.10
No. Observations:                 182   AIC:                             300.2
Df Residuals:                     180   BIC:                             306.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.6048      0.046     78.225      0.0

In [12]:
import statsmodels.api as sm

# Fit an ANCOVA model
model = sm.OLS.from_formula('Performance_Mean ~ C(Category) + ACCTGX51', data=df).fit()

# Perform ANCOVA
anova_results = sm.stats.anova_lm(model, typ=2)

# Check for significance
if anova_results['PR(>F)']['C(Category)'] < 0.05:
    # If the categorical variable is significant, you can proceed with post hoc tests.
    # Otherwise, you may conclude there are no significant differences.
    tukey_results = pairwise_tukeyhsd(df['Performance_Mean'], df['Category'], alpha=0.05)
    print(tukey_results.summary())
else:
    print("No significant differences among categories based on ANCOVA.")


                  Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
         group1                   group2         meandiff p-adj   lower   upper  reject
---------------------------------------------------------------------------------------
High JIT & Environmental Low JIT & Environmental  -0.6437    0.0 -0.9115 -0.3759   True
High JIT & Environmental    Mainly Environmental  -0.1597 0.4255 -0.4316  0.1121  False
High JIT & Environmental              Mainly JIT  -0.3394 0.0057 -0.6035 -0.0753   True
 Low JIT & Environmental    Mainly Environmental    0.484 0.0002  0.1894  0.7786   True
 Low JIT & Environmental              Mainly JIT   0.3043 0.0334  0.0168  0.5919   True
    Mainly Environmental              Mainly JIT  -0.1797 0.3813  -0.471  0.1116  False
---------------------------------------------------------------------------------------


In [18]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit ANCOVA model with ACCTGX51 as a covariate
formula = 'Performance_Mean ~ Category + ACCTGX51'
model = ols(formula, data=df).fit()

# Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Perform pairwise comparisons with Bonferroni correction
pairwise_comparisons = model.t_test_pairwise('Category').result_frame
pairwise_comparisons

             sum_sq     df          F        PR(>F)
Category   8.909853    3.0  11.594458  5.643718e-07
ACCTGX51   0.578032    1.0   2.256591  1.348286e-01
Residual  45.339019  177.0        NaN           NaN


Unnamed: 0,coef,std err,t,P>|t|,Conf. Int. Low,Conf. Int. Upp.,pvalue-hs,reject-hs
Low JIT & Environmental-High JIT & Environmental,-0.603839,0.106275,-5.681841,5.387896e-08,-0.813568,-0.394109,3.232737e-07,True
Mainly Environmental-High JIT & Environmental,-0.131999,0.106066,-1.244499,0.2149612,-0.341315,0.077318,0.2314004,False
Mainly JIT-High JIT & Environmental,-0.305414,0.103973,-2.937428,0.003749956,-0.510601,-0.100227,0.01491566,True
Mainly Environmental-Low JIT & Environmental,0.47184,0.113496,4.157322,5.009402e-05,0.24786,0.69582,0.000250445,True
Mainly JIT-Low JIT & Environmental,0.298425,0.110545,2.699585,0.007615885,0.08027,0.51658,0.02267409,True
Mainly JIT-Mainly Environmental,-0.173415,0.111993,-1.548443,0.1233019,-0.394429,0.047599,0.2314004,False
