In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import f_oneway

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
PROCESSED_DATA_PATH = os.path.join(project_root, 'data', 'processed')
RAW_DATA_PATH = os.path.join(project_root, 'data', 'raw')
OUTPUT_PATH = os.path.join(project_root, 'output')

In [2]:
df = pd.read_csv(PROCESSED_DATA_PATH + "/dummy.csv")

In [3]:
df

Unnamed: 0,COUNTRY,COMPANY CODE,INDUSTRY,ENVRTX21,ENVRTX37,ENVRTX02,ENVRTX22,ENVRTX39,ENVRTX23,ENVRTX18,...,OUTCMX11,OUTCMX12,OUTCMX13,OUTCMX14,OUTCMX15,OUTCMX16,OUTCMX17,JIT,Environmental,Category
0,BRA,1704,3,2.000000,4.0,2.000000,2.000000,3.0,4.000000,4.000000,...,3.0,2.000000,2.000000,3.000000,3.000000,3.000000,3.000000,Low,Low,Low JIT & Environmental
1,BRA,1713,1,3.000000,2.0,3.000000,1.000000,2.0,4.000000,2.000000,...,3.0,4.000000,1.000000,5.000000,4.000000,4.000000,4.000000,High,Low,Mainly JIT
2,BRA,1717,2,3.000000,4.0,5.000000,2.000000,4.0,5.000000,4.000000,...,3.0,5.000000,5.000000,5.000000,5.000000,5.000000,5.000000,Low,High,Mainly Environmental
3,BRA,1719,3,4.000000,2.0,4.000000,3.000000,3.0,4.000000,3.000000,...,3.0,4.000000,5.000000,5.000000,5.000000,4.000000,4.000000,Low,Low,Low JIT & Environmental
4,GER,401,2,4.666667,2.0,4.333333,4.333333,3.0,4.666667,3.333333,...,3.0,3.666667,3.666667,3.666667,3.666667,4.333333,4.333333,Low,Low,Low JIT & Environmental
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,USA,109,2,3.500000,2.5,3.500000,2.500000,3.5,4.000000,4.000000,...,3.0,4.000000,3.500000,4.500000,5.000000,4.000000,4.500000,Low,High,Mainly Environmental
171,USA,110,2,2.000000,1.5,3.000000,2.500000,2.5,3.500000,3.500000,...,2.5,4.000000,4.500000,4.000000,4.000000,4.500000,4.000000,High,Low,Mainly JIT
172,USA,111,1,2.000000,1.0,3.000000,2.000000,1.0,4.000000,2.000000,...,3.0,3.000000,3.000000,3.000000,4.000000,3.000000,4.000000,High,Low,Mainly JIT
173,SWI,1809,3,3.000000,2.0,3.000000,4.000000,3.0,5.000000,3.000000,...,3.0,4.000000,4.000000,4.000000,5.000000,5.000000,5.000000,High,Low,Mainly JIT


In [4]:
# Calculate the mean of performance (based on columns starting with "EPERFX")
eperfx_columns = [col for col in df.columns if col.startswith("EPERFX")]
df['Performance_Mean'] = df[eperfx_columns].mean(axis=1)

# Calculate the frequency of each category
category_counts = df['Category'].value_counts().reset_index()
category_counts.columns = ['Category', 'Frequency']

# Calculate the percentage of each category
total_rows = len(df)
category_counts['Percentage'] = (category_counts['Frequency'] / total_rows) * 100

# Aggregate data for each category
result_df = pd.merge(category_counts, df.groupby('Category')['Performance_Mean'].mean().reset_index(), on='Category')

# Rename columns for clarity
result_df.columns = ['Category', 'Frequency', 'Percentage', 'Mean of Performance']

# Sort by mean performance in descending order
result_df = result_df.sort_values(by='Mean of Performance', ascending=False)

# Reset the index
result_df = result_df.reset_index(drop=True)

# Print or work with 'result_df' as needed
print(result_df)


                   Category  Frequency  Percentage  Mean of Performance
0  High JIT & Environmental         57   32.571429             3.913255
1      Mainly Environmental         38   21.714286             3.779240
2                Mainly JIT         40   22.857143             3.579167
3   Low JIT & Environmental         40   22.857143             3.282407


In [5]:
# Step 1: Perform ANOVA to check for overall differences
anova_result = f_oneway(df[df['Category'] == 'High JIT & Environmental']['Performance_Mean'],
                        df[df['Category'] == 'Mainly Environmental']['Performance_Mean'],
                        df[df['Category'] == 'Mainly JIT']['Performance_Mean'],
                        df[df['Category'] == 'Low JIT & Environmental']['Performance_Mean'])

# Step 2: Check if ANOVA indicates significant differences
if anova_result.pvalue < 0.05:
    # Step 3: Perform Tukey's HSD test for pairwise comparisons
    tukey_results = pairwise_tukeyhsd(df['Performance_Mean'], df['Category'], alpha=0.05)

    # Display Tukey results
    print(tukey_results.summary())
else:
    print("No significant differences among categories based on ANOVA.")


                  Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
         group1                   group2         meandiff p-adj   lower   upper  reject
---------------------------------------------------------------------------------------
High JIT & Environmental Low JIT & Environmental  -0.6308    0.0 -0.9028 -0.3589   True
High JIT & Environmental    Mainly Environmental   -0.134   0.59 -0.4101  0.1421  False
High JIT & Environmental              Mainly JIT  -0.3341 0.0092  -0.606 -0.0622   True
 Low JIT & Environmental    Mainly Environmental   0.4968 0.0002  0.1982  0.7955   True
 Low JIT & Environmental              Mainly JIT   0.2968 0.0478   0.002  0.5915   True
    Mainly Environmental              Mainly JIT  -0.2001 0.3072 -0.4987  0.0986  False
---------------------------------------------------------------------------------------


In [6]:
import statsmodels.api as sm

# Fit a linear regression model with "ACCTGX51" as a predictor
X = df[['ACCTGX51']]
X = sm.add_constant(X)  # Add a constant term (intercept)
y = df['Performance_Mean']

model = sm.OLS(y, X).fit()

# Print regression summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:       Performance_Mean   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     7.444
Date:                Mon, 13 Nov 2023   Prob (F-statistic):            0.00702
Time:                        18:41:40   Log-Likelihood:                -142.24
No. Observations:                 175   AIC:                             288.5
Df Residuals:                     173   BIC:                             294.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.6036      0.047     76.769      0.0

In [7]:
import statsmodels.api as sm

# Fit an ANCOVA model
model = sm.OLS.from_formula('Performance_Mean ~ C(Category) + ACCTGX51', data=df).fit()

# Perform ANCOVA
anova_results = sm.stats.anova_lm(model, typ=2)

# Check for significance
if anova_results['PR(>F)']['C(Category)'] < 0.05:
    # If the categorical variable is significant, you can proceed with post hoc tests.
    # Otherwise, you may conclude there are no significant differences.
    tukey_results = pairwise_tukeyhsd(df['Performance_Mean'], df['Category'], alpha=0.05)
    print(tukey_results.summary())
else:
    print("No significant differences among categories based on ANCOVA.")


                  Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
         group1                   group2         meandiff p-adj   lower   upper  reject
---------------------------------------------------------------------------------------
High JIT & Environmental Low JIT & Environmental  -0.6308    0.0 -0.9028 -0.3589   True
High JIT & Environmental    Mainly Environmental   -0.134   0.59 -0.4101  0.1421  False
High JIT & Environmental              Mainly JIT  -0.3341 0.0092  -0.606 -0.0622   True
 Low JIT & Environmental    Mainly Environmental   0.4968 0.0002  0.1982  0.7955   True
 Low JIT & Environmental              Mainly JIT   0.2968 0.0478   0.002  0.5915   True
    Mainly Environmental              Mainly JIT  -0.2001 0.3072 -0.4987  0.0986  False
---------------------------------------------------------------------------------------


In [8]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit ANCOVA model with ACCTGX51 as a covariate
formula = 'Performance_Mean ~ Category + ACCTGX51'
model = ols(formula, data=df).fit()

# Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Perform pairwise comparisons with Bonferroni correction
pairwise_comparisons = model.t_test_pairwise('Category').result_frame
pairwise_comparisons

             sum_sq     df          F    PR(>F)
Category   8.490455    3.0  11.041432  0.000001
ACCTGX51   0.572382    1.0   2.233066  0.136939
Residual  43.574584  170.0        NaN       NaN


Unnamed: 0,coef,std err,t,P>|t|,Conf. Int. Low,Conf. Int. Upp.,pvalue-hs,reject-hs
Low JIT & Environmental-High JIT & Environmental,-0.590669,0.107832,-5.477661,1.5291e-07,-0.803532,-0.377806,9.174597e-07,True
Mainly Environmental-High JIT & Environmental,-0.106161,0.107655,-0.986124,0.3254736,-0.318674,0.106352,0.3254736,False
Mainly JIT-High JIT & Environmental,-0.299626,0.106943,-2.801742,0.005672615,-0.510733,-0.088519,0.02249812,True
Mainly Environmental-Low JIT & Environmental,0.484508,0.114984,4.213695,4.06943e-05,0.257527,0.711488,0.0002034549,True
Mainly JIT-Low JIT & Environmental,0.291043,0.113273,2.569403,0.01104596,0.067441,0.514645,0.03277318,True
Mainly JIT-Mainly Environmental,-0.193465,0.114773,-1.685627,0.09370218,-0.420029,0.033099,0.1786243,False
