# Importing Dependencies

In [1]:
import time

import eat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm

from models.modelsDEA import DEA
from models.modelsFDH import FDH
from utils.datainput import xlsx2matrix
from utils.is_efficient import is_efficient
from utils.li_test import li_test

# Defining functions

In [2]:
def stats(df):
    """Summarise the efficiency results stored in ``df``.

    Args:
        df: frame with an ``'efficiency'`` column (numeric scores) and an
            ``'is_efficient'`` column whose sum gives the efficient count.

    Returns:
        dict with the keys ``'avg'``, ``'min'``, ``'max'`` and ``'std_dev'``
        (mean, minimum, maximum and ``Series.std()`` of the efficiency
        column), ``'num_eff'`` (sum of ``'is_efficient'``) and
        ``'num_considered'`` (number of rows in ``df``).
    """
    scores = df['efficiency']
    return {
        'avg': scores.mean(),
        'min': scores.min(),
        'max': scores.max(),
        'std_dev': scores.std(),
        'num_eff': df['is_efficient'].sum(),
        'num_considered': len(df),
    }

def plot_comparison(model1, model2):
    """Print efficiency ratios and plot both efficiency distributions together.

    Args:
        model1: ``(frame, name)`` pair; ``frame`` must be accepted by
            ``stats`` and expose an ``'efficiency'`` column.
        model2: second ``(frame, name)`` pair, same layout as ``model1``.

    For each model the ratio ``num_eff / num_considered`` is printed. Then a
    single 12x6 figure is drawn containing, per model, a density histogram
    (15 bins) of the efficiency scores and a dashed normal pdf parameterised
    by the scores' mean and standard deviation, evaluated on 100 points
    spanning three standard deviations either side of the mean.
    """
    frame_a, label_a = model1
    frame_b, label_b = model2

    summary_a = stats(frame_a)
    summary_b = stats(frame_b)

    # First model is drawn in blue, second in green.
    overlays = (
        (frame_a['efficiency'], label_a, 'blue'),
        (frame_b['efficiency'], label_b, 'green'),
    )

    # Both ratios are computed before anything is printed.
    shares = [
        summary['num_eff'] / summary['num_considered']
        for summary in (summary_a, summary_b)
    ]
    for label, share in zip((label_a, label_b), shares):
        print(f"{label} - Efficiency ratio (num_eff / num_considered): {share:.2f}")

    plt.figure(figsize=(12, 6))

    for scores, label, colour in overlays:
        sns.histplot(scores, kde=False, color=colour, bins=15, stat='density',
                     label=f'{label} Efficiency', alpha=0.6)
        centre, spread = scores.mean(), scores.std()
        grid = np.linspace(centre - 3*spread, centre + 3*spread, 100)
        plt.plot(grid, norm.pdf(grid, centre, spread), color=colour,
                 linestyle='--', label=f'{label} Gaussian')

    plt.title(f"Efficiency Distribution Comparison Between {label_a} and {label_b}")
    plt.xlabel("Efficiency")
    plt.ylabel("Density")
    plt.legend()

    plt.show()



# Initialization

In [3]:
# Read the workbook into two objects via the project helper xlsx2matrix.
# The first column list matches x_cols and the second matches y_cols defined
# further down; presumably l holds the inputs and m the outputs — TODO confirm
# against utils.datainput.xlsx2matrix.
l, m = xlsx2matrix("./cleaned.xlsx", 
                   ['K84', 'L84', 'M84', 'PK84', 'PL84', 'PM84'], 
                   ["Y1Z84", "Y2Z84"])

In [4]:
# Load the same workbook as a DataFrame; it is used below for inspection and
# as the source of the frame handed to the EAT model.
df = pd.read_excel('./cleaned.xlsx')

In [5]:
# Display the frame exactly as read from ./cleaned.xlsx.
df

Unnamed: 0,K84,L84,M84,Y1Z84,Y2Z84,PK84,PL84,PM84
0,187731,384213,229662,1.938104,1.847826,2.345091,1.103349,7.659985
1,31594,90372,79242,0.510952,0.721553,1.809023,1.047630,3.940273
2,34224,81466,64813,2.124401,0.101785,0.740632,0.882267,5.859592
3,91947,504489,384891,11.554443,2.089281,0.907676,1.049255,8.869514
4,89611,219512,94920,3.757861,1.867387,1.371028,0.880047,3.569552
...,...,...,...,...,...,...,...,...
271,293274,698029,507115,17.445820,0.374599,2.109772,0.997413,8.540845
272,44720,149730,43686,0.403330,2.128676,0.613785,1.177201,6.478053
273,113231,233265,109133,0.480033,1.559554,2.022997,1.273960,5.300282
274,199501,581074,515592,19.887671,0.549471,0.512113,1.239626,8.127554


In [6]:
# List the column labels; these are the names referenced by the column lists below.
df.columns

Index(['K84', 'L84', 'M84', 'Y1Z84', 'Y2Z84', 'PK84', 'PL84', 'PM84'], dtype='object')

In [7]:
# Instantiate both project models on the two objects returned by xlsx2matrix.
# NOTE(review): whether construction already solves the model or only stores
# the data is not visible here — check models.modelsDEA / models.modelsFDH.
dea = DEA(l, m)
fdh = FDH(l, m)

In [8]:
# Show the six columns that are later collected in x_cols.
df[['K84', 'L84', 'M84', 'PK84', 'PL84', 'PM84']]

Unnamed: 0,K84,L84,M84,PK84,PL84,PM84
0,187731,384213,229662,2.345091,1.103349,7.659985
1,31594,90372,79242,1.809023,1.047630,3.940273
2,34224,81466,64813,0.740632,0.882267,5.859592
3,91947,504489,384891,0.907676,1.049255,8.869514
4,89611,219512,94920,1.371028,0.880047,3.569552
...,...,...,...,...,...,...
271,293274,698029,507115,2.109772,0.997413,8.540845
272,44720,149730,43686,0.613785,1.177201,6.478053
273,113231,233265,109133,2.022997,1.273960,5.300282
274,199501,581074,515592,0.512113,1.239626,8.127554


In [9]:
# Show the two columns that are later collected in y_cols.
df[['Y1Z84', 'Y2Z84']]

Unnamed: 0,Y1Z84,Y2Z84
0,1.938104,1.847826
1,0.510952,0.721553
2,2.124401,0.101785
3,11.554443,2.089281
4,3.757861,1.867387
...,...,...
271,17.445820,0.374599
272,0.403330,2.128676
273,0.480033,1.559554
274,19.887671,0.549471


In [10]:
# Column names passed to eat.EAT / eat.Scores as x= (same six names given
# first to xlsx2matrix above).
x_cols = ['K84', 'L84', 'M84', 'PK84', 'PL84', 'PM84']
# Column names passed to eat.EAT / eat.Scores as y=.
y_cols = ['Y1Z84', 'Y2Z84']
# Forwarded as eat.EAT(numStop=...); presumably the node-size stopping
# threshold for tree growth — TODO confirm against the eat documentation.
numStop = 5
# Forwarded as eat.EAT(fold=...); presumably the number of cross-validation
# folds — TODO confirm against the eat documentation.
fold = 5

In [11]:
# Put the x columns first and the y columns last, then take an independent
# copy so later work on df_ordered cannot touch df.
ordered_cols = [*x_cols, *y_cols]
df_ordered = df.loc[:, ordered_cols].copy()

In [12]:
# Display the reordered copy (x columns followed by y columns).
df_ordered

Unnamed: 0,K84,L84,M84,PK84,PL84,PM84,Y1Z84,Y2Z84
0,187731,384213,229662,2.345091,1.103349,7.659985,1.938104,1.847826
1,31594,90372,79242,1.809023,1.047630,3.940273,0.510952,0.721553
2,34224,81466,64813,0.740632,0.882267,5.859592,2.124401,0.101785
3,91947,504489,384891,0.907676,1.049255,8.869514,11.554443,2.089281
4,89611,219512,94920,1.371028,0.880047,3.569552,3.757861,1.867387
...,...,...,...,...,...,...,...,...
271,293274,698029,507115,2.109772,0.997413,8.540845,17.445820,0.374599
272,44720,149730,43686,0.613785,1.177201,6.478053,0.403330,2.128676
273,113231,233265,109133,2.022997,1.273960,5.300282,0.480033,1.559554
274,199501,581074,515592,0.512113,1.239626,8.127554,19.887671,0.549471


In [13]:
# Build the eat.EAT model on the reordered frame, naming the x and y columns
# explicitly and forwarding the numStop / fold settings defined above.
model = eat.EAT(matrix=df_ordered, x=x_cols, y=y_cols, numStop=numStop, fold=fold)

In [14]:
# Fit the model; model.tree and model.predict are used by the cells that follow.
model.fit()

In [15]:
# Build the eat.Scores helper from the same frame and column lists, handing it
# the tree held by the model after model.fit() was called.
mdl_scores = eat.Scores(matrix=df_ordered, x=x_cols, y=y_cols, tree=model.tree)

In [23]:
# Predict from the x columns of df; the displayed result carries the inputs
# plus one p_<name> column per y column.
dfP = model.predict(df[x_cols], x_cols)
dfP

Unnamed: 0,K84,L84,M84,PK84,PL84,PM84,p_Y1Z84,p_Y2Z84
0,187731,384213,229662,2.345091,1.103349,7.659985,37.981529,25.895002
1,31594,90372,79242,1.809023,1.047630,3.940273,3.458725,5.493140
2,34224,81466,64813,0.740632,0.882267,5.859592,3.458725,5.493140
3,91947,504489,384891,0.907676,1.049255,8.869514,37.981529,25.895002
4,89611,219512,94920,1.371028,0.880047,3.569552,10.535495,5.962560
...,...,...,...,...,...,...,...,...
271,293274,698029,507115,2.109772,0.997413,8.540845,37.981529,25.895002
272,44720,149730,43686,0.613785,1.177201,6.478053,3.458725,5.493140
273,113231,233265,109133,2.022997,1.273960,5.300282,10.535495,5.962560
274,199501,581074,515592,0.512113,1.239626,8.127554,37.981529,25.895002


In [25]:
# Row-wise ratio of the predicted Y1Z84 to the observed Y1Z84 (aligned on index).
dfP['p_Y1Z84']/df['Y1Z84']

0      19.597260
1       6.769178
2       1.628094
3       3.287179
4       2.803588
         ...    
271     2.177113
272     8.575422
273    21.947438
274     1.909803
275     1.472041
Length: 276, dtype: float64

In [26]:
# Row-wise ratio of the predicted Y2Z84 to the observed Y2Z84 (aligned on index).
dfP['p_Y2Z84']/df['Y2Z84']

0      14.013767
1       7.612940
2      53.968072
3      12.394217
4       3.192996
         ...    
271    69.127258
272     2.580543
273     3.823247
274    47.127149
275    16.333186
Length: 276, dtype: float64