In [None]:
import os
import pandas as pd
import numpy as np
from uuid import uuid4

from src import dgp, dim_red, dea, eval

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [None]:
# Unique identifier for this notebook run (a random UUID4 rendered as text).
run_serial = f'{uuid4()}'

In [None]:
# Base directory is the kernel's current working directory, so the location
# of results_dir depends on where the notebook is launched from.
parent_dir = os.getcwd()
# Path of the 'results' folder under parent_dir (only the path string is built
# here; the directory itself is not created).
results_dir = os.path.join(parent_dir, 'results')

## Set up initial variables

In [None]:
# --- Problem size ---
N = 100  # number of inputs
M = 1    # number of outputs
n = 500  # number of DMUs

# --- Parameters forwarded to the data-generating process ---
alpha_1 = 0.25
gamma = 1
sigma_u = 0.5

# --- Settings forwarded to the DEA step ---
rts = 'crs'            # returns-to-scale setting
orientation = 'input'  # model orientation

# --- Experiment settings ---
nr_simulations = 50
seed = 42  # used for np.random.seed and for the embedding step
pca = False

In [None]:
# Seed NumPy's legacy global random number generator so repeated runs start
# from the same random state.
# NOTE(review): presumably dgp.generate_data_dict draws from np.random's
# global state — confirm, otherwise this seed does not affect the data.
np.random.seed(seed)

In [None]:
# Echo the run configuration so it is recorded in the notebook output.
print('INITIAL SETUP \n')
print('\n'.join(
    f'{label}: {value}'
    for label, value in (
        ('Number of inputs', N),
        ('Number of outputs', M),
        ('Number of DMUs', n),
        ('Parameter alpha_1', alpha_1),
        ('Parameter gamma', gamma),
        ('Parameter sigma_u', sigma_u),
        ('Return to scale', rts),
        ('Orientation', orientation),
        ('Number of simulations', nr_simulations),
        ('Seed', seed),
        ('PCA', pca),
    )
))

In [None]:
# Snapshot of the run configuration, keyed by the variable names used above.
params_dict = dict(
    N=N,
    M=M,
    n=n,
    alpha_1=alpha_1,
    gamma=gamma,
    sigma_u=sigma_u,
    rts=rts,
    orientation=orientation,
    nr_simulations=nr_simulations,
    seed=seed,
    pca=pca,
)

## Data Generating Process

In [None]:
# Generate one synthetic data set with the project's data-generating process,
# passing the sample size, dimensions and parameters configured earlier.
data_dict = dgp.generate_data_dict(
    n=n, N=N, M=M,
    alpha_1=alpha_1, gamma=gamma, sigma_u=sigma_u,
    verbose=False,
)
x, y, y_tilde = data_dict["x"], data_dict["y"], data_dict["y_tilde"]

# Element-wise ratio y / y_tilde with singleton dimensions squeezed out; it is
# used later as the "theoretical" efficiency in the sanity checks.
# NOTE(review): presumably y_tilde is the efficient (frontier) output — confirm in src/dgp.
efficiency_score_by_design = (y / y_tilde).squeeze()

## Dimensionality reduction

In [None]:
# Build the reduced-dimension representations of x; the same seed as the rest
# of the run is forwarded to the embedding step.
embeddings = dim_red.create_embeddings(
    x=x,
    seed=seed,
)
# Unpack the two entries consumed by the DEA and evaluation steps.
dims_for_embedding_dict = embeddings['dims_for_embedding_dict']
embeddings_df_dict = embeddings['embeddings_df_dict']

## Calculate DEA

In [None]:
# Run DEA on every embedding against the output y, using the configured
# returns-to-scale and orientation settings.
efficiency_scores_dict = dea.calculate_dea_for_embeddings(
    embeddings_df_dict=embeddings_df_dict, y=y,
    rts=rts, orientation=orientation,
)

## Evaluate the results

In [None]:
# Compare the DEA scores of each embedding with the by-design efficiency.
# Note: `eval` here is the project module imported from `src`; it shadows
# Python's built-in eval() for the rest of the notebook.
evaluation_df = eval.create_evaluation_df(
    efficiency_scores_dict=efficiency_scores_dict,
    efficiency_score_by_design=efficiency_score_by_design,
    dims_for_embedding_dict=dims_for_embedding_dict)

In [None]:
# Display the evaluation table returned by eval.create_evaluation_df.
evaluation_df

## Sanity checks

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pull the "alpha" entry out of the generated data for inspection.
# NOTE(review): presumably the coefficients drawn by the DGP — confirm in src/dgp.
alpha = data_dict["alpha"]

In [None]:
# Number of rows in x, i.e. how many per-row norms the checks below produce.
# Counting the rows directly avoids computing a norm for every row just to
# take the length of the resulting list.
len(x)

In [None]:
# Box plot of the alpha values taken from the generated data.
plt.boxplot(alpha)
plt.title('Distribution of alpha')
plt.ylabel('alpha')
plt.show()  # render the figure without echoing the artist dictionary

In [None]:
# Norm of a single row of x (index 400) as a spot check; only that row is
# evaluated instead of computing the norm of every row first.
np.linalg.norm(x[400])

In [None]:
# Box plot of the norm of each row of x (np.linalg.norm with its default norm).
plt.boxplot([np.linalg.norm(x_row) for x_row in x])
plt.title('Norm of each row of x')
plt.ylabel('norm')
plt.show()  # render the figure without echoing the artist dictionary

In [None]:
# Histogram (50 bins) of the norm of each row of x.
plt.hist(
    [np.linalg.norm(x_row) for x_row in x],
    bins=50
)
plt.title('Norm of each row of x')
plt.xlabel('norm')
plt.ylabel('count')
plt.show()  # render the figure without echoing counts, bin edges and patches

In [None]:
# Box plot of the generated y values.
plt.boxplot(y)
plt.title('Distribution of y')
plt.ylabel('y')
plt.show()  # render the figure without echoing the artist dictionary

In [None]:
# Histogram (50 bins) of the generated y values.
plt.hist(
    y,
    bins=50
)
plt.title('Distribution of y')
plt.xlabel('y')
plt.ylabel('count')
plt.show()  # render the figure without echoing counts, bin edges and patches

In [None]:
# Box plot of the generated y_tilde values.
plt.boxplot(y_tilde)
plt.title('Distribution of y_tilde')
plt.ylabel('y_tilde')
plt.show()  # render the figure without echoing the artist dictionary

In [None]:
# Histogram (50 bins) of the generated y_tilde values.
plt.hist(
    y_tilde,
    bins=50
)
plt.title('Distribution of y_tilde')
plt.xlabel('y_tilde')
plt.ylabel('count')
plt.show()  # render the figure without echoing counts, bin edges and patches

In [None]:
# One column of DEA scores per key of efficiency_scores_dict.
efficiencies_df = pd.DataFrame.from_dict(
    data=efficiency_scores_dict,
    orient='columns',
)

In [None]:
# Add the by-design efficiency as a reference column next to the DEA scores.
# This mutates efficiencies_df in place.
efficiencies_df['theoretical_eff'] = efficiency_score_by_design

In [None]:
# Histogram (50 bins) of the by-design efficiency scores.
plt.hist(efficiencies_df['theoretical_eff'], bins=50)
plt.title('By-design efficiency scores')
plt.xlabel('theoretical_eff')
plt.ylabel('count')
plt.show()  # render the figure without echoing counts, bin edges and patches

In [None]:
# Rows with at least one missing value, ordered by by-design efficiency
# (highest first).
(
    efficiencies_df[efficiencies_df.isna().any(axis=1)]
    .sort_values(by='theoretical_eff', ascending=False)
)

In [None]:
# Show the evaluation table again, alongside the sanity checks.
evaluation_df

In [None]:
def nan_pearsonr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, ignoring missing pairs.

    The two inputs are placed side by side in a two-column DataFrame, every
    row in which either value is missing is dropped, and the correlation of
    the remaining pairs is read from the off-diagonal of the correlation
    matrix.

    Parameters
    ----------
    x, y : one-dimensional array-likes of equal length (or pandas Series,
        which the DataFrame constructor aligns on their index).

    Returns
    -------
    The correlation coefficient as computed by ``DataFrame.corr`` with its
    default (Pearson) method; NaN when it cannot be computed from the
    remaining pairs.
    """
    paired = pd.DataFrame({'x': x, 'y': y})
    complete_pairs = paired.dropna()
    corr_matrix = complete_pairs.corr()
    # Row 0 is 'x', column 1 is 'y': the x–y correlation.
    return corr_matrix.iloc[0, 1]

In [None]:
# Correlation between the 'original' entry of efficiency_scores_dict and the
# by-design efficiency scores, ignoring pairs with missing values.
# NOTE(review): 'original' presumably holds DEA scores on the unreduced inputs — confirm in src/dea.
nan_pearsonr(efficiency_scores_dict['original'], efficiency_score_by_design)