In [None]:
# %pip install -r requirements.txt

In [None]:
import os
import pandas as pd
import numpy as np

import data_generating_process as dgp
import dimensionality_reduction as dr

In [None]:
# Project root. Set the HIGH_DIM_DEA_DIR environment variable to run the
# notebook from another checkout; the original location remains the default
# so existing setups behave exactly as before.
parent_dir = os.environ.get(
    'HIGH_DIM_DEA_DIR',
    '/Users/juanmalagon/repos/high-dim-dea'
)
# Legacy sub-directories, resolved relative to the project root.
mc_simulation_dir = os.path.join(parent_dir, 'legacy/mc_simulation')
dea_results_dir = os.path.join(parent_dir, 'legacy/dea_results')

## Data Generating Process

In [None]:
# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# later passed to dr.create_embeddings and to UMAP's random_state.
seed = 42
np.random.seed(seed)

In [None]:
# Simulation parameters. The short descriptions follow the labels printed by
# the setup summary cell of this notebook.
N = 100  # number of inputs
M = 1  # number of outputs
n = 50  # number of DMUs
# alpha_1, gamma and sigma_u are forwarded to dgp.generate_data_dict; their
# statistical meaning is defined in that module, not here.
alpha_1 = 0.25
gamma = 1
sigma_u = 0.5
# NOTE(review): epsilon is only printed in this notebook; it is not passed to
# dgp.generate_data_dict — confirm whether it should be.
epsilon = 0.5

In [None]:
# Echo the simulation configuration with a single print call; sep='\n' puts
# every entry on its own line, after the blank line that follows the title.
print(
    'INITIAL SETUP \n',
    f'Number of inputs: {N}',
    f'Number of outputs: {M}',
    f'Number of DMUs: {n}',
    f'Parameter alpha_1: {alpha_1}',
    f'Parameter gamma: {gamma}',
    f'Parameter sigma_u: {sigma_u}',
    f'Parameter epsilon: {epsilon}',
    sep='\n'
)

In [None]:
# Simulate one data set with the project's data generating process.
# The returned mapping is read below under the keys "x", "y" and "y_tilde".
data_dict = dgp.generate_data_dict(
    n=n,
    N=N,
    M=M,
    alpha_1=alpha_1,
    gamma=gamma,
    sigma_u=sigma_u,
    verbose=False
)

In [None]:
# Unpack the simulated arrays used throughout the notebook.
# presumably x holds the inputs and y / y_tilde the observed vs. frontier
# outputs — TODO confirm against data_generating_process.
x = data_dict["x"]
y = data_dict["y"]
y_tilde = data_dict["y_tilde"]
# Ratio of y to y_tilde; used later as the reference (y_true) when scoring
# the DEA efficiency estimates.
eff_ratio_bound = y/y_tilde

## Dimensionality reduction

In [None]:
# Build the reduced-dimension versions of x with the project module,
# passing the notebook seed.
embeddings = dr.create_embeddings(x=x, seed=seed)

In [None]:
# Pull out the two entries of the result that this notebook names.
# embeddings_df_dict is iterated in the DEA section (one DEA run per entry).
embeddings_df_dict = embeddings['embeddings_df_dict']
# NOTE(review): not read again at notebook level in the cells shown here.
dims_for_embedding_dict = embeddings['dims_for_embedding_dict']

In [None]:
import umap
import warnings

In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides warnings unrelated
# to UMAP; consider narrowing it by category or module.
warnings.filterwarnings('ignore')

In [None]:
def get_dims_for_embedding(x):
    """Choose candidate embedding dimensionalities from the shape of ``x``.

    Four targets are derived from the number of columns ``x.shape[1]``:
    half, square root, natural logarithm and ten percent, each truncated
    to an integer.

    Parameters
    ----------
    x : object exposing ``shape``
        Only ``x.shape[0]`` (rows) and ``x.shape[1]`` (columns) are read.

    Returns
    -------
    dict
        Maps ``'half'``, ``'sqrt'``, ``'log'`` and ``'ten_percent'`` to an
        integer number of dimensions that is always at least 1.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    raw_dims = {
        'half': int(n_cols/2),
        'sqrt': int(np.sqrt(n_cols)),
        'log': int(np.log(n_cols)),
        'ten_percent': int(n_cols*0.1),
    }
    # Correcting for spectral initialization in case the number of
    # dimensions is not less than the number of rows.
    row_cap = n_rows - 2
    # Truncation can produce 0 (e.g. ten percent of fewer than 10 columns)
    # and the row cap can be <= 0 for tiny samples; neither is a usable
    # number of components, so every target is floored at 1.
    return {
        name: max(1, row_cap if dims >= n_rows else dims)
        for name, dims in raw_dims.items()
    }

In [None]:
def reduce_dims(
        x,
        n_components=2,
        n_neighbors=15,
        min_dist=0.1,
        metric='euclidean'):
    """Embed ``x`` with UMAP and translate the result to be non-negative.

    Parameters
    ----------
    x : data accepted by ``umap.UMAP.fit_transform``
    n_components, n_neighbors, min_dist, metric
        Forwarded unchanged to ``umap.UMAP``.

    Returns
    -------
    The fitted embedding. If it contains any negative value, the global
    minimum is subtracted from every entry first.

    Notes
    -----
    ``random_state`` is taken from the notebook-level ``seed`` variable.
    The shape of the embedding is printed as a side effect.
    """
    umap_options = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=seed,
    )
    reducer = umap.UMAP(**umap_options)
    u = reducer.fit_transform(x)
    # One global shift (not per column) so that the smallest entry becomes 0
    has_negative = (u < 0).any()  # type: ignore
    if has_negative:
        u = u - u.min()  # type: ignore
    print(f'Shape of the embedding: {u.shape}')  # type: ignore
    return u

In [None]:
def create_embeddings(x):
    """Build one UMAP embedding of ``x`` per target dimensionality.

    The targets come from ``get_dims_for_embedding(x)``; each one is fitted
    with ``reduce_dims``. Progress is printed along the way.

    Returns
    -------
    dict
        One entry per target name holding its embedding, plus ``'original'``
        holding ``x`` itself.
    """
    print(f'Original shape: {x.shape}')
    target_dims = get_dims_for_embedding(x)
    embeddings_by_name = {}
    for k in target_dims:
        v = target_dims[k]
        print(f'Creating embedding with {v} dimensions ({k})')
        embeddings_by_name[k] = reduce_dims(x, n_components=v)
    # Keep the untouched input alongside the reduced versions
    embeddings_by_name.update(original=x)
    return embeddings_by_name

In [None]:
# Run the notebook-local create_embeddings (defined in the cells above) on x.
# NOTE(review): the DEA section iterates embeddings_df_dict (from
# dr.create_embeddings), not this embeddings_dict — confirm which is intended.
embeddings_dict = create_embeddings(x)

## Calculate DEA efficiency scores: constant returns to scale (CRS), input-oriented (dealib)

In [None]:
# %pip install dealib

In [None]:
from dealib.dea import RTS, Orientation, dea

In [None]:
# Accumulator: embedding name -> efficiency scores, filled by the DEA loop.
efficiency_scores_dict = {}

In [None]:
# Run one DEA per input representation in embeddings_df_dict, with y as the
# outputs, constant returns to scale (RTS.crs) and input orientation.
# Only the `.eff` attribute of each result is kept.
for k, v in embeddings_df_dict.items():
    print(f'Calculating efficiency scores for {k} embedding')
    efficiency_scores_dict[k] = dea(
        v,
        y,
        rts=RTS.crs,
        orientation=Orientation.input,
    ).eff

In [None]:
# Collect the efficiency scores into a DataFrame keyed by embedding name.
comparative_df = pd.DataFrame(efficiency_scores_dict)

In [None]:
# Append the simulated outputs and the reference ratio next to the DEA scores.
# assumes these are (n, 1)-like arrays that .squeeze() flattens to one value
# per row — TODO confirm against data_generating_process.
comparative_df['y'] = data_dict['y'].squeeze()
comparative_df['y_tilde'] = data_dict['y_tilde'].squeeze()
comparative_df['eff_ratio_bound'] = eff_ratio_bound.squeeze()

In [None]:
# Rank rows from the highest value (rank 1) downwards, once for the reference
# ratio and once for the 'half' embedding scores. Ties receive pandas'
# default averaged rank, so ranks may be fractional.
comparative_df['eff_ratio_bound_rank'] = comparative_df['eff_ratio_bound'].rank(ascending=False)
comparative_df['half_rank'] = comparative_df['half'].rank(ascending=False)

In [None]:
# Display the comparison table (scores per embedding plus reference columns).
comparative_df

## Evaluate the results

In [None]:
from sklearn.metrics import mean_absolute_error, top_k_accuracy_score

In [None]:
# Accumulator: embedding name -> mean absolute error against eff_ratio_bound.
mae_dict = {}

In [None]:
# Mean absolute error of each set of DEA scores, using the reference ratio
# eff_ratio_bound as the ground truth.
for k, v in efficiency_scores_dict.items():
    mae_dict[k] = mean_absolute_error(
        y_true=eff_ratio_bound,
        y_pred=v
    )

In [None]:
# Display the MAE obtained for each embedding.
mae_dict

In [None]:
# MAE of the 'original' column of comparative_df against the reference ratio.
# NOTE(review): this looks like the same quantity as mae_dict['original'] —
# confirm and keep only one of the two.
mean_absolute_error(
    y_pred=comparative_df['original'],
    y_true=comparative_df['eff_ratio_bound']
)

In [None]:
# NOTE(review): top_k_accuracy_score is a classification metric. Here y_true
# and labels are rank values and y_score is the single 1-D 'half' score
# column, so it is unclear that this measures top-5 ranking agreement —
# confirm the intent (and that the call runs) before relying on the number.
top_k_accuracy_score(
    y_true=comparative_df['eff_ratio_bound_rank'],
    y_score=comparative_df['half'],
    labels=comparative_df['eff_ratio_bound_rank'].to_list(),
    k=5
)

## Export results

In [None]:
# pd.DataFrame(data_dict['x']).to_csv("legacy/mc_simulation/inputs.csv", index=True)
# pd.DataFrame(data_dict['y']).to_csv("legacy/mc_simulation/outputs.csv", index=False)

In [None]:
# Write the input matrix x to CSV, keeping the row index as the first column.
# NOTE(review): the path is relative to the current working directory and is
# not built from mc_simulation_dir defined at the top of the notebook —
# confirm the intended destination and that the directory exists.
pd.DataFrame(x).to_csv("mc_simulation/inputs.csv", index=True)

In [None]:
# Write the outputs y to CSV without the row index.
# NOTE(review): relative path, independent of mc_simulation_dir — confirm the
# intended destination.
pd.DataFrame(y).to_csv("mc_simulation/outputs.csv", index=False)

In [None]:
# Standalone 40-dimensional UMAP embedding of x, produced for export.
# random_state is pinned to the notebook seed so that Restart & Run All
# regenerates the same embedding, matching what reduce_dims already does.
# NOTE(review): unlike reduce_dims, no non-negativity shift is applied here —
# confirm that is intended for the exported file.
fit = umap.UMAP(
    n_components=40,
    random_state=seed,
)
u = fit.fit_transform(x)

In [None]:
# Write the 40-dimensional UMAP embedding to CSV, keeping the row index.
# NOTE(review): relative path, independent of mc_simulation_dir — confirm the
# intended destination.
pd.DataFrame(u).to_csv("mc_simulation/inputs_umap_40_dims.csv", index=True)