In [None]:
import os
import pandas as pd
import numpy as np
import json
from uuid import uuid4

import data_generating_process as dgp
import dimensionality_reduction as dr
import calculate_dea as cdea
import evaluate_results as er

In [None]:
# Random identifier for this run; it is embedded in the output file names below.
run_serial = f'{uuid4()}'

In [None]:
# Root folder of the project. The original location is kept as the default so
# existing runs behave the same; set the HIGH_DIM_DEA_DIR environment variable
# to run the notebook from another machine or checkout without editing code.
parent_dir = os.environ.get(
    'HIGH_DIM_DEA_DIR',
    '/Users/juanmalagon/repos/high-dim-dea',
)
# Output files of this notebook are written under <parent_dir>/results.
results_dir = os.path.join(parent_dir, 'results')

In [None]:
## Set up initial variables

# --- Simulation control ---
seed = 42
nr_simulations = 1000   # number of simulations

# --- Problem size ---
N = 100                 # number of inputs
M = 1                   # number of outputs
n = 200                 # number of DMUs

# --- Data generating process parameters ---
alpha_1 = 1 / N
gamma = 1
sigma_u = 0.5

# --- DEA settings ---
rts = "crs"             # return to scale
orientation = "input"

In [None]:
# Seed NumPy's global random state with the configured seed.
np.random.seed(seed=seed)

In [None]:
# Echo the run configuration: a header, then one "label: value" line per setting.
print('INITIAL SETUP \n')
print('\n'.join(
    f'{label}: {value}'
    for label, value in (
        ('Number of inputs', N),
        ('Number of outputs', M),
        ('Number of DMUs', n),
        ('Parameter alpha_1', alpha_1),
        ('Parameter gamma', gamma),
        ('Parameter sigma_u', sigma_u),
        ('Return to scale', rts),
        ('Orientation', orientation),
        ('Number of simulations', nr_simulations),
        ('Seed', seed),
    )
))

In [None]:
# Bundle the configuration into one dict: it is handed to run_simulation and
# later dumped to JSON alongside the results.
params_dict = dict(
    N=N,
    M=M,
    n=n,
    alpha_1=alpha_1,
    gamma=gamma,
    sigma_u=sigma_u,
    rts=rts,
    orientation=orientation,
    nr_simulations=nr_simulations,
    seed=seed,
)

In [None]:
def run_simulation(params_dict):
    """Run one simulation end to end and return its evaluation table.

    Stages, in order: generate a data set with ``dgp.generate_data_dict``,
    build embeddings of ``x`` with ``dr.create_embeddings``, compute DEA
    efficiency scores per embedding with
    ``cdea.calculate_dea_for_embeddings``, and evaluate those scores against
    the ratio ``y / y_tilde`` with ``er.create_evaluation_df``.

    Parameters
    ----------
    params_dict : dict
        Must provide the keys 'N', 'M', 'n', 'alpha_1', 'gamma', 'sigma_u',
        'rts', 'orientation' and 'seed'. Other keys are ignored.

    Returns
    -------
    Whatever ``er.create_evaluation_df`` returns for this simulation.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``params_dict``.
    """
    # Resolve every setting up front so a missing key fails before any work.
    dgp_settings = {
        key: params_dict[key]
        for key in ('N', 'M', 'n', 'alpha_1', 'gamma', 'sigma_u')
    }
    dea_settings = {key: params_dict[key] for key in ('rts', 'orientation')}
    embedding_seed = params_dict['seed']

    # Stage 1 - data generating process
    generated = dgp.generate_data_dict(verbose=False, **dgp_settings)
    inputs_x = generated["x"]
    outputs_y = generated["y"]
    # Reference efficiency: ratio of y to y_tilde, squeezed.
    reference_efficiency = (outputs_y / generated["y_tilde"]).squeeze()

    # Stage 2 - dimensionality reduction
    reduced = dr.create_embeddings(x=inputs_x, seed=embedding_seed)

    # Stage 3 - DEA on every embedding
    scores_by_embedding = cdea.calculate_dea_for_embeddings(
        embeddings_df_dict=reduced['embeddings_df_dict'],
        y=outputs_y,
        **dea_settings,
    )

    # Stage 4 - evaluate the DEA scores against the reference efficiency
    return er.create_evaluation_df(
        efficiency_scores_dict=scores_by_embedding,
        efficiency_score_by_design=reference_efficiency,
        dims_for_embedding_dict=reduced['dims_for_embedding_dict'],
    )

In [None]:
# Repeat the simulation nr_simulations times. Successful iterations contribute
# one evaluation frame (tagged with its iteration number); failed iterations
# only have their index recorded so the run can continue.
evaluation_df_list = []
errors_list = []

for i in range(nr_simulations):
    print(f'Iteration {i}')
    try:
        evaluation_df = run_simulation(params_dict)
        evaluation_df['iteration'] = i
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop, and report the
        # cause instead of discarding it.
        errors_list.append(i)
        print(f'Error in iteration {i}: {exc!r}')
    else:
        evaluation_df_list.append(evaluation_df)

In [None]:
# Preview only the first few per-iteration frames; echoing the whole list
# would render one table per successful iteration in the cell output.
evaluation_df_list[:3]

In [None]:
# Build the failure table first so it is available even if nothing succeeded.
errors_list_df = pd.DataFrame(errors_list, columns=['iteration'])

# pd.concat raises an uninformative "No objects to concatenate" on an empty
# list; fail with a message that points at the actual cause instead.
if not evaluation_df_list:
    raise ValueError(
        f'No simulation succeeded ({len(errors_list)} iterations failed); '
        'there are no evaluation results to combine.'
    )

# Stack the per-iteration evaluation frames into a single frame.
evaluation_df = pd.concat(evaluation_df_list)

In [None]:
# Create the output folder if needed so the results of a long run are not
# lost to a missing directory.
os.makedirs(results_dir, exist_ok=True)

# Persist the combined evaluation results, the parameters used for the run and
# the list of failed iterations; all three files share the run identifier.
evaluation_df.to_csv(os.path.join(results_dir, f'evaluation_df_{run_serial}.csv'), index=True)
with open(os.path.join(results_dir, f'params_dict_{run_serial}.json'), 'w') as fp:
    json.dump(params_dict, fp)
errors_list_df.to_csv(os.path.join(results_dir, f'errors_list_{run_serial}.csv'), index=False)

In [None]:
# Mean and standard deviation of every evaluation metric, grouped by `dims`.
summary_df = (
    evaluation_df
    .groupby('dims')
    .agg({
        metric: ['mean', 'std']
        for metric in ('mae', 'spearmanr', 'pearsonr', 'kendalltau')
    })
    .reset_index()
)

In [None]:
# Replace the two-level (metric, statistic) column labels with flat
# "<metric>_<statistic>" names, keeping 'dims' as the first column.
summary_df.columns = ['dims'] + [
    f'{metric}_{stat}'
    for metric in ('mae', 'spearmanr', 'pearsonr', 'kendalltau')
    for stat in ('mean', 'std')
]

In [None]:
# Write the summary table to the results folder, tagged with the run identifier.
summary_csv_path = os.path.join(results_dir, f'summary_df_{run_serial}.csv')
summary_df.to_csv(summary_csv_path, index=False)