In [16]:
# -----------------------------------------------------------------------------
# Created on Thu Oct 14 10:55:36 2021
# @author: Hamed Soleimani
# The University of Melbourne
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xlwt
import os
import glob
import time
from collections import Counter

start = time.time()

# -----------------------------------------------------------------------------
# Entropy of the ideal portfolio
# -----------------------------------------------------------------------------
def hbest(nalgos):
    return -np.log(1/nalgos)

# -----------------------------------------------------------------------------
# Entropy function
# -----------------------------------------------------------------------------
def getEntropy(dataTest,portfolio,epsilon):
    ninst = len(dataTest)
    nalgos = len(portfolio)
    Ybin = np.zeros((ninst,nalgos))
    for i in range(ninst):
        tmp_min = np.min(dataTest.iloc[i])

        for j in range(nalgos):
            if dataTest.iloc[i,j]==tmp_min:
                Ybin[i,j] = 1
            elif tmp_min!=0 and ((dataTest.iloc[i,j]-tmp_min)/tmp_min)<=epsilon:
                Ybin[i,j] = 1
    
    Pgood = np.mean(Ybin, axis=0)
    aux = Pgood*np.log(Pgood)
    aux[np.isnan(aux)] = 0
    return -np.sum(aux)

# -----------------------------------------------------------------------------
# Regret function
# -----------------------------------------------------------------------------
def getRegret(dataTest,portfolio):
    eps = np.finfo(float).eps
    ninst = len(dataTest)
    regret=0
    for i in range(0,ninst):
        aux = 0
        aux = np.min(dataTest.iloc[i,:][(int(s) for s in portfolio)])

        if np.min(dataTest.iloc[i,:])==0:
            aux = aux/eps
        else:
            aux = aux/np.min(dataTest.iloc[i,:]) #OR test_no_CV

        regret = max(regret, aux)
        
    return regret

# -----------------------------------------------------------------------------
# TopN function
# -----------------------------------------------------------------------------
def TopN(data, maxCardinality):
    ninst = data.shape[0]
    Topi = np.zeros(ninst)
    for i in range(0,ninst):
        Topi[i] = np.argmin(data.iloc[i,:])
    
    Topi_best = Counter(Topi).most_common()
    Topi_N = [t[0] for t in Topi_best]
    return Topi_N[0:maxCardinality]

# -----------------------------------------------------------------------------
# Main script
# -----------------------------------------------------------------------------
# rootdir = './'
rootdir = 'C:/Users/mariom1/gitrepos/Algorithm_Portfolio/'
filelist = glob.glob(rootdir+'aslib/*.csv') #Reading metadata csv file in the folder (root folder)
nfolds = 10
epsilon = [0.00,0.05,0.10,0.20,0.50,1.00,2.00] #becasue of Shanon calculation
wb = xlwt.Workbook()

for f in filelist:
    crow = 0
    scenario_name = str(f)
    scenario_name = f[f.find("metadata_")+9:-4]
    ws = wb.add_sheet(scenario_name)
    ws.write(crow, 0, 'Epsilon')
    ws.write(crow, 1, 'Cardinality')
    ws.write(crow, 2, 'CV_mean_regret')
    ws.write(crow, 3, 'CV_std_regret')
    ws.write(crow, 4, 'CV_mean_entropy')
    ws.write(crow, 5, 'CV_std_entropy')

    metadata = pd.read_csv(f)

    ninst = len(metadata) # number of the instances

    algorithms_df = pd.DataFrame(metadata.loc[:,['algo' in ii for ii in metadata.columns]])  #the columns contain "algo" inside

    algolabels = algorithms_df.columns
    nalgos = len(algolabels) # number of the algorithms

    for ii in range(6,nalgos+6):
        ws.write(crow, ii, algolabels[ii-6])

    p = algorithms_df.copy()
    p = p.dropna(axis=0, how='all')     #if all of the value of the algo are NaN, that instance has to be removed
    p = p.reset_index(drop=True)
    p = p.fillna(1e9)

    # cross validation loop:
    df_cv = pd.read_csv(rootdir+'CV/CV_metadata_'+scenario_name+'.csv')
    metadata_cv = p.copy()
    
    for K in range(1,nalgos+1):
        regret = np.zeros(nfolds)
        entropy = np.zeros((nfolds,len(epsilon)))
        selected = np.zeros(nalgos)

        for c in range(1,nfolds+1):
            test = metadata_cv.loc[(df_cv['CV']==c)]
            train = metadata_cv.loc[(df_cv['CV']!=c)]
            
            n_train = len(train)
            n_test = len(test)
            
            portfolio = TopN(train,K)
            
            regret[c-1] = getRegret(test, portfolio)
            tmp_index_TOPK = []
            for i in portfolio:
                tmp_index_TOPK.append(int(i))

            selected[tmp_index_TOPK] = selected[tmp_index_TOPK]+1
            
            for ii in range(0,len(epsilon)):
                if K!=1:
                    entropy[c-1,ii] = getEntropy(test, portfolio, epsilon[ii])/hbest(K)
                else:
                    entropy[c-1,ii] = np.nan
            
        cv_mean_regret = np.mean(regret)
        cv_std_regret = np.std(regret)
        cv_mean_entropy = np.mean(entropy, axis=0)
        cv_std_entropy = np.mean(entropy, axis=0)
        pcent_selected = selected/nfolds
            
        for ii in range(0,len(epsilon)):
            crow = crow+1
            ws.write(crow, 0, epsilon[ii])
            ws.write(crow, 1, K)
            ws.write(crow, 2, cv_mean_regret)
            ws.write(crow, 3, cv_std_regret)
            ws.write(crow, 4, cv_mean_entropy[ii])
            ws.write(crow, 5, cv_std_entropy[ii])
            for jj in range(6,nalgos+6):
                ws.write(crow, jj, pcent_selected[jj-6])
    
    
wb.save('TopK.xls')
    
process_time=time.time()-start
print("Time in min=",process_time/3660)





Time in min= 2.218223877422145
