In [None]:
## Test Team Formation framework with real datasets
## Balancing Task Coverage vs. Maximum Expert Load
## Karan Vombatkere, Spring 2022

#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, time, random
import TeamFormationProblem as TFP

#### Freelancer Dataset

In [None]:
#Import Freelancer data
#Freelancer from DropBox link: https://www.dropbox.com/sh/8zpsi1etvvvvj5k/AAD-J9ZQmSsbnSmEILBMD9uxa/datasets/real?dl=0&subfolder_nav_tracking=1
#freelance_experts.csv and freelance_projects.csv

def extract_skills(row):
    """Return the positions (as strings) of every entry in ``row`` equal to 1.

    row: any iterable of values, e.g. one row of a 0/1 skill-indicator table.
    Returns a list of str indices in ascending positional order; entries that
    are not equal to 1 are ignored.
    """
    return [str(position) for position, flag in enumerate(row) if flag == 1]

    
def importFreelancerData(experts_filename='datasets/freelancer/freelancer_experts.csv', tasks_filename='datasets/freelancer/freelancer_projects.csv'):
    """Load the Freelancer task and expert CSV files as lists of skill lists.

    Both files are read without a header row. Every row is passed to
    ``extract_skills``, so each skill is the string position of a cell equal to 1.

    Returns ``(task_skills_list, expert_skills_list)`` -- tasks first.
    """
    def load_skill_lists(csv_path, shape_label, column_name):
        # Read one headerless CSV, report its shape, then derive one skill list per row.
        frame = pd.read_csv(csv_path, header=None)
        print(shape_label, frame.shape)
        frame[column_name] = frame.apply(extract_skills, axis=1)
        return frame[column_name].to_list()

    # Tasks are loaded before experts, matching the order of the printed shapes.
    task_skills_list = load_skill_lists(tasks_filename, "Freelancer tasks df shape: ", 'Task_Skills')
    expert_skills_list = load_skill_lists(experts_filename, "Freelancer experts df shape: ", 'Expert_Skills')

    print("Imported Freelancer dataset. Num Experts={}, Num Tasks={}".format(len(expert_skills_list),len(task_skills_list)))

    return task_skills_list, expert_skills_list
    

In [None]:
# Load the Freelancer data and build the problem instance from it.
# importFreelancerData returns (tasks, experts): t holds the task skill lists, e the expert skill lists.
t,e = importFreelancerData()
FreelancerTest = TFP.TeamFormationProblem(t, e)

In [None]:
# Run the lazy-greedy, random, no-update-greedy and task-greedy assignments on the Freelancer instance with lambdaVal=0.1.
# The three returned objects are kept as notebook globals (runtimes, objective values, workloads -- per the variable names; confirm against TeamFormationProblem).
runtimeDict, F_vals, workLoad_vals = FreelancerTest.computeTaskAssigment(algorithms=['lazy_greedy', 'random', 'no_update_greedy','task_greedy'], lambdaVal=0.1)

In [None]:
# Run the set-cover LP routine on the Freelancer instance.
# NOTE(review): the argument 10 is presumably a number of rounds (covLoadList_f is printed one "Round" per entry below) -- confirm in TeamFormationProblem.
taMatList, covLoadList_f = FreelancerTest.setCoverLPTaskCoverage(10)

In [None]:
lambda_val = 0.1
# For every entry of covLoadList_f, print coverage (vals[0]), max load (vals[1]) and the objective lambda_val*coverage - max load.
round_report = "Round {}: Coverage={:.2f}, Max_Load={}, Objective(F)={:.2f}"
for r, vals in enumerate(covLoadList_f):
    coverage, max_load = vals[0], vals[1]
    print(round_report.format(r+1, coverage, max_load, (lambda_val*coverage - max_load)))

In [None]:
# for i in range(len(taMatList)):
#     sumVal = 0
#     for arr in taMatList[i]:
#         sumVal += sum(arr)
#     print(sumVal)

In [None]:
# freelancerCovList = FreelancerTest.getCoverageValues()

#### Guru Dataset

In [None]:
#Guru Dataset
def extract_skills_guru(row):
    """Collect, as strings, the indices at which ``row`` holds the value 1.

    row: any iterable of values (one row of the Guru indicator table).
    Returns a list of str indices in ascending order.
    """
    return [str(idx) for idx, cell in enumerate(row) if cell == 1]

def importGuruData(experts_filename='datasets/guru/guru_experts.csv', tasks_filename='datasets/guru/guru_tasks.csv'):
    """Load the Guru task and expert CSV files as lists of skill lists.

    Both files are read without a header row and each row is converted with
    ``extract_skills_guru``. The last row of each file is discarded before
    returning (the printed shapes are taken before the drop, the printed
    counts after it).
    NOTE(review): the reason for dropping the final row is not visible here --
    possibly a trailing non-data row; confirm against the dataset.

    Returns ``(task_skills_list, expert_skills_list)`` -- tasks first.
    """
    # Tasks: one skill list per CSV row, minus the final row
    tasks_frame = pd.read_csv(tasks_filename, header=None)
    print("Guru tasks df shape: ", tasks_frame.shape)
    tasks_frame['Task_Skills'] = tasks_frame.apply(extract_skills_guru, axis=1)
    task_skills_list = tasks_frame['Task_Skills'].to_list()[:-1]

    # Experts: same treatment
    experts_frame = pd.read_csv(experts_filename, header=None)
    print("Guru experts df shape: ", experts_frame.shape)
    experts_frame['Expert_Skills'] = experts_frame.apply(extract_skills_guru, axis=1)
    expert_skills_list = experts_frame['Expert_Skills'].to_list()[:-1]

    print("Imported Guru dataset. Num Experts={}, Num Tasks={}".format(len(expert_skills_list),len(task_skills_list)))

    return task_skills_list, expert_skills_list
    

In [None]:
# Load the Guru data and build the problem instance from it.
# This rebinds the notebook-global t (task skill lists) and e (expert skill lists) set by the Freelancer cell.
t,e = importGuruData()
GuruTest = TFP.TeamFormationProblem(t, e)

In [None]:
# Run only the no-update-greedy assignment on the Guru instance with lambdaVal=0.1 (the lazy-greedy variant is kept below, disabled).
# This overwrites runtimeDict, F_vals and workLoad_vals from the Freelancer run.
runtimeDict, F_vals, workLoad_vals = GuruTest.computeTaskAssigment(algorithms=['no_update_greedy'], lambdaVal=0.1)
# runtimeDict, F_vals, workLoad_vals = GuruTest.computeTaskAssigment(algorithms=['lazy_greedy'], lambdaVal=0.1)

In [None]:
# Run the set-cover LP routine on the Guru instance; this overwrites taMatList from the Freelancer run.
# NOTE(review): the argument 20 is presumably a number of rounds -- confirm in TeamFormationProblem.
taMatList, covLoadList = GuruTest.setCoverLPTaskCoverage(20)

In [None]:
lambda_val = 0.1
# One line per entry of covLoadList: coverage is vals[0], max load is vals[1], objective is lambda_val*coverage - max load.
for r, vals in enumerate(covLoadList):
    objective_value = lambda_val*vals[0] - vals[1]
    report_line = "Round {}: Coverage={:.2f}, Max_Load={}, Objective(F)={:.2f}".format(r+1, vals[0], vals[1], objective_value)
    print(report_line)

#### IMDB Datasets

In [None]:
#Import IMDB Data
def importIMDBData(experts_filename, tasks_filename):
    """Read the IMDB expert and task skill lists from two JSON files.

    experts_filename: path to a JSON document holding the expert skill lists.
    tasks_filename: path to a JSON document holding the task skill lists.

    Prints the number of experts and tasks, then returns
    ``(task_skills_list, expert_skills_list)`` -- tasks first, although the
    experts file is the first argument.
    """
    with open(experts_filename, 'r') as experts_fp:
        expert_skills_list = json.load(experts_fp)

    with open(tasks_filename, 'r') as tasks_fp:
        task_skills_list = json.load(tasks_fp)

    summary = "Imported IMDB dataset. Num Experts={}, Num Tasks={}".format(len(expert_skills_list),len(task_skills_list))
    print(summary)

    return task_skills_list, expert_skills_list

#Run algorithm on IMDB datasets
def testIMDBDatasets(write_flag, algoList):
    """Run the given assignment algorithms on the IMDB 2015, 2018 and 2020 datasets.

    For each year the expert/task files under ``datasets/imdb/`` are loaded with
    ``importIMDBData``; only the first 600 tasks and first 100 experts are used.

    Parameters
    ----------
    write_flag : bool
        When true, a summary of every run is appended to
        ``experiments/imdb_<timestamp>.txt``.
    algoList : list of str
        Algorithm names forwarded to ``computeTaskAssigment``.
        NOTE(review): the written summary indexes the result dicts with the keys
        'lazyGreedy', 'noUpdateGreedy', 'taskGreedy', 'random' (and 'total' for
        runtimes), so writing presumably requires all four algorithms to have
        been run -- confirm against TeamFormationProblem.

    Returns
    -------
    None
    """
    imdb_data_path = 'datasets/imdb/'
    movieYears = [2015, 2018, 2020]

    outfile_imdb = None
    try:
        if write_flag:
            runTimeStamp = str(time.strftime("%m-%d-%H:%M:%S", time.localtime(time.time())))
            # NOTE(review): the ':' characters in the timestamp are not valid in Windows file names.
            imdb_outfilename = "experiments/imdb_" + runTimeStamp + ".txt"
            outfile_imdb = open(imdb_outfilename, "a")
            outfile_imdb.write("IMDB dataset Team-Formation Algorithms: {}\n".format(runTimeStamp))

        for y in movieYears:
            experts_file = imdb_data_path + 'imdb_experts_' + str(y) + '.txt'
            tasks_file = imdb_data_path + 'imdb_tasks_' + str(y) + '.txt'
            print("IMDB Dataset: {}, {}".format('imdb_experts_' + str(y), 'imdb_tasks_' + str(y)))

            imdb_tasks, imdb_experts = importIMDBData(experts_file, tasks_file)
            IMDBTest = TFP.TeamFormationProblem(imdb_tasks[0:600], imdb_experts[0:100])

            # Other call sites in this notebook unpack three return values from
            # computeTaskAssigment; accept both shapes and treat the coverage
            # list as optional instead of failing on a length mismatch.
            results = IMDBTest.computeTaskAssigment(algorithms=algoList, plot_flag=False)
            rt_dict, f_dict, workload_dict = results[:3]
            coverageList = results[3] if len(results) > 3 else []

            # Each value is rounded to 2 decimals and prefixed with ", " (so the string starts with ", ").
            coverageListString = "".join(", " + str(np.round(c_i, 2)) for c_i in coverageList)
            print(coverageListString)

            if outfile_imdb is None:
                continue

            #Write output to file
            # The format strings are single-line literals so that source
            # indentation is not embedded in the written text.
            runInfo = "\nIMDB movieYear = {}, Experts = {}, Tasks = {}".format(str(y), str(IMDBTest.n), str(IMDBTest.m))
            outfile_imdb.write(runInfo)

            f_info = "\nAlgorithm Objectives (F_max): Lazy Greedy = {}; No-Update-Greedy = {}; Task Greedy = {}; Random = {};".format(
                f_dict['lazyGreedy'], f_dict['noUpdateGreedy'], f_dict['taskGreedy'], f_dict['random'])
            outfile_imdb.write(f_info)

            wload_info = "\nAlgorithm optimal workloads: Lazy Greedy = {}; No-Update-Greedy = {}; Task Greedy = {}; Random = {};".format(
                workload_dict['lazyGreedy'], workload_dict['noUpdateGreedy'], workload_dict['taskGreedy'], workload_dict['random'])
            outfile_imdb.write(wload_info)

            runtimeInfo = "\nAlgorithm Runtimes: Total = {:.3f}s; Lazy Greedy = {:.3f}s; No-Update-Greedy = {:.3f}s; Task Greedy = {:.3f}s; Random = {:.3f}s;\n".format(
                rt_dict['total'], rt_dict['lazyGreedy'], rt_dict['noUpdateGreedy'], rt_dict['taskGreedy'], rt_dict['random'])
            outfile_imdb.write(runtimeInfo)

            outfile_imdb.write("\nCoverage List: {}".format(coverageListString))
    finally:
        # Close the summary file even when a dataset run raises.
        if outfile_imdb is not None:
            outfile_imdb.close()

    return None
    

In [None]:
# testIMDBDatasets(write_flag=True, algoList=['lazy_greedy', 'random', 'no_update_greedy', 'task_greedy'])

In [None]:
# Load the complete IMDB 2015 expert/task lists and build a problem instance with max_workload_threshold=100
imdb_data_path = 'datasets/imdb/'
y = 2015
experts_file = ''.join([imdb_data_path, 'imdb_experts_', str(y), '.txt'])
tasks_file = ''.join([imdb_data_path, 'imdb_tasks_', str(y), '.txt'])
dataset_banner = "IMDB Dataset: {}, {}".format('imdb_experts_' + str(y), 'imdb_tasks_' + str(y))
print(dataset_banner)

# importIMDBData returns (tasks, experts)
imdb_tasks, imdb_experts = importIMDBData(experts_file, tasks_file)
IMDBTest = TFP.TeamFormationProblem(imdb_tasks, imdb_experts, max_workload_threshold=100)


In [None]:
# Run the set-cover LP routine on the IMDB 2015 instance; this overwrites taMatList from earlier cells.
# NOTE(review): the argument 10 is presumably a number of rounds -- confirm in TeamFormationProblem.
taMatList, covLoadList_i = IMDBTest.setCoverLPTaskCoverage(10)

In [None]:
lambda_val = 0.05
# Print coverage (vals[0]), max load (vals[1]) and the objective lambda_val*coverage - max load for each entry of covLoadList_i.
imdb_round_template = "Round {}: Coverage={:.2f}, Max_Load={}, Objective(F)={:.2f}"
for r, vals in enumerate(covLoadList_i):
    imdb_objective = lambda_val*vals[0] - vals[1]
    print(imdb_round_template.format(r+1, vals[0], vals[1], imdb_objective))

In [None]:
# Run the random, no-update-greedy and task-greedy assignments on the IMDB 2015 instance with lambdaVal=0.05
# (lazy_greedy is not in this list). Overwrites runtimeDict, F_vals and workLoad_vals from earlier runs.
runtimeDict, F_vals, workLoad_vals = IMDBTest.computeTaskAssigment(algorithms=['random', 'no_update_greedy', 'task_greedy'], lambdaVal=0.05)
#runtimeDict, F_vals, workLoad_vals = IMDBTest.computeTaskAssigment(algorithms=['no_update_greedy'], lambdaVal=0.1)

In [None]:
#covDict = IMDBCoverages.getStepCoverageValues()

In [None]:
# Reload IMDB 2015 and run the lambda sweep on a subset: first 1000 tasks, experts 300..599, max_workload_threshold=100
imdb_data_path = 'datasets/imdb/'
y = 2015
experts_file = ''.join([imdb_data_path, 'imdb_experts_', str(y), '.txt'])
tasks_file = ''.join([imdb_data_path, 'imdb_tasks_', str(y), '.txt'])
print("IMDB Dataset: {}, {}".format('imdb_experts_' + str(y), 'imdb_tasks_' + str(y)))

imdb_tasks, imdb_experts = importIMDBData(experts_file, tasks_file)
IMDBLambdaTest = TFP.TeamFormationProblem(
    imdb_tasks[:1000],
    imdb_experts[300:600],
    max_workload_threshold=100,
)

# t_maxArr and f_maxArr are plotted in the next cell; t_arr and f_dict are only used by its disabled lines.
t_arr, f_dict, t_maxArr, f_maxArr = IMDBLambdaTest.testLambdaTaskAssignment(algorithms=['lazy_greedy'])


In [None]:
#Plot F_i for different Lambda for Lazy Greedy
# Draws the (t_maxArr, f_maxArr) points returned by testLambdaTaskAssignment and labels each point with a lambda value.
plt.figure(figsize=(9,6))
#for l_val in f_dict.keys():
    #plt.plot(t_arr, f_dict[l_val], label='Lambda={:.3f}'.format(l_val))

# Plot the max values
plt.plot(t_maxArr, f_maxArr, '--*', label='Max F_i')

# title_text = 'Lazy Greedy Performance by varying Lambda (IMDB_2015)'
# plt.title(title_text, fontsize=12)
plt.xlabel('Workload Threshold, T_i', fontsize=12)
# NOTE(review): the y-axis says coverage while the plotted series is labelled 'Max F_i' -- confirm which quantity f_maxArr holds.
plt.ylabel('Coverage, C(A)', fontsize=12)


# NOTE(review): the leading 0.3 breaks the otherwise increasing sequence -- possibly a typo; confirm against
# the lambda values actually used by testLambdaTaskAssignment.
lambda_arr = [0.3, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]

# Pair every plotted point with its lambda label. Zipping all three sequences avoids the manual counter
# (which raised IndexError when there were more points than labels) and keeps the loop from rebinding the
# notebook-global dataset year `y`.
for t_max, f_max, lambda_label in zip(t_maxArr, f_maxArr, lambda_arr):
    plt.annotate("{:.1f}".format(lambda_label), # this is the text
                 (t_max, f_max), # these are the coordinates to position the label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='center') # horizontal alignment can be left, right or center

plt.legend(loc='lower right')
plt.show()

#### Bibsonomy Datasets

In [None]:
#Import Bibsonomy datasets
def importBibsonomyData(experts_filename, tasks_filename):
    """Read the Bibsonomy expert and task skill lists from two JSON files.

    experts_filename: path to a JSON document holding the expert skill lists.
    tasks_filename: path to a JSON document holding the task skill lists.

    Prints the number of experts and tasks, then returns
    ``(task_skills_list, expert_skills_list)`` -- tasks first, although the
    experts file is the first argument.
    """
    with open(experts_filename, 'r') as experts_fp:
        expert_skills_list = json.load(experts_fp)

    with open(tasks_filename, 'r') as tasks_fp:
        task_skills_list = json.load(tasks_fp)

    num_experts, num_tasks = len(expert_skills_list), len(task_skills_list)
    print("Imported Bibsonomy dataset. Num Experts={}, Num Tasks={}".format(num_experts, num_tasks))

    return task_skills_list, expert_skills_list

#Run algorithm on Bibsonomy datasets
def testBibsonomyDatasets(write_flag, algoList):
    """Run the given assignment algorithms on the Bibsonomy 2010, 2015 and 2020 datasets.

    For each year the expert/task files under ``datasets/bibsonomy/`` are loaded
    with ``importBibsonomyData``; only the first 500 tasks and first 200 experts
    are used.

    Parameters
    ----------
    write_flag : bool
        When true, a summary of every run is appended to
        ``experiments/bibsonomy_<timestamp>.txt``.
    algoList : list of str
        Algorithm names forwarded to ``computeTaskAssigment``.
        NOTE(review): the written summary indexes the result dicts with the keys
        'lazyGreedy', 'noUpdateGreedy', 'taskGreedy', 'random' (and 'total' for
        runtimes), so writing presumably requires all four algorithms to have
        been run -- confirm against TeamFormationProblem.

    Returns
    -------
    None
    """
    bibsonomy_data_path = 'datasets/bibsonomy/'
    paperYears = [2010, 2015, 2020]

    outfile_bibsonomy = None
    try:
        if write_flag:
            runTimeStamp = str(time.strftime("%m-%d-%H:%M:%S", time.localtime(time.time())))
            # NOTE(review): the ':' characters in the timestamp are not valid in Windows file names.
            bibs_outfilename = "experiments/bibsonomy_" + runTimeStamp + ".txt"
            outfile_bibsonomy = open(bibs_outfilename, "a")
            outfile_bibsonomy.write("Bibsonomy dataset Team-Formation Algorithms: {}\n".format(runTimeStamp))

        for y in paperYears:
            experts_file = bibsonomy_data_path + 'bibsonomy_experts_' + str(y) + '.txt'
            tasks_file = bibsonomy_data_path + 'bibsonomy_tasks_' + str(y) + '.txt'
            print("\nBibsonomy Dataset: {}, {}".format('bibsonomy_experts_' + str(y), 'bibsonomy_tasks_' + str(y)))

            bib_tasks, bib_experts = importBibsonomyData(experts_file, tasks_file)
            BibsonomyTest = TFP.TeamFormationProblem(bib_tasks[0:500], bib_experts[0:200])

            # Only the first three results are used; slicing tolerates a longer
            # return value (another call site in this notebook unpacks four).
            results = BibsonomyTest.computeTaskAssigment(algorithms=algoList, plot_flag=False)
            rt_dict, f_dict, workload_dict = results[:3]

            if outfile_bibsonomy is None:
                continue

            #Write output to file
            # The format strings are single-line literals so that source
            # indentation is not embedded in the written text.
            runInfo = "\nBibsonomy paperYear = {}, Experts = {}, Tasks = {}".format(str(y), str(BibsonomyTest.n), str(BibsonomyTest.m))
            outfile_bibsonomy.write(runInfo)

            f_info = "\nAlgorithm Objectives (F_max): Lazy Greedy = {}; No-Update-Greedy = {}; Task Greedy = {}; Random = {};".format(
                f_dict['lazyGreedy'], f_dict['noUpdateGreedy'], f_dict['taskGreedy'], f_dict['random'])
            outfile_bibsonomy.write(f_info)

            wload_info = "\nAlgorithm optimal workloads: Lazy Greedy = {}; No-Update-Greedy = {}; Task Greedy = {}; Random = {};".format(
                workload_dict['lazyGreedy'], workload_dict['noUpdateGreedy'], workload_dict['taskGreedy'], workload_dict['random'])
            outfile_bibsonomy.write(wload_info)

            runtimeInfo = "\nAlgorithm Runtimes: Total = {:.3f}s; Lazy Greedy = {:.3f}s; No-Update-Greedy = {:.3f}s; Task Greedy = {:.3f}s; Random = {:.3f}s;\n".format(
                rt_dict['total'], rt_dict['lazyGreedy'], rt_dict['noUpdateGreedy'], rt_dict['taskGreedy'], rt_dict['random'])
            outfile_bibsonomy.write(runtimeInfo)
    finally:
        # Close the summary file even when a dataset run raises.
        if outfile_bibsonomy is not None:
            outfile_bibsonomy.close()

    return None


In [None]:
#testBibsonomyDatasets(write_flag=True, algoList=['lazy_greedy', 'random', 'no_update_greedy', 'task_greedy'])

In [None]:
# Load the complete Bibsonomy 2010 expert/task lists and build the problem instance from them
bibsonomy_data_path = 'datasets/bibsonomy/'
y=2010
experts_file = ''.join([bibsonomy_data_path, 'bibsonomy_experts_', str(y), '.txt'])
tasks_file = ''.join([bibsonomy_data_path, 'bibsonomy_tasks_', str(y), '.txt'])
bibsonomy_banner = "\nBibsonomy Dataset: {}, {}".format('bibsonomy_experts_' + str(y), 'bibsonomy_tasks_' + str(y))
print(bibsonomy_banner)

# importBibsonomyData returns (tasks, experts)
bib_tasks, bib_experts = importBibsonomyData(experts_file, tasks_file)
BibsonomyTest = TFP.TeamFormationProblem(bib_tasks, bib_experts)

In [None]:
# Run the set-cover LP routine on the Bibsonomy 2010 instance; this overwrites taMatList from earlier cells.
# NOTE(review): the argument 10 is presumably a number of rounds -- confirm in TeamFormationProblem.
taMatList, covLoadList_b = BibsonomyTest.setCoverLPTaskCoverage(10)

In [None]:
lambda_val = 0.1
# Print coverage (vals[0]), max load (vals[1]) and the objective lambda_val*coverage - max load for each entry of covLoadList_b.
for r, vals in enumerate(covLoadList_b):
    round_coverage = vals[0]
    round_max_load = vals[1]
    round_objective = lambda_val*round_coverage - round_max_load
    print("Round {}: Coverage={:.2f}, Max_Load={}, Objective(F)={:.2f}".format(r+1, round_coverage, round_max_load, round_objective))

In [None]:
# Run the random, no-update-greedy and task-greedy assignments on the Bibsonomy 2010 instance.
# NOTE(review): lambdaVal here is 0.05 while the round report for covLoadList_b uses lambda_val = 0.1 -- confirm the two are meant to differ.
runtimeDict, F_vals, workLoad_vals = BibsonomyTest.computeTaskAssigment(algorithms=['random', 'no_update_greedy', 'task_greedy'], lambdaVal=0.05)

In [None]:
#Plot F_i for different Lambda for Lazy Greedy
# plt.figure(figsize=(9,6))
# for l_val in Fi_dict.keys():
#     plt.plot(T_arr, Fi_dict[l_val], label='Lambda={:.3f}'.format(l_val))

# # Plot the max values
# plt.plot(TMaxArr, FMaxArr, '--*', label='Max F_i')

# title_text = 'Lazy Greedy Performance by varying Lambda (Bibsonomy_2015)'
# plt.title(title_text, fontsize=12)
# plt.xlabel('Workload Threshold, T_i')
# plt.ylabel('F_i')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# max_threshold_arr = [5,10,40,80,100,150,200]
# rev_rt_arr, reg_rt_arr = [],[]
# for thresh in max_threshold_arr:
#     FreelancerTest = TFP.TeamFormationProblem(t[0:200], e[0:200], max_workload_threshold=thresh)
#     rev_rt, reg_rt = FreelancerTest.compare_Methods()
#     rev_rt_arr.append(rev_rt)
#     reg_rt_arr.append(reg_rt)

#Plot Runtimes
# plt.figure(figsize=(9,6))
# plt.plot(max_threshold_arr, rev_rt_arr, label='Reverse Threshold Runtime')
# plt.plot(max_threshold_arr, reg_rt_arr, label='Regular Lazy Runtime')

# title_text = 'Reverse Threshold vs. Regular Lazy runtimes'
# plt.title(title_text, fontsize=11)
# plt.xlabel('Max Threshold, T_i')
# plt.ylabel('Runtime, s')
# plt.legend(loc='lower right')
# plt.show()

#FreelancerTest.compute_reverseThreshold()
#FreelancerTest.compareTest_Lazy_Stochastic_Assignments()

In [None]:
def getCovFraction(f, maxL, lambdaVal, numTasks):
    """Convert an objective value back into a coverage fraction.

    Computes ``(f + maxL) / lambdaVal`` and divides the result by ``numTasks``.
    This is the algebraic inverse of the objective printed elsewhere in this
    notebook, ``lambda * coverage - max_load``, expressed per task.

    f: objective value.
    maxL: maximum load that was subtracted in the objective.
    lambdaVal: lambda used in the objective (must be non-zero).
    numTasks: number of tasks (must be non-zero).
    """
    return ((f + maxL) / lambdaVal) / numTasks

#Algorithm performance on all datasets
# Hard-coded results table; every list is aligned by index with 'datasets'.
# 'lambdaVals' and 'Tasks' give the lambda and task count passed to getCovFraction for each dataset.
# Each algorithm entry is a list of 2-tuples: element [0] is passed to getCovFraction as f (objective)
# and element [1] as maxL (maximum load).
# NOTE(review): the provenance of these numbers is not recorded here -- presumably copied from earlier runs; confirm.
perf = {'datasets':['IMDB-15', 'IMDB-18', 'IMDB-20', 'Bibs-10', 'Bibs-15', 'Bibs-20', 'Freelancer', 'Guru'],
'lambdaVals':[0.05, 0.05, 0.1, 0.1, 0.05, 1, 0.1, 0.1], 
'Tasks':[18109, 13183, 7858, 21981, 9061, 834, 993, 3195],
'ThresholdGreedy':[(885,7), (643,8), (771,7), (2039,70), (389,27), (438,41), (88,6), (311,4)],
'TaskGreedy':[(475, 362), (339,264), (644,118), (1319,243), (96, 126), (402,93), (63,36), (225,30)],
'NoUpdateGreedy':[(720, 150), (448,200), (650,100), (1097,200), (129, 250), (408,94), (25,50), (17,33)],
'LPSetCover':[(777, 100), (474,122), (676,87), (1691,282), (336, 55), (418,84), (59,32), (287,25)]}

In [None]:
# Print one " & "-separated row per dataset: (objective, max load, coverage fraction) for
# ThresholdGreedy, LPSetCover, TaskGreedy and NoUpdateGreedy, in that column order.
for i, dataset in enumerate(perf['datasets']):
    lambdaVal_dataset = perf['lambdaVals'][i]
    numTasks_dataset = perf['Tasks'][i]
    print("Coverage fraction for dataset: {}".format(dataset))

    # Recorded (objective, max load) pairs for this dataset
    thresholdGreedyPair = perf['ThresholdGreedy'][i]
    taskGreedyPair = perf['TaskGreedy'][i]
    noUpdateGreedyPair = perf['NoUpdateGreedy'][i]
    lpSetCoverPair = perf['LPSetCover'][i]

    # Coverage fraction recovered from each pair
    thresholdGreedyCov = getCovFraction(thresholdGreedyPair[0], thresholdGreedyPair[1], lambdaVal_dataset, numTasks_dataset)
    taskGreedyCov = getCovFraction(taskGreedyPair[0], taskGreedyPair[1], lambdaVal_dataset, numTasks_dataset)
    noUpdateGreedyCov = getCovFraction(noUpdateGreedyPair[0], noUpdateGreedyPair[1], lambdaVal_dataset, numTasks_dataset)
    lpSetCoverCov = getCovFraction(lpSetCoverPair[0], lpSetCoverPair[1], lambdaVal_dataset, numTasks_dataset)

    # Assemble the row cell by cell in the printed column order
    row_cells = []
    for algo_pair, algo_cov in ((thresholdGreedyPair, thresholdGreedyCov),
                                (lpSetCoverPair, lpSetCoverCov),
                                (taskGreedyPair, taskGreedyCov),
                                (noUpdateGreedyPair, noUpdateGreedyCov)):
        row_cells.append("{}".format(algo_pair[0]))
        row_cells.append("{}".format(algo_pair[1]))
        row_cells.append("{:.2f}".format(algo_cov))
    print(" & ".join(row_cells))
