In [1]:
## Test Team Formation framework with Freelancer Data
## Balancing Task Coverage vs. Maximum Expert Load
## Karan Vombatkere, Dec 2021

#Imports
import pandas as pd
import matplotlib.pyplot as plt
import json, time
import TeamFormationProblem as TFP

#### Simple synthetic Data

In [None]:
#Preliminary Testing: small hand-made instance with 8 tasks and 4 experts over skills s1..s5
#Each inner list is the set of skills required by one task
task_list = [
    ['s1', 's2', 's4', 's5'],
    ['s3', 's2'],
    ['s1', 's3', 's5'],
    ['s2', 's4'],
    ['s1', 's4', 's5'],
    ['s3'],
    ['s1', 's5'],
    ['s2', 's4'],
]
#Each inner list is the set of skills held by one expert
expert_list = [
    ['s2', 's3'],
    ['s2', 's4'],
    ['s3'],
    ['s5', 's4'],
]

teamFormationTest = TFP.TeamFormationProblem(
    task_list,
    expert_list,
    max_workload_threshold=6,
)
#teamFormationTest.computeTaskAssigment(lazy_eval=False)

In [None]:
#Run the task assignment on the synthetic instance with only the 'task_greedy' baseline and plotting disabled
#NOTE(review): rtdict presumably holds per-algorithm runtimes (cf. how the IMDB/Bibsonomy runners index the return value) -- confirm against TeamFormationProblem
rtdict = teamFormationTest.computeTaskAssigment(baselines=['task_greedy'], plot_flag=False)

In [None]:
#teamFormationTest.compareTest_Lazy_Stochastic_Assignments()
#t,f, r = teamFormationTest.compute_reverseThreshold()
#reverse_runtime, regular_runtime = teamFormationTest.compare_Methods()

#### Freelancer Dataset

In [None]:
#Import Freelancer data
#Freelancer from DropBox link: https://www.dropbox.com/sh/8zpsi1etvvvvj5k/AAD-J9ZQmSsbnSmEILBMD9uxa/datasets/real?dl=0&subfolder_nav_tracking=1
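#Files referenced there: freelance_experts.csv and freelance_projects.csv
#(note: the default paths in importFreelancerData below use the 'freelancer_' prefix instead)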

def extract_skills(row):
    """Return the positions in `row` whose value equals 1, as strings.

    Args:
        row: any iterable of values (e.g. one row of a 0/1 indicator table).

    Returns:
        List of str: the zero-based position of every entry that compares
        equal to 1, in iteration order. Empty list if there are none.
    """
    return [str(position) for position, flag in enumerate(row) if flag == 1]

    
def importFreelancerData(experts_filename='datasets/freelancer/freelancer_experts.csv', tasks_filename='datasets/freelancer/freelancer_projects.csv'):
    """Load the Freelancer tasks and experts as lists of skill lists.

    Both CSV files are read without a header row; every row is converted with
    extract_skills, so each skill is the string position of a column whose
    value equals 1 in that row.

    Args:
        experts_filename: path to the experts CSV.
        tasks_filename: path to the tasks (projects) CSV.

    Returns:
        (task_skills_list, expert_skills_list) -- tasks first, then experts.
    """
    def _skills_per_row(csv_path, skills_column):
        #Read the headerless CSV and store each row's skill list in skills_column
        frame = pd.read_csv(csv_path, header=None)
        frame[skills_column] = frame.apply(extract_skills, axis=1)
        return frame[skills_column].to_list()

    #Tasks are read before experts, so a missing tasks file is reported first
    task_skills_list = _skills_per_row(tasks_filename, 'Task_Skills')
    expert_skills_list = _skills_per_row(experts_filename, 'Expert_Skills')

    num_experts = len(expert_skills_list)
    num_tasks = len(task_skills_list)
    print("Imported Freelancer dataset. Num Experts={}, Num Tasks={}".format(num_experts, num_tasks))

    return task_skills_list, expert_skills_list
    

In [None]:
#Load the Freelancer tasks (t) and experts (e) from the default CSV paths,
#then build a problem instance with max_workload_threshold=50
t,e = importFreelancerData()
FreelancerTest = TFP.TeamFormationProblem(t, e, max_workload_threshold=50)

In [None]:
#Run the task assignment on the Freelancer instance with the 'random' and 'no_update_greedy' baselines, plotting enabled
rtimes = FreelancerTest.computeTaskAssigment(baselines=['random','no_update_greedy'], plot_flag=True)

#### IMDB Datasets

In [2]:
#Import IMDB Data
def importIMDBData(experts_filename, tasks_filename):
    """Load the IMDB expert and task skill lists from two JSON files.

    Args:
        experts_filename: path to a JSON file with the experts' skills.
        tasks_filename: path to a JSON file with the tasks' skills.

    Returns:
        (task_skills_list, expert_skills_list) -- tasks first, then experts.
    """
    #Experts file is read first, then the tasks file
    with open(experts_filename, 'r') as experts_fh:
        expert_skills_list = json.load(experts_fh)

    with open(tasks_filename, 'r') as tasks_fh:
        task_skills_list = json.load(tasks_fh)

    summary = "Imported IMDB dataset. Num Experts={}, Num Tasks={}".format(
        len(expert_skills_list), len(task_skills_list)
    )
    print(summary)

    return (task_skills_list, expert_skills_list)

#Run algorithm on IMDB datasets
def testIMDBDatasets():
    """Run the team-formation algorithms on the IMDB datasets and log the runtimes.

    For each year in movieYears, loads 'imdb_experts_<year>.txt' and
    'imdb_tasks_<year>.txt' from datasets/imdb/, builds a TeamFormationProblem
    on the first 100 tasks and first 100 experts, runs computeTaskAssigment
    with the 'random', 'no_update_greedy' and 'task_greedy' baselines, and
    appends a summary to 'experiments/imdb_<timestamp>.txt'.

    The value returned by computeTaskAssigment is indexed with the keys
    'total', 'lazyGreedy', 'noUpdateGreedy', 'taskGreedy' and 'random'.

    Returns:
        None
    """
    imdb_data_path = 'datasets/imdb/'
    movieYears = [2015, 2018, 2020]

    #NOTE(review): the timestamp contains ':' which is not allowed in Windows file names
    runTimeStamp = str(time.strftime("%m-%d-%H:%M:%S", time.localtime(time.time())))
    imdb_outfilename = "experiments/imdb_" + runTimeStamp + ".txt"

    #Context manager closes the results file even if loading or running a dataset raises
    with open(imdb_outfilename, "a") as outfile_imdb:
        outfile_imdb.write("IMDB dataset Team-Formation Algorithms: {}\n".format(runTimeStamp))

        for y in movieYears:
            experts_file = imdb_data_path + 'imdb_experts_' + str(y) + '.txt'
            tasks_file = imdb_data_path + 'imdb_tasks_' + str(y) + '.txt'
            print("IMDB Dataset: {}, {}".format('imdb_experts_' + str(y), 'imdb_tasks_' + str(y)))

            imdb_tasks, imdb_experts = importIMDBData(experts_file, tasks_file)
            #Only the first 100 tasks and first 100 experts are used
            IMDBTest = TFP.TeamFormationProblem(imdb_tasks[0:100], imdb_experts[0:100])

            rt_dict = IMDBTest.computeTaskAssigment(baselines=['random', 'no_update_greedy', 'task_greedy'], plot_flag=False)

            #Write output to file
            runInfo = "\nIMDB movieYear = {}, Experts = {}, Tasks = {}".format(str(y), str(IMDBTest.n), str(IMDBTest.m))
            outfile_imdb.write(runInfo)

            #Adjacent literals instead of a backslash continuation, so source indentation
            #is no longer embedded in the written line
            runtimeInfo = (
                "\nAlgorithm Runtimes: Total = {:.3f}s; Lazy Greedy = {:.3f}s; No-Update-Greedy = {:.3f}s; "
                "Task Greedy = {:.3f}s; Random = {:.3f}s;\n"
            ).format(rt_dict['total'], rt_dict['lazyGreedy'], rt_dict['noUpdateGreedy'], rt_dict['taskGreedy'], rt_dict['random'])
            outfile_imdb.write(runtimeInfo)

    return None
    

In [3]:
#Run the IMDB experiments for all configured years; results are appended to a file under experiments/
testIMDBDatasets()

2022-04-10 23:23:01,222 |INFO: ------------Team Formation Problem initialized with 100 tasks and 100 experts---------
2022-04-10 23:23:01,260 |INFO: Pre-Computed Lambda value = 0.24
2022-04-10 23:23:01,261 |INFO: --------------------------Computing Greedy Task Assignment (Lazy Eval)------------------------------------
2022-04-10 23:23:01,290 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=1, F_i=13.537
2022-04-10 23:23:01,294 |INFO: Computed Baseline Random Task Assignment for T_i=1, F_i = 2.998
2022-04-10 23:23:01,312 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=1, F_i = 1.360
2022-04-10 23:23:01,343 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=2, F_i=18.670
2022-04-10 23:23:01,351 |INFO: Computed Baseline Random Task Assignment for T_i=2, F_i = 5.587
2022-04-10 23:23:01,373 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=2, F_i = 2.720
2022-04-10 23:23:01,407 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=3, 

IMDB Dataset: imdb_experts_2015, imdb_tasks_2015
Imported IMDB dataset. Num Experts=5551, Num Tasks=18109


2022-04-10 23:23:01,438 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=3, F_i = 3.640
2022-04-10 23:23:01,471 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=4, F_i=19.672
2022-04-10 23:23:01,485 |INFO: Computed Baseline Random Task Assignment for T_i=4, F_i = 9.576
2022-04-10 23:23:01,508 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=4, F_i = 5.040
2022-04-10 23:23:01,508 |INFO: Baseline Task Greedy Task Assignment
2022-04-10 23:23:01,560 |INFO: Baseline Task Greedy F_i = 0.000
2022-04-10 23:23:01,560 |INFO: Best Task Assignment is for max workload threshold: 3, F_i(max)=19.971 

2022-04-10 23:23:01,561 |INFO: 
Algorithm Runtimes: Total = 0.338s; Lazy Greedy = 0.129s; No-Update-Greedy = 0.081s; Task Greedy = 0.052s; Random = 0.036s;            

2022-04-10 23:23:01,612 |INFO: ------------Team Formation Problem initialized with 100 tasks and 100 experts---------
2022-04-10 23:23:01,644 |INFO: Pre-Computed Lambda value = 0.30526315789473

IMDB Dataset: imdb_experts_2018, imdb_tasks_2018
Imported IMDB dataset. Num Experts=3871, Num Tasks=13183


2022-04-10 23:23:01,799 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=3, F_i=25.482
2022-04-10 23:23:01,810 |INFO: Computed Baseline Random Task Assignment for T_i=3, F_i = 11.994
2022-04-10 23:23:01,837 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=3, F_i = 6.049
2022-04-10 23:23:01,870 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=4, F_i=24.855
2022-04-10 23:23:01,884 |INFO: Computed Baseline Random Task Assignment for T_i=4, F_i = 13.659
2022-04-10 23:23:01,913 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=4, F_i = 7.516
2022-04-10 23:23:01,914 |INFO: Baseline Task Greedy Task Assignment
2022-04-10 23:23:01,962 |INFO: Baseline Task Greedy F_i = -2.000
2022-04-10 23:23:01,962 |INFO: Best Task Assignment is for max workload threshold: 3, F_i(max)=25.482 

2022-04-10 23:23:01,962 |INFO: 
Algorithm Runtimes: Total = 0.349s; Lazy Greedy = 0.129s; No-Update-Greedy = 0.106s; Task Greedy = 0.048s; Random = 0.034s;            

IMDB Dataset: imdb_experts_2020, imdb_tasks_2020
Imported IMDB dataset. Num Experts=2176, Num Tasks=7858


2022-04-10 23:23:02,192 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=4, F_i=19.925
2022-04-10 23:23:02,206 |INFO: Computed Baseline Random Task Assignment for T_i=4, F_i = 8.363
2022-04-10 23:23:02,231 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=4, F_i = 4.726
2022-04-10 23:23:02,232 |INFO: Baseline Task Greedy Task Assignment
2022-04-10 23:23:02,279 |INFO: Baseline Task Greedy F_i = 0.000
2022-04-10 23:23:02,280 |INFO: Best Task Assignment is for max workload threshold: 3, F_i(max)=20.420 

2022-04-10 23:23:02,280 |INFO: 
Algorithm Runtimes: Total = 0.314s; Lazy Greedy = 0.116s; No-Update-Greedy = 0.086s; Task Greedy = 0.048s; Random = 0.033s;            



#### Bibsonomy Datasets

In [4]:
#Import Bibsonomy datasets
def importBibsonomyData(experts_filename, tasks_filename):
    """Read the Bibsonomy expert and task skill lists from JSON files.

    Args:
        experts_filename: path to a JSON file with the experts' skills.
        tasks_filename: path to a JSON file with the tasks' skills.

    Returns:
        (task_skills_list, expert_skills_list) -- tasks first, then experts.
    """
    def _read_json(path):
        #Parse the whole file as a single JSON document
        with open(path, 'r') as handle:
            return json.load(handle)

    #Experts are loaded before tasks
    expert_skills_list = _read_json(experts_filename)
    task_skills_list = _read_json(tasks_filename)

    n_experts, n_tasks = len(expert_skills_list), len(task_skills_list)
    print("Imported Bibsonomy dataset. Num Experts={}, Num Tasks={}".format(n_experts, n_tasks))

    return task_skills_list, expert_skills_list

#Run algorithm on Bibsonomy datasets
def testBibsonomyDatasets():
    """Run the team-formation algorithms on the Bibsonomy datasets and log the runtimes.

    For each year in paperYears, loads 'bibsonomy_experts_<year>.txt' and
    'bibsonomy_tasks_<year>.txt' from datasets/bibsonomy/, builds a
    TeamFormationProblem on the first 100 tasks and first 200 experts, runs
    computeTaskAssigment with the 'random', 'no_update_greedy' and
    'task_greedy' baselines, and appends a summary to
    'experiments/bibsonomy_<timestamp>.txt'.

    The value returned by computeTaskAssigment is indexed with the keys
    'total', 'lazyGreedy', 'noUpdateGreedy', 'taskGreedy' and 'random'.

    Returns:
        None
    """
    #Locals named after the Bibsonomy data (previously copy-pasted IMDB names)
    bibsonomy_data_path = 'datasets/bibsonomy/'
    paperYears = [2010, 2015, 2020]

    #NOTE(review): the timestamp contains ':' which is not allowed in Windows file names
    runTimeStamp = str(time.strftime("%m-%d-%H:%M:%S", time.localtime(time.time())))
    bibs_outfilename = "experiments/bibsonomy_" + runTimeStamp + ".txt"

    #Context manager closes the results file even if loading or running a dataset raises
    with open(bibs_outfilename, "a") as outfile_bibsonomy:
        outfile_bibsonomy.write("Bibsonomy dataset Team-Formation Algorithms: {}\n".format(runTimeStamp))

        for y in paperYears:
            experts_file = bibsonomy_data_path + 'bibsonomy_experts_' + str(y) + '.txt'
            tasks_file = bibsonomy_data_path + 'bibsonomy_tasks_' + str(y) + '.txt'
            print("\nBibsonomy Dataset: {}, {}".format('bibsonomy_experts_' + str(y), 'bibsonomy_tasks_' + str(y)))

            bib_tasks, bib_experts = importBibsonomyData(experts_file, tasks_file)
            #Only the first 100 tasks and first 200 experts are used
            BibsonomyTest = TFP.TeamFormationProblem(bib_tasks[0:100], bib_experts[0:200])

            rt_dict = BibsonomyTest.computeTaskAssigment(baselines=['random', 'no_update_greedy', 'task_greedy'], plot_flag=False)

            #Write output to file
            runInfo = "\nBibsonomy paperYear = {}, Experts = {}, Tasks = {}".format(str(y), str(BibsonomyTest.n), str(BibsonomyTest.m))
            outfile_bibsonomy.write(runInfo)

            #Adjacent literals instead of a backslash continuation, so source indentation
            #is no longer embedded in the written line
            runtimeInfo = (
                "\nAlgorithm Runtimes: Total = {:.3f}s; Lazy Greedy = {:.3f}s; No-Update-Greedy = {:.3f}s; "
                "Task Greedy = {:.3f}s; Random = {:.3f}s;\n"
            ).format(rt_dict['total'], rt_dict['lazyGreedy'], rt_dict['noUpdateGreedy'], rt_dict['taskGreedy'], rt_dict['random'])
            outfile_bibsonomy.write(runtimeInfo)

    return None


In [5]:
#Run the Bibsonomy experiments for all configured years; results are appended to a file under experiments/
testBibsonomyDatasets()

2022-04-10 23:23:09,634 |INFO: ------------Team Formation Problem initialized with 100 tasks and 200 experts---------
2022-04-10 23:23:09,688 |INFO: Pre-Computed Lambda value = 0.38434837488166623
2022-04-10 23:23:09,688 |INFO: --------------------------Computing Greedy Task Assignment (Lazy Eval)------------------------------------
2022-04-10 23:23:09,735 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=1, F_i=17.658
2022-04-10 23:23:09,741 |INFO: Computed Baseline Random Task Assignment for T_i=1, F_i = 2.826
2022-04-10 23:23:09,781 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=1, F_i = 9.621
2022-04-10 23:23:09,831 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=2, F_i=20.343



Bibsonomy Dataset: bibsonomy_experts_2010, bibsonomy_tasks_2010
Imported Bibsonomy dataset. Num Experts=3044, Num Tasks=21981


2022-04-10 23:23:09,843 |INFO: Computed Baseline Random Task Assignment for T_i=2, F_i = 4.946
2022-04-10 23:23:09,922 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=2, F_i = 11.817
2022-04-10 23:23:09,973 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=3, F_i=20.734
2022-04-10 23:23:09,993 |INFO: Computed Baseline Random Task Assignment for T_i=3, F_i = 6.078
2022-04-10 23:23:10,077 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=3, F_i = 13.224
2022-04-10 23:23:10,125 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=4, F_i=20.365
2022-04-10 23:23:10,153 |INFO: Computed Baseline Random Task Assignment for T_i=4, F_i = 5.677
2022-04-10 23:23:10,250 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=4, F_i = 13.896
2022-04-10 23:23:10,251 |INFO: Baseline Task Greedy Task Assignment
2022-04-10 23:23:10,451 |INFO: Baseline Task Greedy F_i = 1.000
2022-04-10 23:23:10,452 |INFO: Best Task Assignment is for max workload


Bibsonomy Dataset: bibsonomy_experts_2015, bibsonomy_tasks_2015
Imported Bibsonomy dataset. Num Experts=1904, Num Tasks=9061


2022-04-10 23:23:10,697 |INFO: Computed Baseline Random Task Assignment for T_i=2, F_i = -0.106
2022-04-10 23:23:10,771 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=2, F_i = 4.696
2022-04-10 23:23:10,815 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=3, F_i=7.447
2022-04-10 23:23:10,834 |INFO: Computed Baseline Random Task Assignment for T_i=3, F_i = -0.574
2022-04-10 23:23:10,913 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=3, F_i = 5.302
2022-04-10 23:23:10,913 |INFO: Baseline Task Greedy Task Assignment
2022-04-10 23:23:11,094 |INFO: Baseline Task Greedy F_i = 0.000
2022-04-10 23:23:11,095 |INFO: Best Task Assignment is for max workload threshold: 2, F_i(max)=7.940 

2022-04-10 23:23:11,095 |INFO: 
Algorithm Runtimes: Total = 0.582s; Lazy Greedy = 0.131s; No-Update-Greedy = 0.190s; Task Greedy = 0.182s; Random = 0.035s;            

2022-04-10 23:23:11,097 |INFO: ------------Team Formation Problem initialized with 100 tasks and 1


Bibsonomy Dataset: bibsonomy_experts_2020, bibsonomy_tasks_2020
Imported Bibsonomy dataset. Num Experts=177, Num Tasks=834


2022-04-10 23:23:11,325 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=2, F_i = 6.591
2022-04-10 23:23:11,353 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=3, F_i=8.733
2022-04-10 23:23:11,367 |INFO: Computed Baseline Random Task Assignment for T_i=3, F_i = -0.947
2022-04-10 23:23:11,438 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=3, F_i = 6.886
2022-04-10 23:23:11,465 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=4, F_i=8.848
2022-04-10 23:23:11,484 |INFO: Computed Baseline Random Task Assignment for T_i=4, F_i = -1.835
2022-04-10 23:23:11,559 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=4, F_i = 6.816
2022-04-10 23:23:11,587 |INFO: Computed Greedy Task Assignment (Lazy Eval) for T_i=5, F_i=8.615
2022-04-10 23:23:11,611 |INFO: Computed Baseline Random Task Assignment for T_i=5, F_i = -2.882
2022-04-10 23:23:11,694 |INFO: Computed Baseline No-Update Greedy Task Assignment for T_i=5, F_i = 6.682
2022

In [None]:
# max_threshold_arr = [5,10,40,80,100,150,200]
# rev_rt_arr, reg_rt_arr = [],[]
# for thresh in max_threshold_arr:
#     FreelancerTest = TFP.TeamFormationProblem(t[0:200], e[0:200], max_workload_threshold=thresh)
#     rev_rt, reg_rt = FreelancerTest.compare_Methods()
#     rev_rt_arr.append(rev_rt)
#     reg_rt_arr.append(reg_rt)

#Plot Runtimes
# plt.figure(figsize=(9,6))
# plt.plot(max_threshold_arr, rev_rt_arr, label='Reverse Threshold Runtime')
# plt.plot(max_threshold_arr, reg_rt_arr, label='Regular Lazy Runtime')

# title_text = 'Reverse Threshold vs. Regular Lazy runtimes'
# plt.title(title_text, fontsize=11)
# plt.xlabel('Max Threshold, T_i')
# plt.ylabel('Runtime, s')
# plt.legend(loc='lower right')
# plt.show()

#FreelancerTest.compute_reverseThreshold()
#FreelancerTest.compareTest_Lazy_Stochastic_Assignments()