# Dataset Construction: RQ1 and RQ2

This notebook constructs the datasets used in the analyses for research questions 1 and 2. To see the actual statistical analyses, see the R scripts.

First load the PWC provided data. "evaluation-tables.json" and "papers-with-abstracts.json" were downloaded from [PWC's Github](https://github.com/paperswithcode/paperswithcode-data) on 06/16/2021.

In [None]:
import json
import os

import pandas as pd

# Root of the Dataset_Curation checkout. Every relative path used below is
# resolved against it. Set the DATASET_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATASET_PATH=os.environ.get("DATASET_PATH","/home/bkoch/Documents/GitHub/Life_of_a_Benchmark/Dataset_Curation")

# `%cd DATASET_PATH` would look for a directory literally called "DATASET_PATH"
# (the magic does not expand bare Python names), so change directory explicitly.
os.chdir(DATASET_PATH)
print(os.getcwd())

# Paper records (later cells read their 'title', 'tasks', 'date', 'url_pdf' and 'paper_url' fields).
with open('./PWC_Data/papers-with-abstracts.json') as f:
    pwc=json.load(f)
# Task -> datasets -> SOTA rows hierarchy used to build the task ontology.
with open('./PWC_Data/evaluation-tables.json') as f:
    benchmark_tables=json.load(f)

### Construct Task Ontology

This block parses "evaluation-tables.json" to create a task ontology. Benchmarks in this file are organized by task and subtask.

In [None]:
all_rows=[] #used to construct benchmark_papers df
parent_child_dict={} #dictionary capturing parent-child task relations used to create task_relations df
child_parent_dict={} #inverse of above
task_category_dict={} #captures task category relations: categories in PWC are larger domains like "NLP","CV","Methodology"
paper_titles_tasks={} #paper titles to tasks
paper_titles_parent_tasks={} #paper titles to parent tasks
dataset_associated_tasks={} # tasks associated with the dataset


def _register_task(task_name, categories):
    """Make sure `task_name` has an entry in every task-level lookup."""
    parent_child_dict.setdefault(task_name,[])
    child_parent_dict.setdefault(task_name,[])
    task_category_dict[task_name]=categories


def _collect_benchmark_rows(dataset_entry, task_fields, task_name, parent_name):
    """Flatten one dataset entry into `all_rows` (one row per SOTA row x metric).

    `task_fields` holds the task-level columns copied onto every row.
    `task_name` / `parent_name` are also recorded against each paper title.
    """
    dataset_dict=dict(task_fields)
    dataset_dict['dataset']=dataset_entry['dataset']
    dataset_dict['dataset_citations']=dataset_entry['dataset_citations']
    dataset_dict['dataset_links']=dataset_entry['dataset_links']
    dataset_dict['dataset_subdatasets']=dataset_entry['subdatasets']
    dataset_dict['task']=task_name
    for row in dataset_entry['sota']['rows']:
        for m in row['metrics']:
            row_dict=dict(row)
            row_dict['metrics']=m
            row_dict['score']=row['metrics'][m]
            row_dict.update(dataset_dict)
            all_rows.append(row_dict)
        paper_titles_tasks.setdefault(row['paper_title'],[]).append(task_name)
        paper_titles_parent_tasks.setdefault(row['paper_title'],[]).append(parent_name)


def _is_only_category(categories, category):
    """True when `category` is the single category attached to a task.

    Top-level tasks store their categories as a list, while subtasks store
    them as a comma-joined string; both representations are accepted so the
    flag is computed the same way for tasks and subtasks.
    """
    if isinstance(categories,str):
        categories=[c for c in categories.split(',') if c]
    return len(categories)==1 and categories[0]==category


for task in benchmark_tables:
    # Task-level columns shared by every benchmark row of this task.
    # A top-level task is recorded as its own parent.
    task_dict={'task':task['task'],
               'task_categories':task['categories'],
               'task_description':task['description'],
               'parent_task':task['task']}
    _register_task(task['task'],task['categories'])

    # Reset per task so a task without datasets cannot inherit the previous task's dataset.
    parent_dataset=None
    for d in task['datasets']:
        dataset_associated_tasks.setdefault(d['dataset'],[]).append(task['task'])
        parent_dataset=d['dataset'] # variations of datasets can be listed as "children"
        _collect_benchmark_rows(d,task_dict,task['task'],task['task'])

    #tasks can have subtasks. This is not tree. A subtask could have multiple parents or be a parent itself.
    for t in task['subtasks']:
        _register_task(t['task'],t['categories'])
        parent_child_dict[task['task']].append(t['task'])
        child_parent_dict[t['task']].append(task['task'])

        #Each subtask is its own row also
        task_dict={'parent_task':task['task'],
                   'task':t['task'],
                   'task_categories':','.join(t['categories']),
                   'task_description':t['description']}

        for d in t['datasets']:
            dataset_associated_tasks.setdefault(d['dataset'],[]).append(t['task'])
            if parent_dataset is not None:
                dataset_associated_tasks[parent_dataset].append(t['task'])
            _collect_benchmark_rows(d,task_dict,t['task'],task['task'])

#contains all relevant info at papers used in benchmarks
benchmark_papers=pd.DataFrame(all_rows)
benchmark_papers=benchmark_papers.drop_duplicates(['model_name','dataset','task','metrics','score'])
benchmark_papers['benchmark_id']=benchmark_papers.groupby(['task','dataset','metrics']).ngroup()
benchmark_papers=benchmark_papers[['paper_title','paper_date','task','parent_task','dataset','score','benchmark_id','metrics','task_categories']]
benchmark_papers=benchmark_papers.drop_duplicates(['paper_title','paper_date','task','parent_task','dataset','score','benchmark_id','metrics'])
# Flags for benchmarks whose task belongs exclusively to one domain.
benchmark_papers['CV']=benchmark_papers['task_categories'].apply(lambda c: _is_only_category(c,'Computer Vision'))
benchmark_papers['NLP']=benchmark_papers['task_categories'].apply(lambda c: _is_only_category(c,'Natural Language Processing'))
print(benchmark_papers.shape)

#constructs sibling relationships from all tasks,
#Note that because it is not a tree, sibling relations do not correspond exactly to parent child
sibling_dict={}
for k,parents in child_parent_dict.items():
    pooled=[]
    for p in parents:
        pooled+=parent_child_dict[p]
    # A task appears among its own siblings because it is a child of its own parent.
    sibling_dict[k]=list(set(pooled))

# Create the ontology of task relations
task_relations=pd.DataFrame({'categories':task_category_dict,'parents':child_parent_dict,'children':parent_child_dict,'siblings':sibling_dict})
task_relations.index=task_relations.index.rename('task')
task_relations=task_relations.reset_index()

#Create a df labeling papers with tasks and parent tasks
paper_relations=pd.DataFrame({'all_tasks':paper_titles_tasks,'all_parent_tasks':paper_titles_parent_tasks})
paper_relations.index=paper_relations.index.rename('title')
paper_relations=paper_relations.reset_index()
task_relations.to_csv('task_relations.tsv',sep='\t',quoting=1)

print("Number of papers used to construct task ontology: ",benchmark_papers['paper_title'].drop_duplicates().shape)
print("Number of tasks: ", task_relations.shape[0])
print("Mean number of parents/children for a task: ", task_relations['parents'].apply(len).mean())
print("Mean number of siblings for a task: ", task_relations['siblings'].apply(len).mean())
#At least in the benchmark relations, it appears that PWC does have a tree like structure

benchmark_papers.to_json('./PWC_Data/Derivative_Datasets/benchmarks_with_datasets.json')

This block goes through papers with abstracts, constructs a DF with unique task-papers (not unique papers) and annotates task relations from previous block.

Furthermore, we add our additional manual annotations for papers that introduced datasets and were either not in papers-with-abstracts.json or had no tasks labeled.

In [None]:
#These are all columns in the df for each paper
title=[]
pdf_url=[]
paper_url=[]
date=[]
task=[]
all_tasks=[]
all_parents=[]
all_children=[]
all_categories=[]
all_siblings=[]

#These are used later but they note the number of papers per task and the first appearance of a paper with a task
task_hist={}
task_age={}


def _split_tasks(value):
    """Parse a comma-separated 'Proposed Tasks' cell; blank (NaN) cells give []."""
    if not isinstance(value,str): return []
    return [j.strip() for j in value.split(',')]


def _pool_task_relations(tasks):
    """Pool (parents, categories, children, siblings) over ALL of `tasks`, de-duplicated."""
    pooled=[]
    for lookup in (child_parent_dict,task_category_dict,parent_child_dict,sibling_dict):
        merged=[]
        for t in tasks:
            if t in lookup: merged.extend(lookup[t])
        pooled.append(list(set(merged)))
    return pooled


def _add_task_paper_rows(paper_title,paper_pdf,paper_link,paper_date,paper_tasks):
    """Append one row per task of a paper and update the per-task count / first-year tallies."""
    parents,categories,children,siblings=_pool_task_relations(paper_tasks)
    year=pd.to_datetime(paper_date).year
    for t in paper_tasks:
        #just keep up with task ages and counts
        task_hist[t]=task_hist.get(t,0)+1
        task_age[t]=year if t not in task_age else min(task_age[t],year)

        #add a row for each task-paper
        title.append(paper_title)
        pdf_url.append(paper_pdf)
        paper_url.append(paper_link)
        date.append(paper_date)
        task.append(t)
        all_tasks.append(paper_tasks)
        all_parents.append(list(parents))
        all_categories.append(list(categories))
        all_children.append(list(children))
        all_siblings.append(list(siblings))


#These are the datset-introducing papers that two authors manually annotated.
#Note that only papers with the "Justification" column are those we both reviewed.
sheet_id = '1Y3DDI6ySi9A6l3ZMET29EWSxBr8Uw-kvKn8RF2zYpzQ'
sheet_name = 'untasked_datasets'
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
manual_task_labels=pd.read_csv(url)
manual_tasks_not_labeled=manual_task_labels[manual_task_labels.Justification.isnull()]
manual_task_labels=manual_task_labels[~manual_task_labels.Justification.isnull()]
print("Number of manually annotated dataset-introducing papers: ",manual_task_labels.shape[0])

# Stripped sheet title -> manually proposed tasks. Built once so each paper is a dict
# lookup; the first sheet row wins when a title is repeated.
manual_tasks_by_title={}
for sheet_title,proposed in zip(manual_task_labels['title'],manual_task_labels['Proposed Tasks']):
    if isinstance(sheet_title,str):
        manual_tasks_by_title.setdefault(sheet_title.strip(),_split_tasks(proposed))

for i in pwc:
    paper_tasks=i['tasks']
    if i['title'] in manual_tasks_by_title:
        #in addition to any tasks that were already there (although shouldn't be any), add the manually annotated ones
        # A new list is built instead of extending i['tasks'], so re-running this cell does not duplicate tasks.
        paper_tasks=paper_tasks+manual_tasks_by_title[i['title']]
    if len(paper_tasks)==0: continue
    _add_task_paper_rows(i['title'],i['url_pdf'],i['paper_url'],i['date'],paper_tasks)

#There are some mannually annotated dataset introducing papers that are not in papers-with-abstracts.json
seen_titles=set(title)
for _,row in manual_task_labels.iterrows():
    raw_title=row['title']
    clean_title=raw_title.strip() if isinstance(raw_title,str) else raw_title
    if raw_title in seen_titles or clean_title in seen_titles: continue
    tasks=_split_tasks(row['Proposed Tasks'])
    if not tasks: continue
    for t in tasks:
        if t not in task_category_dict: print("TASK NOT FOUND",t)
    _add_task_paper_rows(raw_title,None,row['paper_url'],row['introduced_date'],tasks)
    seen_titles.add(raw_title)

pwc_papers=pd.DataFrame({'title':title,'pdf_url':pdf_url,'paper_url':paper_url,'date':date,'task':task,
                         'all_tasks':all_tasks,'all_parents':all_parents,'all_children':all_children,
                         'all_siblings':all_siblings,'all_categories':all_categories,})

print("Total PWC papers in papers with abstracts that have tasks: ",pwc_papers['title'].drop_duplicates().shape)
task_hist=pd.Series(task_hist)
#task_hist=task_hist[task_hist>task_hist.quantile(.3)]
task_age=pd.Series(task_age)
task_age.name='task_age'

pwc_papers.to_json('./PWC_Data/Derivative_Datasets/pwc_papers.json')

# Here we merge with task relations from the benchmarks
pwc_papers_task=pd.merge(task_relations,pwc_papers,on='task')
# Papers lost = unique titles before the merge minus unique titles that survive it.
print("PWC papers lost because none of it's tasks are in the benchmarks dataset: ",
      pwc_papers['title'].drop_duplicates().shape[0]-pwc_papers_task['title'].drop_duplicates().shape[0])

In [None]:
# Debug aid: show the Google Sheets CSV export URL that the manual task labels were read from.
print(url)

Third step is to load and clean the datasets from the PWC file datasets.json

In [None]:

#This is available from PWC
# NOTE(review): absolute path outside DATASET_PATH — confirm the location before re-running elsewhere.
with open('/home/bkoch/Projects/DataProject/analyses/04-PWC/PWC_2021_06_16/datasets.json') as f:
    datasets=pd.DataFrame(json.load(f))

#Basic cleaning
datasets['title']=datasets['paper'].apply(lambda js: js['title'] if js is not None else None)
datasets['paper_url']=datasets['paper'].apply(lambda js: js['url'] if js is not None else None)
datasets['introduced_date']=pd.to_datetime(datasets['introduced_date'])
datasets['Texts']=datasets['modalities'].apply(lambda r: 'Texts' in r)
datasets['Images']=datasets['modalities'].apply(lambda r: 'Images' in r)
datasets['dataset_tasks']=datasets['tasks'].apply(lambda js: [ j['task'] for j in js])

#Again add tasks from manually labeled dataset-papers
# Built from the column directly (no assignment into a slice of manual_task_labels).
# Blank cells become []; when a dataset name repeats, the last sheet row wins.
manual_dict=(manual_task_labels.set_index('name')['Proposed Tasks']
             .apply(lambda x: [j.strip() for j in x.split(',')] if isinstance(x,str) else [])
             .to_dict())
datasets['dataset_tasks']=[own+manual_dict[name] if name in manual_dict else own
                           for name,own in zip(datasets['name'],datasets['dataset_tasks'])]
datasets=datasets.drop(['tasks','paper'],axis=1)


def _gather_relations(tasks,lookup):
    """Concatenate lookup[t] for every task in `tasks` found in `lookup` (order kept, duplicates kept)."""
    gathered=[]
    for t in tasks:
        if t in lookup: gathered.extend(lookup[t])
    return gathered


#add all task relations to datasets
all_parents=[_gather_relations(ts,child_parent_dict) for ts in datasets['dataset_tasks']]
all_categories=[_gather_relations(ts,task_category_dict) for ts in datasets['dataset_tasks']]
all_children=[_gather_relations(ts,parent_child_dict) for ts in datasets['dataset_tasks']]
all_siblings=[_gather_relations(ts,sibling_dict) for ts in datasets['dataset_tasks']]
datasets['dataset_tasks_parents']=all_parents
datasets['dataset_tasks_categories']=all_categories
datasets['dataset_tasks_children']=all_children
datasets['dataset_tasks_siblings']=all_siblings
datasets.to_json('./dataset_with_tasks.json')
print("Total number of datasets in PWC: ",datasets['name'].drop_duplicates().shape[0])

# This is one dataset I noticed that is totally messed up. Going to drop it.
# (It is dropped after dataset_with_tasks.json has been written, so that file still contains it.)
datasets=datasets[datasets.name!='PRID2011']

#datasets_pwc_total=pd.merge(datasets,pwc_papers_task,on=['title','paper_url'],how='left')
#datasets_pwc_total.to_json('./DatawithTasks/datasets_total.json')

# Link each dataset to the task-paper rows of its introducing paper by exact title match.
datasets_pwc=pd.merge(datasets.drop('paper_url',axis=1),pwc_papers_task.drop('paper_url',axis=1),on=['title'])
#datasets_pwc['all_tasks']=datasets_pwc.apply(lambda row: list(set(row['all_tasks']+row['dataset_tasks'])) if row['name'] in manual_dict else row['all_tasks'],axis=1)

print("Datasets affiliated with a paper in PWC: ",datasets_pwc['name'].drop_duplicates().shape[0])
datasets_pwc.to_json('./PWC_Data/Derivative_Datasets/datasets_pwc.json')

4. Load the dataset-citing papers. These were scraped with an API in the script Get_Dataset_Citing_Papers.ipynb. Here we again link to papers-with-abstracts.json in order to get task information. We lose a significant number of papers doing this, but it is hard to say whether these papers are real usages or just keyword hits anyway. It is possible that manual annotation could recover some of these papers.

In [None]:
# Dataset usages (citing papers), read from a tab-separated file.
dataset_citing_papers=pd.read_csv('./PWC_Data/Derivative_Datasets/datasets_citing_papers.txt',sep='\t')
n_harvested_titles=dataset_citing_papers['title'].drop_duplicates().shape[0]
print("Dataset citing papers harvested from PWC internal API:",n_harvested_titles)

# Keep only usages whose paper matches a task-labelled PWC paper on both title and date.
dataset_citing_papers_pwc=dataset_citing_papers.merge(pwc_papers_task,on=['title','date'])
dataset_citing_papers_pwc['date']=pd.to_datetime(dataset_citing_papers_pwc['date'])
n_tasked_titles=dataset_citing_papers_pwc['title'].drop_duplicates().shape[0]
print("Dataset citing papers that are actually labeled wth tasks: ",n_tasked_titles)
dataset_citing_papers_pwc.to_json('./PWC_Data/Derivative_Datasets/datasets_citing_papers_pwc.json')


In [None]:
# Usages of datasets whose sheet row has a Justification (kept in manual_task_labels)...
recovered_mask=dataset_citing_papers_pwc.name.isin(manual_task_labels.name)
# ...versus usages of datasets whose sheet row has no Justification (manual_tasks_not_labeled).
unreviewed_mask=dataset_citing_papers_pwc.name.isin(manual_tasks_not_labeled.name)
n_usages=dataset_citing_papers_pwc.shape[0]
n_recovered=int(recovered_mask.sum())
n_unreviewed=int(unreviewed_mask.sum())

print("Number of usages recovered through manual annotation:", n_recovered)
# This count previously reused the "recovered" label although it covers the unreviewed sheet rows.
print("Number of usages of datasets without a reviewed manual annotation:", n_unreviewed)
# The value is a fraction in [0, 1]; guard against an empty usage table.
print("Fraction of total usages we're dropping: ", n_unreviewed/n_usages if n_usages else float('nan'))

# Curating Transfer Datasets

Now we're going to combine our four datasets to do dataset transfers. This block merges each citing task-paper with each dataset's task paper. There are three different lists we keep track of:
1. The birth of datasets
2. Datasets used by others within the same task
3. Datasets used by papers in a task other than the one in which the dataset was introduced (transfers)

Note this is pretty inefficient for the sake of clarity. I would only run this block once and then reload results in the next block

In [None]:
# Now create infomap inputs
# Pair every dataset (with the task rows of its introducing "origin" paper) with every
# task-labelled paper that uses the dataset.
dataset_citing_papers_origins=pd.merge(datasets_pwc,dataset_citing_papers_pwc,on='name')
#I realize this is incredibly stupid, I had done it a more elegant way but this works
dataset_citing_papers_origins=dataset_citing_papers_origins.rename({'title_x':'origin_title',
                                     'all_tasks_x':'origin_tasks',
                                     'all_parents_x':'origin_parents',
                                     'all_siblings_x':'origin_siblings',
                                     'all_children_x':'origin_children'},axis=1)
# Strip only the trailing merge suffix; str.replace('_y','') would also mangle any
# column that merely contains "_y" somewhere in its name.
dataset_citing_papers_origins.columns=[c[:-2] if c.endswith('_y') else c for c in dataset_citing_papers_origins.columns]
# A paper does not count as a usage of the dataset it introduced.
dataset_citing_papers_origins=dataset_citing_papers_origins[dataset_citing_papers_origins.title!=dataset_citing_papers_origins.origin_title]

task_contains_images={}
task_contains_texts={}

#external adoptions to siblings and parents
sources=[]
destinations=[]
ds_names=[]
paper_titles=[]
ds_texts=[]
ds_images=[]
dest_dates=[]
parent_transfer=[]

#adoption of homegrown datasets
home_tasks=[]
home_names=[]
home_titles=[]
home_dates=[]
home_texts=[]
home_images=[]
home_parent=[]
home_introduced=[]

#newly created dataset within a task
introduced_names=[]
introduced_dates=[]
introduced_tasks=[]
introduced_texts=[]
introduced_images=[]
introduced_parent=[]
introduced_title=[]


def _mark_modalities(task_name,row):
    """Add the dataset's Images/Texts flags to the per-task modality tallies.

    Each tally receives its own flag; the increments used to be crossed, so the
    'Images' tally actually counted text datasets and vice versa.
    """
    task_contains_images[task_name]=task_contains_images.get(task_name,0)+row['Images']
    task_contains_texts[task_name]=task_contains_texts.get(task_name,0)+row['Texts']


def _record_birth(task_name,row,via_parent):
    """Record that the dataset in `row` was introduced within `task_name`."""
    introduced_names.append(row['name'])
    introduced_dates.append(row['introduced_date'])
    introduced_tasks.append(task_name)
    introduced_images.append(row['Images'])
    introduced_texts.append(row['Texts'])
    introduced_title.append(row['origin_title'])
    introduced_parent.append(via_parent)
    _mark_modalities(task_name,row)


def _record_homegrown(task_name,row,via_parent):
    """Record a usage of the dataset by a paper in the task that introduced it."""
    home_tasks.append(task_name)
    home_names.append(row['name'])
    home_titles.append(row['title'])
    home_dates.append(row['date'])
    home_images.append(row['Images'])
    home_texts.append(row['Texts'])
    home_parent.append(via_parent)
    home_introduced.append(row['introduced_date'])
    _mark_modalities(task_name,row)


def _record_transfer(source_task,dest_task,row,via_parent):
    """Record a usage of the dataset by a paper in a task other than its origin."""
    sources.append(source_task)
    destinations.append(dest_task)
    ds_names.append(row['name'])
    paper_titles.append(row['title'])
    dest_dates.append(row['date'])
    ds_texts.append(row['Texts'])
    ds_images.append(row['Images'])
    parent_transfer.append(via_parent)
    _mark_modalities(source_task,row)
    _mark_modalities(dest_task,row)


#the task can only be one that the dataset, not the dataset paper has been labeled with.
# Built once per dataset name (first row wins) instead of filtering `datasets` for every usage row.
valid_tasks_by_name={}
for ds_name,own,children,siblings in zip(datasets['name'],datasets['dataset_tasks'],
                                          datasets['dataset_tasks_children'],datasets['dataset_tasks_siblings']):
    if ds_name not in valid_tasks_by_name:
        valid_tasks_by_name[ds_name]=set(own+children+siblings)

for _,row in dataset_citing_papers_origins.iterrows():
    valid_tasks=valid_tasks_by_name[row['name']]
    for t in row['origin_tasks']:
        if t not in valid_tasks: continue

        _record_birth(t,row,False)
        #Your parent gets credit for introducing datasets as well!
        for d in row['origin_parents']:
            if d not in valid_tasks: continue
            _record_birth(d,row,True)

        #who are you passing it to?
        for d in row['all_tasks']:
            if d not in valid_tasks: continue
            #if d not in focal_tasks: continue
            #Scenario 1: Dest is sources parent
            if d in row['origin_parents']: continue
            #Scenario 2: Dest is sources child or another origins child:
            if d in row['origin_children']: continue
            #Scenario 3: Dest is source. (First confirm its not another origin)
            if d in row['origin_tasks'] and t!=d: continue
            #you've found yourself. Add to home task
            if t==d:
                _record_homegrown(t,row,False)
                #your parents have also homgrown a task
                for parent in row['origin_parents']:
                    if parent not in valid_tasks: continue
                    _record_homegrown(parent,row,True)
            #you haven't found yourself and this is a transfer
            else:
                #A. pass directly to this tasks (t and d were both checked against valid_tasks above)
                _record_transfer(t,d,row,False)
                #B1. Source has no parents but dest does
                for pt in row['origin_parents']:
                    if pt not in valid_tasks: continue
                    if t==pt: continue #cant transfer to yourself (this only occurs with self-loops)
                    for pdest in row['all_parents']:
                        if pdest not in valid_tasks: continue
                        #cant transfer to yourself to your own parent or children
                        #this is a simplification because in reality, you can transfer to your children but we lack the resolution to resolve that
                        if t==pdest or pdest in row['origin_parents'] or pdest in row['origin_children']: continue
                        _record_transfer(pt,pdest,row,True)

source_dest_edgelist=pd.DataFrame({'source_task':sources,'dest_task':destinations,'name':ds_names,'title':paper_titles,'date':dest_dates,'Images':ds_images,'Texts':ds_texts,'Parent_Transfer':parent_transfer}).drop_duplicates()
homegrown_edgelist=pd.DataFrame({'task':home_tasks,'name':home_names,'title':home_titles,'date':home_dates,'Images':home_images,'Texts':home_texts,'Parent':home_parent,'introduced_date':home_introduced}).drop_duplicates()
birth_edgelist=pd.DataFrame({'task':introduced_tasks,'name':introduced_names,'title':introduced_title,'date':introduced_dates,'Images':introduced_images,'Texts':introduced_texts,'Parent':introduced_parent}).drop_duplicates()
# Collapse the tallies to 0/1 indicators: does the task involve any image / text dataset?
task_contains_images=pd.Series(task_contains_images)
task_contains_images[task_contains_images>0]=1
task_contains_texts=pd.Series(task_contains_texts)
task_contains_texts[task_contains_texts>0]=1
task_contains_images=task_contains_images.reset_index()
task_contains_images.columns=['task','Images']
task_contains_texts=task_contains_texts.reset_index()
task_contains_texts.columns=['task','Texts']
source_dest_edgelist.to_csv('source_dest_edgelist.csv',quoting=1)
# NOTE(review): this file is written without a ".csv" extension, unlike its two siblings.
# The name is kept because other scripts may read it — confirm before renaming.
homegrown_edgelist.to_csv('homegrown_edgelist',quoting=1)
birth_edgelist.to_csv('birth_edgelist.csv',quoting=1)


# RQ 3
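Because the block above is so slow, you can uncomment the first three lines of the next cell to reload its saved results instead of recomputing them. The cell then assembles the combined paper-dataset table and saves it for the affiliation analysis, which is performed elsewhere.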

In [None]:
# Optional reload of the edgelists saved by the (slow) curation cell. The files were written
# with their index, hence index_col=0; the homegrown file was saved without an extension.
#source_dest_edgelist=pd.read_csv('source_dest_edgelist.csv',index_col=0)
#homegrown_edgelist=pd.read_csv('homegrown_edgelist',index_col=0)
#birth_edgelist=pd.read_csv('birth_edgelist.csv',index_col=0)

# Dataset-introducing papers get no date here, which is how the usage counts below
# (filtered on a non-null date) tell them apart from dataset-using papers.
# .assign/.copy return new frames, so nothing is written into a slice of another frame.
dataset_papers=datasets_pwc[['name','title']].assign(date=None)
dest_papers=source_dest_edgelist[['name','title','date']].copy()
birth_papers=birth_edgelist[['name','title','date']].copy()
homegrown_papers=homegrown_edgelist[['name','title','date']].copy()
full_dataset=pd.concat([dataset_papers,dest_papers,birth_papers,homegrown_papers]).drop_duplicates()
used_rows=full_dataset[~full_dataset.date.isnull()]
print("Number of datasets born and then used at least once: ",birth_edgelist['name'].drop_duplicates().shape[0])
print("Total number of unique usages within introducing paper task: ", homegrown_edgelist[['name','title']].drop_duplicates().shape[0])
print("Total number of unique usages from outside paper introducing task: ", source_dest_edgelist[['name','title']].drop_duplicates().shape[0])
print("Total number of dataset-usages: ", used_rows[['name','title']].drop_duplicates().shape[0])
print("Total number of dataset-using papers: ", used_rows[['title']].drop_duplicates().shape[0])
print("Total number of datasets: ", full_dataset[['name']].drop_duplicates().shape[0])
print("Total number of tasks involved: ",pd.concat([birth_edgelist['task'],homegrown_edgelist['task'],
           source_dest_edgelist['source_task'],source_dest_edgelist['dest_task']]).drop_duplicates().shape[0])
full_dataset.to_csv('./PWC_Data/Derivative_Datasets/ValidPaperDataset-Titles.txt')


These two figures appear in the appendix as summaries of the dataset...

In [None]:
%matplotlib inline
temp=full_dataset.groupby('name').size().sort_values()
dataset_usage_dist=temp[(temp>5)&(temp<500)].plot(kind='hist',bins=500,figsize=[8,4],title='Truncated Distribution of Dataset Usages')
fig = dataset_usage_dist.get_figure()
fig.savefig('truncated_dist.png')


In [None]:
# pwc_papers is built without a 'year' column, so derive it from 'date' here rather than
# relying on leftover kernel state. Dates are parsed one at a time, as in the task-age
# bookkeeping, so rows with differing date formats do not break the parse.
papers_with_year=pwc_papers.assign(year=pwc_papers['date'].apply(lambda d: pd.to_datetime(d).year))
# NOTE(review): pwc_papers has one row per task-paper, so this counts task-paper rows per
# year rather than unique papers — confirm that this is the intended quantity.
annual_size=papers_with_year.groupby('year').size().reset_index()
temp=annual_size.plot(figsize=[8,4],x='year',y=0,title='PWC Corpus papers per year',xlim=[2009,2020],legend=False)
fig = temp.get_figure()
# Number of unique paper titles in the corpus (displayed as the cell output).
pwc_papers.title.drop_duplicates().shape

In [None]:
# Attach the MAG linking records to the dataset-introducing ("birth") papers
# by exact match between PWC_Title and the paper title.
MAGIDs=pd.read_json('MAG_Linking_IDs.json')
#MAGIDs=MAGIDs[['MAGID','PWC_Clean_Title','PWC_Title']]
unique_birth_papers=birth_papers.drop_duplicates()
dataset_papers_MAG=MAGIDs.merge(unique_birth_papers,left_on='PWC_Title',right_on='title')


We were interested in tasks labeled under the Methodology category, so we removed the tasks listed below, which were anomalous compared to the other methodology tasks.

In [None]:
# Tasks to exclude from the Methodology analysis.
# NOTE(review): 'Electrocardiography (ECG)' used to be listed twice; the duplicate was
# removed. Confirm that the second entry was not meant to be a different task.
Methodologies_to_Drop=[
'Word Embeddings',
'Anomaly Detection',
'Multivariate Time Series Forecasting',
'EEG',
'Chatbot',
'Computed Tomography (CT)',
'Electrocardiography (ECG)',
'Multi-Label Text Classification'
]

In [None]:
# `source_dest` is not defined anywhere in this notebook and raises NameError on a fresh
# kernel; preview the transfer edgelist that the following cells work with instead.
source_dest_edgelist.head()

# RQ2: Creating ratio datasets

Below are three largely similar blocks that create ratio datasets:
    
1. Calculates ratios for transfers between parent tasks, aggregated across all years (Figure 2)
2. Calculates ratios for transfers between parent tasks, disaggregated by year (Figure 1)
3. Calculates ratios for transfers between all tasks, disaggregated by year (not shown)

We do not use 3 because it double counts transfers.

In [None]:
# Parent tasks: every task that has at least one child task in the PWC ontology.
# The original cell replaced this list straight away with `median_parent_tasks`, a name that (in the
# visible notebook) is only assigned at the bottom of this same cell. On a fresh kernel that raises
# NameError, and on a re-run it filters by the previous run's output, so the result shrank on every
# execution. The override is removed so the cell is idempotent.
parent_tasks=[i for i in parent_child_dict if len(parent_child_dict[i])!=0]

# Restrict the three edgelists to parent tasks; a transfer is kept only if both endpoints are parent tasks.
source_dest_edgelist_parents=source_dest_edgelist[source_dest_edgelist.source_task.isin(parent_tasks) &
                                                  source_dest_edgelist.dest_task.isin(parent_tasks)]
homegrown_edgelist_parents=homegrown_edgelist[homegrown_edgelist.task.isin(parent_tasks)]
birth_edgelist_parents=birth_edgelist[birth_edgelist.task.isin(parent_tasks)]

# Summary statistics of the restricted edgelists.
print("Number of datasets born and then used at least once: ",birth_edgelist_parents['name'].drop_duplicates().shape[0])
print("Total number of unique usages within introducing paper task: ", homegrown_edgelist_parents[['name','title']].drop_duplicates().shape[0])
print("Total number of unique usages from outside paper introducing task: ", source_dest_edgelist_parents[['name','title']].drop_duplicates().shape[0])
print("Total number of parent tasks involved: ",pd.concat([birth_edgelist_parents['task'],homegrown_edgelist_parents['task'],
           source_dest_edgelist_parents['source_task'],source_dest_edgelist_parents['dest_task']]).drop_duplicates().shape[0])
# The transfer titles were concatenated twice in the original; once is enough before drop_duplicates().
print("Total number of papers involved: ",pd.concat([birth_edgelist_parents['title'],homegrown_edgelist_parents['title'],
           source_dest_edgelist_parents['title']]).drop_duplicates().shape[0])

# Per-task counts, aggregated across all years.
num_papers_adopting=source_dest_edgelist_parents.groupby(['dest_task']).size() #counts the number of adopting papers within tasks
num_papers_adopting.index.names=['task']
num_papers_growing=homegrown_edgelist_parents.groupby(['task']).size() #counts the number of papers that use a "homegrown" dataset within tasks
num_dataset_births=birth_edgelist_parents.groupby(['task']).size() #number of datasets created within a task that are used at least once
# Distinct (destination task, dataset) pairs, then the number of distinct imported datasets per task.
temp=source_dest_edgelist_parents.groupby(['dest_task','name']).size().reset_index().drop(0,axis=1)
num_dataset_imports=temp.groupby(['dest_task']).size()
num_dataset_imports.index.names=['task']
# The Series names become the column names after reset_index() in the merges below.
# (The original also assigned `.columns` on num_dataset_imports; on a Series that has no effect.)
num_dataset_births.name='num_dataset_births'
num_papers_adopting.name='num_papers_adopting'
num_dataset_imports.name='num_dataset_imports'
num_papers_growing.name='num_papers_growing'

# Outer-merge on the shared 'task' column so a task missing from one count still gets a row (filled with 0).
full_data=pd.merge(num_dataset_births.reset_index(),num_papers_adopting.reset_index(),how='outer')
full_data=pd.merge(full_data,num_dataset_imports.reset_index(),how='outer')
full_data=pd.merge(full_data,num_papers_growing.reset_index(),how='outer')
full_data=full_data.fillna(0)
full_data['size']=full_data.num_papers_adopting+full_data.num_papers_growing+full_data.num_dataset_births
task_age_df=task_age.reset_index()
task_age_df.columns=['task','task_age']
full_data=pd.merge(full_data,task_age_df,on='task',how='left')

# Ratios divide by counts that were filled with 0 above, so they can come out as inf or NaN.
full_data['adoption_ratio']=full_data.num_papers_adopting.divide(full_data.num_papers_growing)
full_data['creation_ratio']=full_data.num_dataset_births.divide(full_data.num_dataset_imports)
#same as above but adding the numerator to the denominator to add stability
full_data['adoption_pct']=full_data.num_papers_adopting.divide(full_data.num_papers_adopting+full_data.num_papers_growing)
full_data['creation_pct']=full_data.num_dataset_births.divide(full_data.num_dataset_births+full_data.num_dataset_imports)

def in_category(x,cat):
    """Return 1 if task `x`, or any parent recorded for it, is in PWC category `cat`; otherwise 0.

    Reads the notebook-level dictionaries `task_category_dict` (task -> categories)
    and `child_parent_dict` (task -> parent tasks).

    The original guarded the parent lookup with `x in parent_child_dict` but then
    indexed `child_parent_dict[x]`. A task present only in `child_parent_dict`
    never had its parents checked, and a task present only in `parent_child_dict`
    raised KeyError. Lookups now use `.get`, so unknown tasks and parents without
    a category entry simply do not match.
    """
    if cat in task_category_dict.get(x,()):
        return 1
    if any(cat in task_category_dict.get(p,()) for p in child_parent_dict.get(x,())):
        return 1
    return 0
# Category indicator columns (1/0) for each task.
full_data['CV']=full_data['task'].apply(lambda x: in_category(x,'Computer Vision'))
full_data['NLP']=full_data['task'].apply(lambda x: in_category(x,'Natural Language Processing'))
full_data['Methodology']=full_data['task'].apply(lambda x: in_category(x,'Methodology'))
# Reset the Methodology flag for the tasks in Methodologies_to_Drop. The original applied
# `x in Methodologies_to_Drop` to the 0/1 flag itself rather than to the task name, so it never matched.
full_data.loc[full_data['task'].isin(Methodologies_to_Drop),'Methodology']=0

# Keep only the tasks whose size is strictly above the median size.
median_parent_task_size=full_data['size'].median()
full_data=full_data[full_data['size']>median_parent_task_size]
print("Median parent task size: ",median_parent_task_size)
print("Number of tasks for final analysis (Figure 2): ",full_data['task'].unique().shape[0])
# The retained task names are reused by later cells and also written to disk.
median_parent_tasks=full_data.task
median_parent_tasks.to_csv('median_parent_tasks')

full_data.to_csv("./PWC_Data/Derivative_Datasets/FullDatasetforR.ParentsOnly.AllYears.txt",sep='\t',quoting=1)


This block imposes no restrictions on tasks but if we're not using only parent transfers, that means there can be double counts...
It's not used in the paper but the results are similar (CHECK TO MAKE SURE)

In [None]:
# Per-(task, year) counts over all tasks (no parent-task restriction).
num_papers_adopting=source_dest_edgelist.groupby(['dest_task',source_dest_edgelist.date.dt.year]).size()
num_papers_adopting.index.names=['task','date']
num_papers_growing=homegrown_edgelist.groupby(['task',homegrown_edgelist.date.dt.year]).size()
num_dataset_births=birth_edgelist.groupby(['task', birth_edgelist.date.dt.year]).size()
# Running total of births up to the previous row, computed within each task. The original
# shifted and cumulated the whole Series at once, so totals leaked from one task into the next.
num_homegrown_datasets=num_dataset_births.groupby(level=0).shift(1).groupby(level=0).cumsum()
# Distinct (destination task, dataset, year) triples; the year key lands in a column named 'date'.
temp=source_dest_edgelist.groupby(['dest_task','name',source_dest_edgelist.date.dt.year]).size().reset_index().drop(0,axis=1)
# Group on temp's own 'date' column. The original passed source_dest_edgelist.date.dt.year here, a
# Series indexed like the edgelist, which does not line up with temp's reset index.
num_dataset_imports=temp.groupby(['dest_task','date']).size()
# Homegrown usages dated in the dataset's introduction year or the year after.
num_converted_growing=homegrown_edgelist[(homegrown_edgelist.introduced_date.dt.year==homegrown_edgelist.date.dt.year)|\
                                         (homegrown_edgelist.introduced_date.dt.year==homegrown_edgelist.date.dt.year-1)]
num_converted_growing=num_converted_growing.groupby(['task',num_converted_growing.date.dt.year]).size()

# The Series names become the column names after reset_index() in the merges below.
# (The original also assigned `.columns` on num_dataset_imports; on a Series that has no effect.)
num_dataset_imports.index.names=['task','date']
num_dataset_births.name='num_dataset_births'
num_papers_adopting.name='num_papers_adopting'
num_dataset_imports.name='num_dataset_imports'
num_papers_growing.name='num_papers_growing'
num_homegrown_datasets.name='num_dataset_homegrown'
num_converted_growing.name='num_converted_growing'
# Outer merges on the shared ('task','date') columns; combinations missing from a count are filled with 0.
annual_data=pd.merge(num_dataset_births.reset_index(),num_papers_adopting.reset_index(),how='outer')
annual_data=pd.merge(annual_data,num_homegrown_datasets.reset_index(),how='outer')
annual_data=pd.merge(annual_data,num_dataset_imports.reset_index(),how='outer')
annual_data=pd.merge(annual_data,num_papers_growing.reset_index(),how='outer')
annual_data=pd.merge(annual_data,num_converted_growing.reset_index(),how='outer')
annual_data=annual_data.fillna(0)
annual_data['size']=annual_data.num_papers_adopting+annual_data.num_papers_growing+annual_data.num_dataset_births
task_age_df=task_age.reset_index()
task_age_df.columns=['task','task_age']
annual_data=pd.merge(annual_data,task_age_df,on='task',how='left')
# Ratios divide by counts that were filled with 0 above, so they can come out as inf or NaN.
annual_data['adoption_ratio']=annual_data.num_papers_adopting.divide(annual_data.num_papers_growing)
annual_data['creation_ratio']=annual_data.num_dataset_births.divide(annual_data.num_dataset_imports)
annual_data['adoption_pct']=annual_data.num_papers_adopting.divide(annual_data.num_papers_adopting+annual_data.num_papers_growing)
annual_data['creation_pct']=annual_data.num_dataset_births.divide(annual_data.num_dataset_births+annual_data.num_dataset_imports)
annual_data['conversion_pct']=annual_data.num_converted_growing.divide(annual_data.num_papers_growing)
# Attach the number of PWC papers per year as 'pwc_size', then rename 'date' to 'year'.
pwc_papers['year']=pd.to_datetime(pwc_papers['date']).dt.year
annual_size=pwc_papers.groupby('year').size().reset_index()
annual_data=pd.merge(annual_data,annual_size,left_on='date',right_on='year',how='left')
annual_data=annual_data.drop('year',axis=1).rename({0:'pwc_size'},axis=1)
annual_data.rename({'date':'year'},axis=1,inplace=True)

def in_category(x,cat):
    """Return 1 if task `x`, or any parent recorded for it, is in PWC category `cat`; otherwise 0.

    Reads the notebook-level dictionaries `task_category_dict` (task -> categories)
    and `child_parent_dict` (task -> parent tasks).

    The original guarded the parent lookup with `x in parent_child_dict` but then
    indexed `child_parent_dict[x]`. A task present only in `child_parent_dict`
    never had its parents checked, and a task present only in `parent_child_dict`
    raised KeyError. Lookups now use `.get`, so unknown tasks and parents without
    a category entry simply do not match.
    """
    if cat in task_category_dict.get(x,()):
        return 1
    if any(cat in task_category_dict.get(p,()) for p in child_parent_dict.get(x,())):
        return 1
    return 0
# Category indicator columns (1/0) for each task.
annual_data['CV']=annual_data['task'].apply(lambda x: in_category(x,'Computer Vision'))
annual_data['NLP']=annual_data['task'].apply(lambda x: in_category(x,'Natural Language Processing'))
annual_data['Methodology']=annual_data['task'].apply(lambda x: in_category(x,'Methodology'))
# Reset the Methodology flag for the tasks in Methodologies_to_Drop. The original applied
# `x in Methodologies_to_Drop` to the 0/1 flag itself rather than to the task name, so it never matched.
annual_data.loc[annual_data['task'].isin(Methodologies_to_Drop),'Methodology']=0
# Keep only rows whose year is not earlier than the task's task_age value.
annual_data=annual_data[annual_data.year>=annual_data.task_age]
annual_data.to_csv("./PWC_Data/Derivative_Datasets/FullDatasetforR.txt",sep='\t',quoting=1)

This block allows you to print out histograms of dataset usages by task. It's not presented in the paper, but may be interesting to readers. It was used to select interesting cases for Figure 4.

In [None]:
#annual_data[['task','date','size','task_age','pwc_size']]
import os

import matplotlib.pyplot as plt
#This is just a subset
methods_tasks=pd.read_csv("MethodTasksfromEmily.txt",header=None).squeeze().tolist()
outpath='/mnt/c/Users/berna/Documents/GoogleDataProject/ImportPlots/'
# Files already present in the tasks output folder (only referenced by the disabled skip-check in the loop below).
completed=os.listdir('/mnt/c/Users/berna/Documents/GoogleDataProject/ImportPlots/tasks')

# Dataset name -> list of modalities, with ['Unknown'] substituted for an empty list.
# .copy() makes the two-column selection an independent frame, so adding the 'modes' column below
# neither triggers SettingWithCopyWarning nor depends on view/copy semantics of `datasets`.
modality_dict=datasets[['name','modalities']].copy()
modality_dict['modes']=modality_dict['modalities'].apply(lambda x: x if x!= [] else ['Unknown'])
modality_dict=modality_dict.drop('modalities',axis=1).set_index('name').squeeze()

# For every retained parent task, write horizontal bar charts of (a) usages per dataset and
# (b) usages per originating task into sub-folders of `outpath`; tasks listed in `methods_tasks`
# additionally get copies in the methods_* folders and a modality chart.
# Borrowed usages are coloured blue, homegrown usages orange.
for p in median_parent_tasks:
    print(p)
    # Make the task name usable as a file name.
    out_name=p.replace('/','_').replace(' ','_')
    # The bare string below is a disabled skip-if-already-plotted check; it has no effect at run time.
    '''
    if out_name+'.jpg' in completed:
        print("COMPLETED")
        continue
    '''
    # Usages per dataset, one count per unique (dataset, paper) pair.
    datasets_borrow=source_dest_edgelist_parents[source_dest_edgelist_parents.dest_task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
    datasets_borrow['source']='blue'
    datasets_homegrown=homegrown_edgelist_parents[homegrown_edgelist_parents.task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
    datasets_homegrown['source']='orange'
    # Column 0 is the unnamed count column produced by size().to_frame().
    dataset_df=pd.concat([ datasets_borrow,datasets_homegrown]).reset_index().sort_values(0,ascending=False)
    # Nothing to plot for this task.
    if len(dataset_df)==0:continue
    dataset_df=dataset_df.rename({0:'count'},axis=1)
    d=dataset_df.plot.barh(x='name',y='count',color=dataset_df['source'],figsize=(20, 20)).figure.savefig(outpath+'datasets/'+out_name+'.jpg')
    # Close the figure before creating the next one.
    plt.clf();plt.close()
    if p in methods_tasks:
            d=dataset_df.plot.barh(x='name',y='count',color=dataset_df['source'],figsize=(20, 20)).figure.savefig(outpath+'methods_datasets/'+out_name+'.jpg')
            plt.clf();plt.close()
    # Usages per originating task: borrowed usages grouped by source task, homegrown usages by the task itself.
    tasks_borrow=source_dest_edgelist_parents[source_dest_edgelist_parents.dest_task==p].drop_duplicates(['name','title']).groupby('source_task').size().sort_values().to_frame()
    tasks_borrow['source']='blue'
    tasks_homegrown=homegrown_edgelist_parents[homegrown_edgelist_parents.task==p].drop_duplicates(['name','title']).groupby('task').size().sort_values().to_frame()
    tasks_homegrown['source']='orange'
    tasks_df=pd.concat([ tasks_borrow,tasks_homegrown]).reset_index().sort_values(0,ascending=False)
    # The rename covers an 'index' column, which reset_index() produces when the index has no name.
    tasks_df=tasks_df.rename({0:'count','index':'task'},axis=1)
    t=tasks_df.plot.barh(x='task',y='count',color=tasks_df['source'],figsize=(20, 20)).figure.savefig(outpath+'tasks/'+out_name+'.jpg')
    plt.clf();plt.close()
    if p in methods_tasks:
            t=tasks_df.plot.barh(x='task',y='count',color=tasks_df['source'],figsize=(20, 20)).figure.savefig(outpath+'methods_tasks/'+out_name+'.jpg')
            plt.clf();plt.close()
    # The modality chart is only produced for tasks listed in methods_tasks.
    if p not in methods_tasks: continue
    modalities=[]
    # Repeat each dataset's modality list once per usage so value_counts() weights modalities by usage.
    # NOTE(review): assumes every dataset name occurs exactly once in modality_dict; confirm.
    for i,row in dataset_df.iterrows(): modalities+=modality_dict[row['name']]*row['count']
    modalities=pd.Series(modalities).value_counts().sort_values(ascending=False)
    m=modalities.plot.barh(figsize=(20, 20)).figure.savefig(outpath+'methods_modalities/'+out_name+'.jpg')
    plt.clf();plt.close()

# Figure 4: Pie charts for Figure 4

The following blocks create the pie charts for figure 4...

In [None]:
p='Image Generation'
# Usages per dataset within task p, one count per unique (dataset, paper) pair.
# Rows from the transfer edgelist are tagged blue, rows from the homegrown edgelist orange.
datasets_borrow=source_dest_edgelist[source_dest_edgelist.dest_task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
datasets_borrow['source']='blue'
datasets_homegrown=homegrown_edgelist[homegrown_edgelist.task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
datasets_homegrown['source']='orange'
dataset_df=pd.concat([ datasets_borrow,datasets_homegrown]).reset_index().sort_values(0,ascending=False)
dataset_df=dataset_df.rename({0:'count'},axis=1)

# Usages per originating task: transfers grouped by source task, homegrown usages by the task itself.
tasks_borrow=source_dest_edgelist[source_dest_edgelist.dest_task==p].drop_duplicates(['name','title']).groupby('source_task').size().sort_values().to_frame()
tasks_borrow['source']='blue'
tasks_homegrown=homegrown_edgelist[homegrown_edgelist.task==p].drop_duplicates(['name','title']).groupby('task').size().sort_values().to_frame()
tasks_homegrown['source']='orange'
tasks_df=pd.concat([ tasks_borrow,tasks_homegrown]).reset_index().sort_values(0,ascending=False)
tasks_df=tasks_df.rename({0:'count','index':'task'},axis=1)

# Collapse the tail into a single gray 'Other' row: rows stay while the cumulative share is below 0.90.
# The original used `>.90` / `<.90`, so a row sitting exactly on the threshold vanished from both
# parts; here the 'Other' bucket is the exact complement of the kept rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset_ranked=dataset_df.sort_values('count',ascending=False)['count']
dataset_df['cumulative']=dataset_ranked.cumsum()/dataset_ranked.sum()
dataset_keep=dataset_df['cumulative']<.90
other_count=dataset_df.loc[~dataset_keep,'count'].sum()
dataset_df=pd.concat([dataset_df[dataset_keep],
                      pd.DataFrame([{'name':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                     ignore_index=True)

tasks_ranked=tasks_df.sort_values('count',ascending=False)['count']
tasks_df['cumulative']=tasks_ranked.cumsum()/tasks_ranked.sum()
tasks_keep=tasks_df['cumulative']<.90
other_count=tasks_df.loc[~tasks_keep,'count'].sum()
tasks_df=pd.concat([tasks_df[tasks_keep],
                    pd.DataFrame([{'task':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                   ignore_index=True)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
# Pie of usages by task, built from tasks_df as left by the preceding cell.
# The pull offsets are positional (fifth slice pulled out); presumably hand-tuned to the slice order.
task_pie = go.Pie(
    labels=tasks_df['task'],
    values=tasks_df['count'],
    pull=[0, 0, 0, 0.0, 0.2, 0, 0],
)
fig = go.Figure(data=[task_pie])
fig.update_layout(font_family="Arial", title_font_family="Arial", font_color='black')
fig.update_traces(
    marker=dict(
        colors=px.colors.qualitative.Set1,
        line=dict(color='#000000', width=1),
    ),
    textfont_color='black',
)
fig.show()
fig.write_image("/mnt/c/Users/berna/Documents/GoogleDataProject/ImportPlots/ImageGenerationTasks.svg")

# Pie of usages by dataset, built from dataset_df (sixth slice pulled out, Set3 palette).
dataset_pie = go.Pie(
    labels=dataset_df['name'],
    values=dataset_df['count'],
    pull=[0, 0, 0, 0.0, 0, 0.2, 0],
)
fig = go.Figure(data=[dataset_pie])
fig.update_layout(font_family="Arial", title_font_family="Arial", font_color='black')
fig.update_traces(
    marker=dict(
        colors=px.colors.qualitative.Set3,
        line=dict(color='#000000', width=1),
    ),
    textfont_color='black',
)
fig.show()
fig.write_image("/mnt/c/Users/berna/Documents/GoogleDataProject/ImportPlots/ImageGenerationDatasets.svg")

In [None]:
p='Face Recognition'
# Usages per dataset within task p, one count per unique (dataset, paper) pair.
# Rows from the transfer edgelist are tagged blue, rows from the homegrown edgelist orange.
datasets_borrow=source_dest_edgelist[source_dest_edgelist.dest_task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
datasets_borrow['source']='blue'
datasets_homegrown=homegrown_edgelist[homegrown_edgelist.task==p].drop_duplicates(['name','title']).groupby('name').size().sort_values().to_frame()
datasets_homegrown['source']='orange'
dataset_df=pd.concat([ datasets_borrow,datasets_homegrown]).reset_index().sort_values(0,ascending=False)
dataset_df=dataset_df.rename({0:'count'},axis=1)

# Usages per originating task: transfers grouped by source task, homegrown usages by the task itself.
tasks_borrow=source_dest_edgelist[source_dest_edgelist.dest_task==p].drop_duplicates(['name','title']).groupby('source_task').size().sort_values().to_frame()
tasks_borrow['source']='blue'
tasks_homegrown=homegrown_edgelist[homegrown_edgelist.task==p].drop_duplicates(['name','title']).groupby('task').size().sort_values().to_frame()
tasks_homegrown['source']='orange'
tasks_df=pd.concat([ tasks_borrow,tasks_homegrown]).reset_index().sort_values(0,ascending=False)
tasks_df=tasks_df.rename({0:'count','index':'task'},axis=1)

# Collapse the tail into a single gray 'Other' row: rows stay while the cumulative share is below 0.85.
# The original used `>.85` / `<.85`, so a row sitting exactly on the threshold vanished from both
# parts; here the 'Other' bucket is the exact complement of the kept rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset_ranked=dataset_df.sort_values('count',ascending=False)['count']
dataset_df['cumulative']=dataset_ranked.cumsum()/dataset_ranked.sum()
dataset_keep=dataset_df['cumulative']<.85
other_count=dataset_df.loc[~dataset_keep,'count'].sum()
dataset_df=pd.concat([dataset_df[dataset_keep],
                      pd.DataFrame([{'name':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                     ignore_index=True)

tasks_ranked=tasks_df.sort_values('count',ascending=False)['count']
tasks_df['cumulative']=tasks_ranked.cumsum()/tasks_ranked.sum()
tasks_keep=tasks_df['cumulative']<.85
other_count=tasks_df.loc[~tasks_keep,'count'].sum()
tasks_df=pd.concat([tasks_df[tasks_keep],
                    pd.DataFrame([{'task':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                   ignore_index=True)

In [None]:
import plotly.graph_objects as go

# px is needed for the colour palettes below; importing it here keeps the cell from depending
# on an import made in a different cell.
import plotly.express as px

# Pie of usages by task, built from tasks_df (fourth slice pulled out). Shown only, not saved.
fig = go.Figure(data=[go.Pie(labels=tasks_df['task'], 
                             values=tasks_df['count'],
                            pull=[0, 0, 0, 0.2,0,0,0])])
fig.update_layout(
    font_family="Arial",
    title_font_family="Arial",
    font_color='black',
)
fig.update_traces(marker=dict(colors=px.colors.qualitative.Set1,line=dict(color='#000000', width=1)),textfont_color='black')
fig.show()

# Copy of the Set1 palette with the two pulled-out slices (positions 3 and 4) set to white.
# The original wrote 'rgb(255,255,,255)' for position 3, which is not a valid colour string.
temp=list(px.colors.qualitative.Set1)
temp[3]='rgb(255,255,255)'
temp[4]='rgb(255,255,255)'

# Pie of usages by dataset, built from dataset_df, using the adjusted palette.
fig = go.Figure(data=[go.Pie(labels=dataset_df['name'], 
                             values=dataset_df['count'],
                            pull=[0, 0, 0,.2,.2,0,0])])
fig.update_layout(
    font_family="Arial",
    title_font_family="Arial",
    font_color='black',
)
fig.update_traces(marker=dict(colors=temp,line=dict(color='#000000', width=1)),textfont_color='black')
fig.show()
fig.write_image("/mnt/c/Users/berna/Documents/GoogleDataProject/ImportPlots/FaceRecognitionDatasets.svg")

In [None]:
%matplotlib inline
# Recompute the cumulative share on the current tasks_df and fold everything from the 0.99 mark onward
# into a gray 'Other' row. The 'Other' bucket is the exact complement of the kept rows (the original
# `>.99` / `<.99` pair dropped a row sitting exactly on the threshold), and pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): if tasks_df already holds an 'Other' row from an earlier cell, a second one is added; confirm intent.
tasks_ranked=tasks_df.sort_values('count',ascending=False)['count']
tasks_df['cumulative']=tasks_ranked.cumsum()/tasks_ranked.sum()
tasks_keep=tasks_df['cumulative']<.99
other_count=tasks_df.loc[~tasks_keep,'count'].sum()
tasks_df=pd.concat([tasks_df[tasks_keep],
                    pd.DataFrame([{'task':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                   ignore_index=True)

In [None]:
# Recompute the cumulative share on the current dataset_df and fold everything from the 0.99 mark onward
# into a gray 'Other' row. The 'Other' bucket is the exact complement of the kept rows (the original
# `>.99` / `<.99` pair dropped a row sitting exactly on the threshold), and pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): if dataset_df already holds an 'Other' row from an earlier cell, a second one is added; confirm intent.
dataset_ranked=dataset_df.sort_values('count',ascending=False)['count']
dataset_df['cumulative']=dataset_ranked.cumsum()/dataset_ranked.sum()
dataset_keep=dataset_df['cumulative']<.99
other_count=dataset_df.loc[~dataset_keep,'count'].sum()
dataset_df=pd.concat([dataset_df[dataset_keep],
                      pd.DataFrame([{'name':'Other','count':other_count,'cumulative':1,'source':'gray'}])],
                     ignore_index=True)
dataset_df

In [None]:
import plotly.graph_objects as go

# Show two pies with identical styling: usages by task, then usages by dataset.
# Both use the Set1 palette and pull out the fourth slice; neither is written to disk.
for labels, values in (
    (tasks_df['task'], tasks_df['count']),
    (dataset_df['name'], dataset_df['count']),
):
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0, 0, 0.2, 0, 0, 0])])
    fig.update_layout(font_family="Arial", title_font_family="Arial", font_color='black')
    fig.update_traces(
        marker=dict(
            colors=px.colors.qualitative.Set1,
            line=dict(color='#000000', width=1),
        ),
        textfont_color='black',
    )
    fig.show()

# Create Gini Data for RQ 1
This last section of code creates data for the Gini analyses

In [None]:
# Accumulator lists filled by the loop over dataset-citing papers further down in this cell.
dataset_name, paper_title, paper_date, paper_tasks = [], [], [], []
paper_CV, paper_NLP, paper_Methods, paper_parent = [], [], [], []
def in_category(x,cat):
    """Return 1 if task `x`, or any parent recorded for it, is in PWC category `cat`; otherwise 0.

    Reads the notebook-level dictionaries `task_category_dict` (task -> categories)
    and `child_parent_dict` (task -> parent tasks).

    The original guarded the parent lookup with `x in parent_child_dict` but then
    indexed `child_parent_dict[x]`. A task present only in `child_parent_dict`
    never had its parents checked, and a task present only in `parent_child_dict`
    raised KeyError. Lookups now use `.get`, so unknown tasks and parents without
    a category entry simply do not match.
    """
    if cat in task_category_dict.get(x,()):
        return 1
    if any(cat in task_category_dict.get(p,()) for p in child_parent_dict.get(x,())):
        return 1
    return 0

# One record per (task, dataset, paper): walk every unique (dataset, paper) usage and keep the
# paper's tasks and parent tasks that are valid for the dataset.
for i, row in dataset_citing_papers_pwc.drop_duplicates(['name','title']).iterrows():
    #again we're skipping this dataset because there is some wonky labeling in PWC
    if row['name']!='PRID2011':
        #restrict ourselves to valid tasks: the dataset's own tasks plus their children and siblings
        dataset_row=datasets[datasets.name==row['name']]
        valid_tasks=set(dataset_row['dataset_tasks'].iloc[0]+
                        dataset_row['dataset_tasks_children'].iloc[0]+
                        dataset_row['dataset_tasks_siblings'].iloc[0])
        # First the paper's own tasks (parent flag False), then its parent tasks (parent flag True).
        for is_parent,candidate_tasks in ((False,row['all_tasks']),(True,row['all_parents'])):
            for t in candidate_tasks:
                if t in valid_tasks:
                    dataset_name.append(row['name'])
                    paper_title.append(row['title'])
                    paper_date.append(row['date'].year)
                    paper_tasks.append(t)
                    paper_parent.append(is_parent)

# Assemble the long-format table; a task reached both directly and as a parent collapses to one row.
entropy_dataset=pd.DataFrame({
    'task':paper_tasks,
    'name':dataset_name,
    'title':paper_title,
    'date':paper_date,
}).drop_duplicates()
print("Number of tasks: ",entropy_dataset.task.drop_duplicates().shape[0])
print("Number of datasets: ",entropy_dataset.name.drop_duplicates().shape[0])
print("Number of papers: ",entropy_dataset.title.drop_duplicates().shape[0])
entropy_dataset.to_csv('./EntropyDataset.txt')

In [None]:
# Rebuilds valid_tasks (tasks + children + siblings) for a single dataset, looked up in datasets_pwc.
# NOTE(review): `row` is not assigned in this cell; it is presumably whatever the last loop iteration
# of an earlier cell left behind, and `datasets_pwc` is not defined in the visible part of the
# notebook. This looks like a scratch cell; confirm whether it is still needed.
valid_tasks= set(datasets_pwc[datasets_pwc.name==row['name']]['dataset_tasks'].iloc[0]+\
                 datasets_pwc[datasets_pwc.name==row['name']]['dataset_tasks_children'].iloc[0]+\
                 datasets_pwc[datasets_pwc.name==row['name']]['dataset_tasks_siblings'].iloc[0]
                )

In [None]:
# Count the entropy_dataset rows whose dataset name appears in each of the two manual-annotation tables.
# NOTE(review): manual_task_labels and manual_tasks_not_labeled are not defined in the visible part
# of the notebook; presumably they come from an earlier cell. Confirm.
print("Number of usages recovered through manual annotation:", entropy_dataset[entropy_dataset.name.isin(manual_task_labels.name)].shape[0])
print("Number of usages still tossed:",entropy_dataset[entropy_dataset.name.isin(manual_tasks_not_labeled.name)].shape[0])

These are the metrics used in the analyses. Note that we do not report the Pielou evenness, which is an information-theoretic metric, but it performs similarly.

I also experimented with the Simpson index.

In [None]:
import numpy as np
from scipy.stats import entropy
from math import log
#right now I am just saying having 1 or 0 datasets is meaningless
def gini(x):
    """Compute the Gini coefficient of an array of values.

    Sums the absolute difference over every unordered pair of values and
    divides by ``n**2 * mean(x)``.

    Returns None when `x` has fewer than two values.
    """
    # The original placed this text after the first statements, where it was a
    # discarded string expression rather than a docstring.
    if len(x)<2:return None
    x=np.array(x)
    diffsum = 0
    # Each value is compared only with the values after it, so every pair is counted once.
    for i, xi in enumerate(x[:-1], 1):
        diffsum += np.sum(np.abs(xi - x[i:]))
    return diffsum / (len(x)**2 * np.mean(x))

def corrected_gini(x):
    """Compute the Gini coefficient of `x` rescaled by ``n / (n - 1)``.

    The plain coefficient is the sum of absolute differences over every
    unordered pair of values divided by ``n**2 * mean(x)``; the result is then
    multiplied by ``n / (n - 1)``, where ``n = len(x)``.

    Returns None when `x` has fewer than two values.
    """
    # The original carried the uncorrected function's description as a string
    # expression in the middle of the body, so the function had no docstring.
    if len(x)<2:return None
    x=np.array(x)
    diffsum = 0
    # Each value is compared only with the values after it, so every pair is counted once.
    for i, xi in enumerate(x[:-1], 1):
        diffsum += np.sum(np.abs(xi - x[i:]))
    gini = diffsum / (len(x)**2 * np.mean(x))
    return len(x)*gini/(len(x)-1)


def pielou(x):
    """Pielou evenness: base-2 entropy of `x` divided by the base-2 log of its length.

    Returns None when `x` has fewer than two values.
    """
    n_values=len(x)
    if n_values>=2:
        counts=np.array(x)
        return entropy(counts,base=2)/log(n_values,2)
    return None


Below is the entropy for all tasks, with no restrictions...

In [None]:
from skbio.diversity import alpha
# Usages per (task, dataset, year); the 'date' column of entropy_dataset already holds the year.
task_ds_ycounts=entropy_dataset.groupby(['task','name',entropy_dataset.date]).size().reset_index()
task_ds_ycounts.columns=['task', 'name','year','count']
# One concentration/evenness value per (task, year), computed over that year's per-dataset usage counts.
ginis=task_ds_ycounts.groupby(['task','year'])['count'].agg(corrected_gini).reset_index()
ginis.columns=['task','year','gini']
pielous=task_ds_ycounts.groupby(['task','year'])['count'].agg(pielou).reset_index()
pielous.columns=['task','year','pielou']
simpson=task_ds_ycounts.groupby(['task','year'])['count'].agg(alpha.simpson_e).reset_index()
simpson.columns=['task','year','simpson']
size=task_ds_ycounts.groupby(['task','year'])['count'].sum().reset_index()
size.columns=['task','year','task_size']
inequity_years_df=pd.merge(ginis,pielous,on=['task','year'])
inequity_years_df=pd.merge(inequity_years_df,simpson,on=['task','year'],how='left')
inequity_years_df=pd.merge(inequity_years_df,size,on=['task','year'],how='left')
# Bring in task_age and pwc_size from annual_data. The original also selected 'Images' and 'Texts',
# which annual_data never receives (the merges that would add them are commented out), so the
# selection raised KeyError. CV/NLP/Methodology are not merged because they are recomputed just below.
inequity_years_df=pd.merge(inequity_years_df,
                           annual_data[['task','year','task_age','pwc_size']],
                           on=['task','year'], how='left')
inequity_years_df['CV']=inequity_years_df['task'].apply(lambda x: in_category(x,'Computer Vision'))
inequity_years_df['NLP']=inequity_years_df['task'].apply(lambda x: in_category(x,'Natural Language Processing'))
inequity_years_df['Methodology']=inequity_years_df['task'].apply(lambda x: in_category(x,'Methodology'))
# Reset the Methodology flag for the tasks in Methodologies_to_Drop. The original applied
# `x in Methodologies_to_Drop` to the 0/1 flag itself rather than to the task name, so it never matched.
inequity_years_df.loc[inequity_years_df['task'].isin(Methodologies_to_Drop),'Methodology']=0
# NOTE(review): this path contains 'Documents/./EntropyInputs' while the other exports use
# 'Documents/GoogleDataProject/EntropyInputs'; confirm the intended folder.
inequity_years_df.to_csv("/mnt/c/Users/berna/Documents/./EntropyInputs/EntropyDatasetforR.txt",sep='\t',quoting=1)

In [None]:
# Display-only cell: shows the per-(task, year) inequity table.
inequity_years_df

This dataset is only for the tasks that are larger than the median size and are parent tasks

In [None]:
# Summary counts for the subset of entropy_dataset restricted to the retained (above-median) parent tasks.
temp=entropy_dataset[entropy_dataset.task.isin(median_parent_tasks)]
print("Number of tasks (parent-tasks only): ",temp.task.drop_duplicates().shape[0])
print("Number of datasets (parent-tasks only): ",temp.name.drop_duplicates().shape[0])
print("Number of papers (parent-tasks only): ",temp.title.drop_duplicates().shape[0])

# Usages per (task, dataset, year); the 'date' column of entropy_dataset already holds the year.
task_ds_ycounts=entropy_dataset.groupby(['task','name',entropy_dataset.date]).size().reset_index()
#this is the important line
task_ds_ycounts= task_ds_ycounts[task_ds_ycounts.task.isin(median_parent_tasks)]

task_ds_ycounts.columns=['task', 'name','year','count']
# One concentration/evenness value per (task, year), computed over that year's per-dataset usage counts.
ginis=task_ds_ycounts.groupby(['task','year'])['count'].agg(corrected_gini).reset_index()
ginis.columns=['task','year','gini']
pielous=task_ds_ycounts.groupby(['task','year'])['count'].agg(pielou).reset_index()
pielous.columns=['task','year','pielou']
simpson=task_ds_ycounts.groupby(['task','year'])['count'].agg(alpha.simpson_e).reset_index()
simpson.columns=['task','year','simpson']
size=task_ds_ycounts.groupby(['task','year'])['count'].sum().reset_index()
size.columns=['task','year','task_size']
inequity_years_df=pd.merge(ginis,pielous,on=['task','year'])
inequity_years_df=pd.merge(inequity_years_df,simpson,on=['task','year'],how='left')
inequity_years_df=pd.merge(inequity_years_df,size,on=['task','year'],how='left')
# Attach task_age per task and the yearly PWC corpus size per year.
task_age_df=task_age.reset_index()
task_age_df.columns=['task','task_age']
inequity_years_df=pd.merge(inequity_years_df,task_age_df,on='task',how='left')
pwc_papers['year']=pd.to_datetime(pwc_papers['date']).dt.year
annual_size=pwc_papers.groupby('year').size().reset_index()
annual_size.columns=['year','pwc_size']
inequity_years_df=pd.merge(inequity_years_df,annual_size,on='year',how='left')
inequity_years_df['CV']=inequity_years_df['task'].apply(lambda x: in_category(x,'Computer Vision'))
inequity_years_df['NLP']=inequity_years_df['task'].apply(lambda x: in_category(x,'Natural Language Processing'))
inequity_years_df['Methodology']=inequity_years_df['task'].apply(lambda x: in_category(x,'Methodology'))
# Reset the Methodology flag for the tasks in Methodologies_to_Drop. The original applied
# `x in Methodologies_to_Drop` to the 0/1 flag itself rather than to the task name, so it never matched.
inequity_years_df.loc[inequity_years_df['task'].isin(Methodologies_to_Drop),'Methodology']=0
inequity_years_df.to_csv("/mnt/c/Users/berna/Documents/GoogleDataProject/EntropyInputs/EntropyDatasetforRParentsOnly.txt",sep='\t',quoting=1)

In [None]:
# Subset of entropy_dataset restricted to the retained parent tasks.
temp=entropy_dataset[entropy_dataset.task.isin(median_parent_tasks)]

# This expression is not the last statement of the cell, so its result is discarded.
temp[['name','title']].drop_duplicates()

# Re-filter entropy_dataset by the dataset names listed in numdatasets.txt; this replaces `temp`.
datasets_w_authors=pd.read_csv("/mnt/c/Users/berna/Documents/GitHub/Life_of_a_Benchmark/Dataset_Curation/numdatasets.txt")
temp=entropy_dataset[entropy_dataset.name.isin(datasets_w_authors.Dataset_Name)]
# NOTE(review): the labels below say "parent-tasks only", but at this point `temp` is filtered by
# datasets_w_authors rather than by median_parent_tasks; confirm which subset is intended.
print("Number of tasks (parent-tasks only): ",temp.task.drop_duplicates().shape[0])
print("Number of datasets (parent-tasks only): ",temp.name.drop_duplicates().shape[0])
print("Number of papers (parent-tasks only): ",temp.title.drop_duplicates().shape[0])


In [None]:
# Display-only cell: unique (dataset, paper) pairs in the current `temp` subset.
temp[['name','title']].drop_duplicates()



In [None]:

# Tasks that have at least one child in the ontology.
parent_tasks=[task for task,children in parent_child_dict.items() if len(children)!=0]

# Usages per (task, dataset) across all years, restricted to the retained parent tasks.
task_ds_counts=entropy_dataset.groupby(['task','name']).size().reset_index(name='count')
task_ds_counts=task_ds_counts[task_ds_counts['task'].isin(median_parent_tasks)]

# One value per task for each measure, computed over the per-dataset usage counts.
counts_by_task=task_ds_counts.groupby(['task'])['count']
ginis=counts_by_task.agg(corrected_gini).rename('gini').reset_index()
pielous=counts_by_task.agg(pielou).rename('pielou').reset_index()
simpson=counts_by_task.agg(alpha.simpson_e).rename('simpson').reset_index()
size=counts_by_task.sum().rename('task_size').reset_index()

# Inner join of gini and pielou, then left joins for the remaining measures.
inequity_df=(
    ginis
    .merge(pielous,on=['task'])
    .merge(simpson,on=['task'],how='left')
    .merge(size,on=['task'],how='left')
)
# Category indicator columns (1/0) for each task.
for flag,category in (('CV','Computer Vision'),
                      ('NLP','Natural Language Processing'),
                      ('Methodology','Methodology')):
    inequity_df[flag]=inequity_df['task'].apply(in_category,args=(category,))
inequity_df.to_csv("/mnt/c/Users/berna/Documents/GoogleDataProject/EntropyInputs/EntropyDatasetforRParentsOnly.AllYears.txt",sep='\t',quoting=1)

In [None]:
# Display-only cell: shows the per-task (all years) inequity table.
inequity_df

In [None]:
# Spot check: per-dataset usage counts for one task in one year.
task_ds_ycounts[(task_ds_ycounts.year==2015) & (task_ds_ycounts.task=='3D Human Pose Estimation')]