In [2]:
# Take the URL list and use it to construct a .csv input for the pilot

In [127]:
import pandas as pd
import numpy as np

In [128]:
url_fpath = "./pdb_urls.csv"

In [129]:
with open(url_fpath, 'r', encoding='utf-8') as f:
    url_list = f.readlines()

In [130]:
url_list = [url.strip() for url in url_list]

In [131]:
url_list[:5]

['https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958911.pdf',
 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958912.pdf',
 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958913.pdf',
 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958914.pdf',
 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958915.pdf']

In [132]:
# A batch consists of:
# (a) 13 image urls (url1, url2, ...)
# (b) a number of additional HITs, (num_tasks)
# (c) the bonus amount (N additional tasks for bonus_amt dollars)

In [133]:
len(url_list)

2484

In [134]:
num_batches = len(url_list) / 13
num_batches

191.07692307692307

In [135]:
split_indices = [n for n in range(13,2484,13)]

In [136]:
split_indices[:5]

[13, 26, 39, 52, 65]

In [137]:
url_batches = np.array_split(url_list, split_indices)

In [138]:
#full_batches = url_batches[:-1]
full_batches = url_batches[:-3]

In [139]:
len(full_batches)

189

In [140]:
batch_list = [{f'pdf{k+1}': url for k, url in enumerate(batch)} for batch in full_batches]

In [141]:
batch_list[0]

{'pdf1': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958911.pdf',
 'pdf2': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958912.pdf',
 'pdf3': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958913.pdf',
 'pdf4': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958914.pdf',
 'pdf5': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958915.pdf',
 'pdf6': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958916.pdf',
 'pdf7': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958918.pdf',
 'pdf8': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958920.pdf',
 'pdf9': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958922.pdf',
 'pdf10': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958924.pdf',
 'pdf11': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958926.pdf',
 'pdf12': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958929.pdf',
 'pdf13': 'https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005958931.pdf'}

In [142]:
batch_df = pd.DataFrame(batch_list)

In [143]:
batch_df.head(1)

Unnamed: 0,pdf1,pdf2,pdf3,pdf4,pdf5,pdf6,pdf7,pdf8,pdf9,pdf10,pdf11,pdf12,pdf13
0,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...


In [144]:
#wages = [50, 90, 98, 99, 100, 101, 102, 110, 150]
wages = ["$0.50", "$0.90", "$0.98", "$0.99", "$1.00", "$1.01", "$1.02",
         "$1.10", "$1.50"]

In [145]:
len(wages)

9

In [146]:
191 / 9

21.22222222222222

In [147]:
batch_wages = []
for cur_wage in wages:
    batch_wages.extend(21*[cur_wage])

In [148]:
len(batch_wages)

189

In [149]:
batch_df['offer_amt'] = batch_wages

In [150]:
batch_df['offer_amt'].value_counts()

$1.01    21
$1.10    21
$0.99    21
$1.50    21
$1.02    21
$0.90    21
$0.98    21
$0.50    21
$1.00    21
Name: offer_amt, dtype: int64

In [151]:
batch_df['additional_tasks'] = 12

In [152]:
batch_df.head()

Unnamed: 0,pdf1,pdf2,pdf3,pdf4,pdf5,pdf6,pdf7,pdf8,pdf9,pdf10,pdf11,pdf12,pdf13,offer_amt,additional_tasks
0,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,$0.50,12
1,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,$0.50,12
2,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,$0.50,12
3,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,$0.50,12
4,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,https://cs.stanford.edu/~jjacobs3/pdb/DOC_0005...,$0.50,12


In [153]:
batch_df['batch_id'] = 'p01'

In [155]:
# Final thing: just take first 100 batches

In [156]:
pilot_df = batch_df.iloc[:100].copy()

In [157]:
pilot_df.to_csv("pilot.csv", index=False)

In [158]:
len(pilot_df)

100