In [4]:
import pandas as pd
import os
from uuid import uuid4
import random
from sklearn.model_selection import train_test_split

# Create the annotations for the database

In [2]:
# Locations of the exported database tables (input) and of the generated
# annotation files (output).
input_dir = 'INPUT_DIR'
output_dir = 'OUTPUT_DIR'

# Create the output directory if it does not exist yet. exist_ok=True makes
# this a single race-free call instead of a separate exists-check followed by
# a create, and re-running the cell stays harmless.
os.makedirs(output_dir, exist_ok=True)

In [20]:
# Read the three database exports from input_dir: references, documents and
# the reference -> cited-document link table (in that order).
ref_df, doc_df, cited_doc_df = (
    pd.read_csv(input_dir + table_file)
    for table_file in ('/db_ref.csv', '/db_doc.csv', '/db_cited_doc.csv')
)

In [36]:
# Keep exactly one reference id per paragraph (par_id).
# A paragraph with a single reference keeps it. When a paragraph has several,
# references that cite at least one document whose title is not 'unknown' are
# preferred; the final pick among the candidates is random.
# NOTE(review): `random` is not seeded in this notebook, so the selection
# differs between runs — seed it in a config cell if reproducibility matters.

# doc id -> True when the document title is anything other than 'unknown'.
# drop_duplicates keeps the first row per id, i.e. the row a first-match
# lookup on doc_df would have used.
unique_docs = doc_df.drop_duplicates(subset='id')
doc_has_title = dict(zip(unique_docs['id'], unique_docs['title'] != 'unknown'))

# ref id -> list of cited doc ids, built once instead of masking the whole
# link table for every reference.
docs_by_ref = cited_doc_df.groupby('ref_id')['doc_id'].apply(list).to_dict()

# par id -> list of its ref ids. sort=False keeps paragraphs in order of first
# appearance; rows with a missing par_id are dropped by groupby, so they can no
# longer lead to sampling from an empty list.
refs_by_par = ref_df.groupby('par_id', sort=False)['id'].apply(list)
par_ids = refs_by_par.index.tolist()

samples = []
for ref_ids in refs_by_par:
    if len(ref_ids) == 1:
        samples.append(ref_ids[0])
        continue
    # A cited doc that has no row in doc_df counts as "no known title"
    # instead of raising IndexError.
    better_refs = [
        ref_id for ref_id in ref_ids
        if any(doc_has_title.get(doc_id, False)
               for doc_id in docs_by_ref.get(ref_id, []))
    ]
    # Fall back to all references of the paragraph when none qualifies.
    samples.append(random.choice(better_refs or ref_ids))
len(samples)
    

1137

In [41]:
# Distribute the sampled references over the annotators and export one
# annotation file per batch.
# NOTE(review): neither `random` nor train_test_split is seeded here, so the
# assignment differs between runs — seed both if reproducibility matters.
user_ids = {
    'user1':'5a5fb90d-8b63-41e0-9647-65aa975c3a82',
    'user2':'2f3aacef-d202-402d-b257-5c232b1c55c9',
    }
active_users = [v for k, v in user_ids.items() if k in ['user1', 'user2']]

# Every unordered pair of active annotators, in random order.
user_pairs = [
    [first_user, second_user]
    for idx, first_user in enumerate(active_users)
    for second_user in active_users[idx + 1:]
]
random.shuffle(user_pairs)

# Shuffle the references before cutting them into batches.
samples = random.sample(samples, len(samples))

# Four batches of (roughly) equal size; the last one takes the remainder.
q = round(len(samples)/4)

sample_1 = samples[:q]
sample_2 = samples[q:q*2]
sample_3 = samples[q*2:q*3]
sample_4 = samples[q*3:]

columns = ['user_id', 'ref_id', 'guideline_version']
for nr, sample in enumerate([sample_1, sample_2, sample_3, sample_4]):
    # 90% of a batch gets one annotator each, 10% gets a pair of annotators.
    t_1, t_2 = train_test_split(sample, test_size=0.1)

    # Collect all rows first and build the frame once; appending row by row
    # with .loc re-allocates the frame on every insert.
    records = [
        [active_users[i % len(active_users)], task, 2.0]
        for i, task in enumerate(t_1)
    ]
    records += [
        [user, task, 2.0]
        for i, task in enumerate(t_2)
        for user in user_pairs[i % len(user_pairs)]
    ]
    ann_df = pd.DataFrame(records, columns=columns)

    # Write inside output_dir (the plain string concatenation had no path
    # separator) and put the batch number in the file name so the four
    # batches no longer overwrite one another.
    ann_df.to_csv(os.path.join(output_dir, f'OUTPUT_FILE_{nr}'), index=False)