SSE Data Annotation
===


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import re
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# evaluation
from scipy.stats import rankdata

In [None]:
from pathlib import Path
# Ask git for the repository's top-level directory; the IPython `!` capture
# yields a list-like of output lines.
git_root_dir = !git rev-parse --show-toplevel
# Keep the first output line, stripped of surrounding whitespace, as a Path.
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
# Put <git root>/src on the import path (presumably where the `cbrec` package
# imported below lives -- confirm against the repo layout).
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
src_dir = os.path.join(git_root_dir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
import cbrec.genconfig

In [None]:
# Instantiate the cbrec generation config with no overrides.
config = cbrec.genconfig.Config()
# Optional toggles: when uncommented, these append "_old" to the metadata and
# feature DB file paths held on the config.
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a host
# where the caringbridge_core checkout lives at this location.
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
# Guard the append so re-running the cell does not add duplicate entries.
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)
import cbcore.data.paths

In [None]:
# Read the per-batch participant recommendation files (batches 0-3) into one
# DataFrame, then map each participant to every site id already recommended.
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')

participant_records = []
for batch_id in range(4):
    participant_data_filepath = os.path.join(participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson')
    with open(participant_data_filepath, 'r') as infile:
        # One JSON document per line.
        for line in infile:
            participant = json.loads(line)
            # The site_scores entry is discarded before building the frame.
            del participant['site_scores']
            participant['batch_id'] = batch_id
            participant_records.append(participant)

batch_df = pd.DataFrame(participant_records)

participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    # Flatten the site ids across all of this participant's batches.
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    unique_site_ids = set(recced_site_ids)
    # A participant must never have been given the same site twice.
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    participant_recced_site_map[participant_id] = list(unique_site_ids)
#participant_recced_site_map = {row.participant_id: [site['site_id'] for site in row.sse_site_list] for row in batch_df.itertuples()}
len(participant_recced_site_map)

In [None]:
# Spot-check ten randomly chosen rows of the combined batch frame.
batch_df.sample(n=10)

In [None]:
# Row count per batch_id, ordered by batch id.
batch_df.batch_id.value_counts().sort_index()

In [None]:
# Restrict the frame to batch 0 and show how many rows remain.
is_batch_zero = batch_df['batch_id'] == 0
sdf = batch_df.loc[is_batch_zero]
sdf.shape[0]

In [None]:
# Output location for the batch-0 annotation sheet: a TSV stored in the
# participant data directory.
participant_data_dir = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'participant',
)
b0_annotation_filepath = os.path.join(participant_data_dir, 'b0_sse_annotations.tsv')
b0_annotation_filepath

In [None]:
import csv
from datetime import timezone

# Build one annotation record per unique journal update recommended in this
# batch. When the same journal was recommended to several participants, the
# participant ids are merged into a single record.
# Keyed by journal_oid: O(1) duplicate lookup, and dict insertion order keeps
# the records in first-seen order.
annotations_by_oid = {}
for row in sdf.itertuples():
    for sse_site in row.sse_site_list:
        journal_oid = sse_site['journal_oid']
        existing = annotations_by_oid.get(journal_oid)
        if existing is not None:
            existing['participant_ids'].append(row.participant_id)
            continue
        # journal_timestamp is divided by 1000 before conversion (presumably
        # epoch milliseconds). The tzinfo is dropped again so the ISO string
        # stays naive UTC, matching what the deprecated
        # datetime.utcfromtimestamp() used to produce.
        journal_date = (
            datetime.fromtimestamp(sse_site['journal_timestamp'] / 1000, tz=timezone.utc)
            .replace(tzinfo=None)
            .isoformat()
        )
        annotations_by_oid[journal_oid] = {
            'batch_id': row.batch_id,
            'participant_ids': [row.participant_id],
            'site_id': sse_site['site_id'],
            'site_title': sse_site['site_title'],
            'journal_oid': journal_oid,
            'journal_date': journal_date,
            'journal_title': sse_site['cleaned_journal_title'],
            'journal_body': sse_site['cleaned_journal_body'],
        }
ds = list(annotations_by_oid.values())

# The file is opened only once every record is built, so a failure above
# cannot leave a truncated sheet behind.
# csv.writer handles quoting: fields containing tabs, newlines or double
# quotes are quoted and embedded quotes are doubled, so free-text titles and
# bodies can no longer break the row structure. Note that None values are
# written as empty fields.
with open(b0_annotation_filepath, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, delimiter='\t', lineterminator='\n')
    writer.writerow([
        'batch_id', 'participant_ids', 'recced_count', 'site_id', 'site_title',
        'journal_oid', 'journal_date', 'journal_title', 'journal_body',
    ])
    for d in ds:
        writer.writerow([
            d['batch_id'],
            ', '.join(str(pid) for pid in d['participant_ids']),
            len(d['participant_ids']),
            d['site_id'],
            d['site_title'],
            d['journal_oid'],
            d['journal_date'],
            d['journal_title'],
            d['journal_body'],
        ])
len(ds)

In [None]:
!head b0_sse_annotations.tsv

In [None]:
# Inspect the last `sse_site` entry left bound by the loop in the
# annotation-writing cell (undefined if that loop never iterated).
sse_site