In [95]:
import datasets # hugging face datasets
import requests
import pandas as pd
from pathlib import Path
import os


In [99]:
# Project directories, resolved relative to the kernel's current working directory.
# The project root is taken to be the parent of the working directory.
root = Path.cwd().parent
data = root.joinpath('data')
processed = data.joinpath('processed')


PosixPath('/Users/toby/Dev/maet-pln/data/processed')

In [105]:
# List the datasets known to the `datasets` library and preview the first five names.
# NOTE(review): this call emits a deprecation warning; the suggested replacement is
# presumably `huggingface_hub.list_datasets` - confirm against the installed version.
datasets_list = datasets.list_datasets() # deprecation warning
datasets_list[:5]

['acronym_identification',
 'ade_corpus_v2',
 'UCLNLP/adversarial_qa',
 'aeslc',
 'afrikaans_ner_corpus']

In [13]:
# Try to load the 'comparisons' config through the `datasets` library.
# This currently raises FileNotFoundError; the failure is tolerated because the
# datasets-server REST API works fine and is what the rest of the notebook uses.
try:
    dataset = datasets.load_dataset("openai/summarize_from_feedback", 'comparisons')
except FileNotFoundError as err:
    # Keep the notebook running, but make the failure visible and leave `dataset`
    # defined so a later reference does not fail with a confusing NameError.
    dataset = None
    print(f"datasets.load_dataset failed, continuing without it: {err!r}")



In [104]:
# Get the first `n_obs` rows of the TL;DR data (config "axis", validation split)
# from the Hugging Face datasets-server REST API.
n_obs = 20
tldr_url = (
    "https://datasets-server.huggingface.co/"
    f"rows?dataset=openai%2Fsummarize_from_feedback&config=axis&split=validation&offset=0&length={n_obs}"
)

# Bound the request so a stalled connection cannot hang the kernel indefinitely.
r = requests.get(tldr_url, timeout=30)
# Raise an HTTPError carrying the status code and URL on a 4xx/5xx response;
# a bare `assert r.ok` gives no detail and is skipped when Python runs with -O.
r.raise_for_status()

tldr_dict = r.json()

# Number of rows actually returned by the API.
print(len(tldr_dict['rows']))

20


In [101]:
# Inspect the first row of the API response.
# NB: tldr_dict['rows'][0]['row']['info']['id'] is the post id; several rows can share
# the same id because there are several summaries per post.
tldr_dict['rows'][0]


{'row_idx': 0,
 'row': {'info': {'id': 't3_4l0bal',
   'post': "Recently, my fiance  (20 m) and I (19f) moved into a new apartment with a mutual friend (20m) and somehow contracted scabies (don't know how). We've both been itchy af and have been to the doctor who confirmed that it was scabies for the both of us. Our room mate (20m) has not had symptoms of scabies bites appear yet but I have asked him to get treated as well and to treat his clothes and linen so that our apartment does not get reinfested after treatment.\n\nMy room mate refuses to buy the lotion needed to kill the mites on his skin (if there are any on him) and refuses to rewash and dry his linen and clothes. I'm scared that if he does not get treated the infestation of our apartment will not go away. I'm almost there to asking him to move out if he refuses treatment . He is not on the lease.",
   'title': '19f with fiance 20m and roommate 19m- fiance and I recently got infected with scabies and have started treatment, r

In [84]:
class PostSummary:
    """A single summary of a post, parsed from one row of the API response.

    Attributes:
        post_id: identifier of the post (shared by all summaries of that post).
        post_text: body text of the post.
        post_summary: text of the summary.
        policy: name of the policy that produced the summary.
    """

    def __init__(self, dict):
        """Extract the fields of interest from one element of the response's 'rows' list.

        Args:
            dict: mapping with a 'row' entry that holds 'info' ('id', 'post')
                and 'summary' ('text', 'policy') sub-mappings.

        Raises:
            KeyError: if any of the expected keys is missing.
        """
        row = dict['row']
        info, summary = row['info'], row['summary']

        self.post_id = info['id']
        self.post_text = info['post']
        self.post_summary = summary['text']
        self.policy = summary['policy']

    def __repr__(self):
        """Short label naming the post and the policy that summarised it."""
        return f"Summary for {self.post_id} by {self.policy}"

    def to_dict(self):
        """Return a new dict of the instance attributes (a shallow copy)."""
        return vars(self).copy()


# get_post_id = lambda dict: dict['row']['info']['id']
# get_post_text = lambda dict: dict['row']['info']['post']
# get_post_summary = lambda dict: dict['row']['summary']['text']


In [43]:
# Sanity check: parse the first API row and show the extracted fields.
p = PostSummary(tldr_dict['rows'][0])
p.to_dict()

dict_items([('post_id', 't3_4l0bal'), ('post_text', "Recently, my fiance  (20 m) and I (19f) moved into a new apartment with a mutual friend (20m) and somehow contracted scabies (don't know how). We've both been itchy af and have been to the doctor who confirmed that it was scabies for the both of us. Our room mate (20m) has not had symptoms of scabies bites appear yet but I have asked him to get treated as well and to treat his clothes and linen so that our apartment does not get reinfested after treatment.\n\nMy room mate refuses to buy the lotion needed to kill the mites on his skin (if there are any on him) and refuses to rewash and dry his linen and clothes. I'm scared that if he does not get treated the infestation of our apartment will not go away. I'm almost there to asking him to move out if he refuses treatment . He is not on the lease."), ('post_summary', " Fiance and I recently got infected with scabies. Room mate refuses to get treated and our apartment will not go away. I

In [85]:
# Group summaries by post: post id -> list of PostSummary objects, in response order.
summaries = {}
# The loop variable is deliberately not called `p`, so the PostSummary bound to `p`
# earlier in the notebook is not overwritten with a raw row dict.
for row in tldr_dict['rows']:
    s = PostSummary(row)
    # setdefault creates the list the first time a post id is seen.
    summaries.setdefault(s.post_id, []).append(s)
        

In [100]:
# Build a table with every summary of one example post and save it to disk.
id_example = next(iter(summaries))  # first post id, in insertion order
df = pd.DataFrame([summary.to_dict() for summary in summaries[id_example]])

# `to_csv` does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing.
processed.mkdir(parents=True, exist_ok=True)
# The default integer index is written as the first, unnamed CSV column.
df.to_csv(processed / 'summary_examples.csv')

df


Unnamed: 0,post_id,post_text,post_summary,policy
0,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...",Fiance and I recently got infected with scabi...,sup4_ppo_rm4_t.7
1,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...",my fiance and I refuse to treat our room mate...,pretrain_6b_t.7
2,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...","fiancé and I contracted scabies, roommate ref...",sup4_6b_ppo_rm4_6b_t.7
3,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...","fiance and I are infected with scabies, room ...",sup4_6b_t0.7
4,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...","Fiance and I contracted scabies, roommate ref...",sup4_12b_t0.7
5,t3_4l0bal,"Recently, my fiance (20 m) and I (19f) moved ...","infestation of scabies mites in apartment, ro...",ref
