# Stage 1: Role Labeling - Convert Format

In [30]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import InferenceClient
from transformers import BertTokenizer
from utils.preprocessing import *
from utils.accelerators import *
from utils.multithreading import *
from utils.database import *
from utils.model import *
from utils.files import *
from datasets import Dataset
from rouge import Rouge
from tqdm import tqdm
import statistics
import hashlib
import random
import openai
import time
import math
import re

### Connect to Database

Credentials are sourced from the `.env` file.

In [31]:
# Open the database connection. getConnection returns a pair; only the second
# element (bound to `db`) is used in this notebook, the first is discarded.
_, db = getConnection(use_dotenv=True)

**Fetch Documents**

In [32]:
def fetchArticleTexts(db, limit: int = 0, skip: int = 0, fields: dict = {},  query: dict = {}, collection="articles.sampled.triplets"):
    """Fetch documents from `db[collection]` and return them as a list.

    `query` and `fields` are passed positionally to `find`; `limit` and `skip`
    are then applied to the resulting cursor, in that order. The whole cursor
    is materialised in memory before returning.

    NOTE(review): with pymongo a limit of 0 presumably means "no limit" —
    confirm against the driver version in use.
    """
    source = db[collection]
    cursor = source.find(query, fields)
    window = cursor.limit(limit).skip(skip)
    return [document for document in window]

In [33]:
# Load the articles into memory, projecting only `_id` and `triplets`.
# limit=0 is forwarded to the cursor's .limit() — presumably "no limit" with
# pymongo; the update run at the end of this notebook processed 227,094 documents.
articles = fetchArticleTexts(db, limit=0, fields={"_id": 1, "triplets": 1})

In [34]:
# Display the first fetched document to inspect the stored structure.
articles[0]

{'_id': ObjectId('64d8eb3a516b265872293a5c'),
 'triplets': [{'chunk_id': 0,
   'chunk': 'January 14, 2016\nEntertainment, Movies The Revenant led this morning’s 2016 Oscar nominations with 12 nods. At age 25, Jennifer Lawrence became the youngest to win a fourth nomination. From the Washington Post: The nominations for the 88th Academy Awards were announced Thursday morning. In acting categories, many of the same stars who were honored during the Golden Globes on Sunday were nominated, including Brie Larson for “Room,” Matt Damon for “The Martian” and, of course, Leonardo DiCaprio, for “The Revenant.” There has been particular interest in that last potential nominee, given that DiCaprio has never won an Oscar and that movie was so grueling to shoot. Best picture\n“Spotlight”\n“The Revenant”\n“Mad Max: Fury Road”\n“The Big Short”\n“Bridge of Spies”\n“Brooklyn”\n“Room”\n“The Martian” Actor in a leading role\nLeonardo DiCaprio, “The Revenant”\nMatt Damon, “The Martian”\nMichael Fassbender

**Source format:**

```json
"triplets": [{"chunk_id": 0,
   "chunk": "January 14, 2016\nEntertainment, Movies The Revenant led this morning’s 2016 Oscar nominations with 12 nods. At age 25, Jennifer Lawrence became the youngest to win a fourth nomination. From the Washington Post: The nominations for the 88th Academy Awards were announced Thursday morning. In acting categories, many of the same stars who were honored during the Golden Globes on Sunday were nominated, including Brie Larson for “Room,” Matt Damon for “The Martian” and, of course, Leonardo DiCaprio, for “The Revenant.” ",
   "answers": [{"index": 0,
     "text": "\n{hero: \"Leonardo DiCaprio\", villain: \"None\", victim: \"None\"}",
     "logprobs": null,
     "finish_reason": "stop",
     "triplet": {"hero": "Leonardo DiCaprio",
      "villain": "None",
      "victim": "None"}},
    {"index": 1,
     "text": "\n{hero: \"Leonardo DiCaprio\", villain: \"None\", victim: \"None\"}",
     "logprobs": null,
     "finish_reason": "stop",
     "triplet": {"hero": "Leonardo DiCaprio",
      "villain": "None",
      "victim": "None"}},
    {"index": 2,
     "text": "\n{hero: \"Leonardo DiCaprio\", villain: \"None\", victim: \"None\"}",
     "logprobs": null,
     "finish_reason": "stop",
     "triplet": {"hero": "Leonardo DiCaprio",
      "villain": "None",
      "victim": "None"}}]}]
```

***

**Target format:**

```json
 "processing_result": {
    "hero": [
      "Leonardo DiCaprio",
      "Jennifer Lawrence"
    ],
    "villain": [
      "The Revenant",
      "Jason Priestly"
    ],
    "victim": [
      "Jennifer Lawrence",
      "Jennifer Lawrence"
    ]
  },
```

In [41]:
# Sample document, presumably kept for trying convertToDict by hand; note that
# the update loop further down rebinds `article` on every iteration.
article = articles[0]

def convertToDict(article):
    """Flatten an article's per-chunk triplets into three parallel role lists.

    Walks every answer of every chunk under ``article["triplets"]`` and collects
    the ``hero``, ``villain`` and ``victim`` values of each answer's ``triplet``.

    Args:
        article: Mapping with an optional ``"triplets"`` list; each chunk may
            carry an ``"answers"`` list whose items may carry a ``"triplet"``
            mapping.

    Returns:
        ``{"hero": [...], "villain": [...], "victim": [...]}``. The three lists
        always have the same length (one entry per answer); a role that is
        missing from an answer is recorded as ``""``.
    """

    hero, villain, victim = [], [], []

    # `or []` also covers keys that are present but hold None.
    for chunk in article.get("triplets") or []:
        for answer in chunk.get("answers") or []:
            triplet = answer.get("triplet")
            # A missing or malformed triplet must still be a mapping: calling
            # .get on a list/None fallback would raise AttributeError. Using an
            # empty dict keeps the three lists aligned with "" placeholders.
            if not isinstance(triplet, dict):
                triplet = {}
            hero.append(triplet.get("hero", ""))
            villain.append(triplet.get("villain", ""))
            victim.append(triplet.get("victim", ""))

    return {"hero": hero, "villain": villain, "victim": victim}

**Convert Format and update articles:**

In [42]:
def updateArticle(db, id: str, values: dict = {}, collection="articles.sampled.triplets"):
    """Apply a `$set` update to the single document whose `_id` matches `id`.

    `id` is wrapped in `ObjectId(...)` to build the filter, and a shallow copy
    of `values` becomes the `$set` payload. Returns whatever `update_one` on
    `db[collection]` returns.
    """
    selector = {"_id": ObjectId(id)}
    change = {"$set": {**values}}
    return db[collection].update_one(selector, change)

In [43]:
# Flatten each article's triplets with convertToDict and store the result under
# `processing_result`; updateArticle issues one update_one call per article.
for article in tqdm(articles):
    values = {"processing_result" : convertToDict(article)}
    updateArticle(db, article["_id"], values)

100%|██████████| 227094/227094 [04:55<00:00, 767.63it/s] 
