In [1]:
import string
import ftfy
import json
import re
import os

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm
import pandas as pd

from utils import search_engine, claim_extraction, QUOTES, emoji_pattern

# Runs of whitespace; used to split text into words.
SPACES = re.compile(r"\s+")
# One or more line breaks together with any surrounding whitespace.
LINES = re.compile(r"(\s*\n\s*)+")
# Captures the text inside <b>...</b> tags (non-greedy).
BOLD = re.compile(r"<b>(.*?)</b>")
# NLTK's Portuguese stopword list as a set, so it can be subtracted from word sets.
STOPWORDS = set(stopwords.words("portuguese"))

# Directory whose files are loaded as the input datasets.
PREPRO_DIR = "data/parquet/prepro"

# Register tqdm's pandas integration (e.g. `progress_apply`).
tqdm.pandas()

In [None]:
# Load every preprocessed dataset found in PREPRO_DIR into a dict keyed by
# the file name without its ".parquet" extension (e.g. "Fake.br").
data = dict()

for dataset_path in os.listdir(PREPRO_DIR):
    dataset_name, extension = os.path.splitext(dataset_path)
    # Ignore anything that is not a parquet file (hidden files, checkpoints, ...)
    # instead of blindly chopping the last 8 characters off its name.
    if extension != ".parquet":
        continue
    data[dataset_name] = pd.read_parquet(os.path.join(PREPRO_DIR, dataset_path))

**Algorithm:**

1.  **Query Selection**
    *   The entire claim, if it has at most 20 words.
    *   Otherwise, the first sentence, if it has at least 7 words.
    *   Otherwise, the first paragraph, if it has at least 20 words.
    *   Otherwise, the first 20 words of the claim.
2.  **Exact Search**
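    *   Each result gets a *match* score: the fraction of the query's non-stopword terms that appear in bold in its snippet. If any result scores above 0.8, the query counts as an exact *match* and these results are returned.
    *   If not, perform *query_extraction* (`claim_extraction` in the code) and search again using the extracted query.
        *   *query_extraction* receives at most the first 75 words of the claim.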
3.  **From the TOP 5 results, add a flag if it is an agency or a reliable source.**

In [9]:
def extend_with_search(text: str) -> pd.Series:
    """Search for a claim and return the query, the extracted claim and the hits.

    The text is cleaned (ftfy fix-up, emoji and quote removal), a query is
    selected from it and sent to ``search_engine``. Every returned result gets
    a ``"match"`` entry: the fraction of the query's non-stopword terms found
    inside ``<b>...</b>`` spans of its snippet, or ``None`` when the result has
    no snippet or the query has no non-stopword terms. If no result scores
    above 0.8, ``claim_extraction`` is invoked on at most the first 75 words of
    the text and the search is repeated with the extracted claim; the results
    of that second search replace the first ones and carry no ``"match"`` entry.

    Args:
        text: Raw claim text.

    Returns:
        A ``pd.Series`` with the keys ``"query"`` (the selected query),
        ``"claim"`` (the extracted claim, or ``None`` if no extraction was
        needed) and ``"results"`` (whatever the last ``search_engine`` call
        returned).
    """
    text = ftfy.fix_text(text)
    text = emoji_pattern.sub(r'', text)
    for quote in QUOTES:
        text = text.replace(quote, "")
    text = text.strip()

    words = re.split(SPACES, text)

    # Query selection: whole text -> first sentence -> first paragraph / first 20 words.
    if len(words) <= 20:
        query = text
    else:
        fst_sent = sent_tokenize(text, language='portuguese')[0]

        if len(re.split(SPACES, fst_sent.strip())) >= 7:
            query = fst_sent
        else:
            fst_paragraph = re.split("\n+", text)[0]

            # NOTE(review): a first paragraph of 20+ words is used in full, so the
            # query can get long, while a shorter one is replaced by the first 20
            # words of the text. Confirm this is intended and not an inverted test.
            if len(re.split(SPACES, fst_paragraph.strip())) < 20:
                query = " ".join(words[:20])
            else:
                query = fst_paragraph

    results = search_engine(query, n=5)

    # Query terms used for scoring: lower-cased, ASCII punctuation and stopwords removed.
    query_no_punc = query.translate(str.maketrans('', '', string.punctuation))
    words_set = set(re.split(SPACES, query_no_punc.strip().lower())) - STOPWORDS

    had_match = False
    for r in results or []:
        snippet = r.get("snippet")
        # An empty `words_set` (query made only of stopwords) has no meaningful
        # score; leave it undefined rather than dividing by zero.
        if snippet and words_set:
            matches = re.findall(BOLD, snippet)
            # `w.upper().isupper()` is true only if `w` has at least one cased
            # character, which drops bold spans made of digits/punctuation only.
            matches = [re.split(SPACES, w.strip().lower()) for w in matches if w.upper().isupper()]

            matches = {w for match in matches for w in match} - STOPWORDS
            r["match"] = len(words_set.intersection(matches)) / len(words_set)
        else:
            r["match"] = None

        if r["match"] and r["match"] > 0.8:
            had_match = True

    claim = None
    if not had_match:
        # Fall back to an extracted claim, feeding at most the first 75 words.
        input_text = " ".join(words[:75]) if len(words) > 75 else text
        claim = claim_extraction.invoke({"text": input_text})
        claim = claim.content.strip()
        for quote in QUOTES:
            claim = claim.replace(quote, "")
        results = search_engine(claim, n=5)

    results = pd.Series({
        "query": query,
        "claim": claim,
        "results": results
    })

    return results

In [None]:
# Resumable enrichment loop: every processed row is checkpointed to
# `checkpoint_path`, so re-running the cell continues where it stopped.
checkpoint_path = "results.tmp"

if not os.path.exists(checkpoint_path):
    with open(checkpoint_path, "w") as f:
        json.dump({}, f)

with open(checkpoint_path, "r") as f:
    results = json.load(f)

for dataset in ['MuMiN-PT', 'COVID19.BR', 'Fake.br']:
    for idx, row in tqdm(data[dataset].iterrows(), total=data[dataset].shape[0]):
        key = f"{dataset}_{idx}"
        # Skip rows already checkpointed with search results; rows whose
        # previous attempt stored no results are retried.
        if key in results and results[key]["results"] is not None:
            continue

        result = extend_with_search(row["text_no_url"])

        # Surface the texts for which the search returned nothing.
        if result["results"] is None:
            tqdm.write(row["text_no_url"])

        if "query" in row.index:
            # `result` is a Series: its labels live in `.index` (it has no `.columns`).
            for column in result.index:
                row[column] = result[column]
        else:
            row = pd.concat([row, result])

        results[key] = json.loads(row.to_json())

        # Write to a sibling file and swap it in, so an interruption in the
        # middle of the dump cannot truncate the existing checkpoint.
        partial_path = f"{checkpoint_path}.part"
        with open(partial_path, "w") as f:
            json.dump(results, f)
        os.replace(partial_path, checkpoint_path)

In [4]:
# Rebuild one DataFrame per dataset from the checkpointed rows and write it
# next to the preprocessed data, under "final".
final_dir = f"{PREPRO_DIR}/../final"
os.makedirs(final_dir, exist_ok=True)

for dataset in ['Fake.br', 'COVID19.BR', 'MuMiN-PT']:
    # Checkpoint keys look like "<dataset>_<row index>"; match on the prefix so a
    # dataset whose name merely contains another one's cannot leak rows into it.
    prefix = f"{dataset}_"
    rows = {key[len(prefix):]: results[key] for key in results if key.startswith(prefix)}
    data[dataset] = pd.DataFrame(rows).T

    data[dataset].to_parquet(f"{final_dir}/{dataset}.parquet")