In [1]:
import os

from sklearn.model_selection import (
    train_test_split
)
import numpy as np
import pandas as pd

# Directory holding one fact-checked parquet file per dataset; later cells
# list this directory and build read/write paths from it.
data_dir = "../data/parquet/fact_check/"

# Seed for the notebook's stochastic steps.
# NOTE(review): presumably meant for the train/dev/test splitting — confirm it
# is actually handed to the splitter.
SEED = 2025

In [2]:
# Substrings that force an item's label to "fake", grouped by dataset name
# (the stem of the parquet file inside `data_dir`).
corrections = {
    "MuMiN-PT": [
        "Associação Médica Americana",
        "produtividade",
        "leoa",
        "Araraquara",
    ],
    "COVID19.BR": [
        "Fernanda Torres",
        "oms-pede-desculpas-pelo-erro-nas-controversia-sobre-hidroxicloroquina",
        "assintomaticos-nao-contribuem-para-a-propagacao-do-virus",
        "Segundo entendi",
        "banco-mundial-classifica-o-brasil",
        "vitamina D",
        "Antiparasitário",
        "45 - 19",
        "FDA",
        "nunca-aconselhou",
        "brasilsemmedo.com/dra-nise-e-a-batalha-para-salvar-o-brasil-do-virus"
    ]
}

for dataset, spans in corrections.items():
    # Same location as before, but derived from the shared `data_dir` setting.
    fname = os.path.join(data_dir, f"{dataset}.parquet")

    df = pd.read_parquet(fname)

    # A non-string text (None / NaN) cannot contain a span; treating it as
    # "no match" avoids the TypeError the `in` operator would raise.
    needs_relabel = df["text"].map(
        lambda text: isinstance(text, str) and any(span in text for span in spans)
    ).astype(bool)

    # Matching rows become "fake"; every other row keeps its current label.
    df["label"] = np.where(needs_relabel, "fake", df["label"])

    # The file is overwritten in place. Re-running the cell is harmless because
    # relabelling an already "fake" row changes nothing.
    df.to_parquet(fname)

In [3]:
# Per-dataset containers, all keyed by dataset name (parquet file stem).
data = dict()             # fact-checked frames read from `data_dir`
raw_data = dict()         # frames read from ../data/parquet/original
near_duplicates = dict()  # "near_duplicates" column read from ../data/parquet/prepro
fix = dict()              # optional frames from ../data/parquet/fix2 (None when absent)

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# dict order (and everything derived from it) vary between machines.
for dataset_path in sorted(os.listdir(data_dir)):
    dataset, extension = os.path.splitext(dataset_path)
    if extension != ".parquet":
        # Skip stray entries (hidden files, sub-directories, ...) instead of
        # chopping eight characters off an unrelated name.
        continue

    data[dataset] = pd.read_parquet(os.path.join(data_dir, dataset_path))

    raw_data[dataset] = pd.read_parquet(f'../data/parquet/original/{dataset}.parquet')
    # All frames are matched on string index labels further down.
    raw_data[dataset].index = [str(idx) for idx in raw_data[dataset].index]

    fix_path = f'../data/parquet/fix2/{dataset}.parquet'
    fix[dataset] = pd.read_parquet(fix_path) if os.path.exists(fix_path) else None

    near_duplicates[dataset] = pd.read_parquet(f'../data/parquet/prepro/{dataset}.parquet')["near_duplicates"]
    near_duplicates[dataset].index = [str(idx) for idx in near_duplicates[dataset].index]

# Fake.br labels look like "<label>_<number>". When the unpadded form is still
# present, zero-pad the number to four digits and order the rows by
# "<number>_<label>", so rows sharing a number end up next to each other.
if "Fake.br" in data and "fake_1" in data["Fake.br"].index:
    split_labels = [idx.split("_") for idx in data["Fake.br"].index]
    data["Fake.br"].index = [f"{parts[0]}_{int(parts[1]):04}" for parts in split_labels]
    data["Fake.br"] = data["Fake.br"].sort_index(
        key=lambda idx: idx.str.replace(r"(.*)_(.*)", r"\2_\1", regex=True)
    )

In [4]:
def check_labels(row, reference=None):
    """Tell whether ``row``'s label differs from the label stored in ``reference``.

    Parameters
    ----------
    row : pandas.Series
        A row whose ``name`` identifies the item and which has a ``"label"`` entry.
    reference : pandas.DataFrame, optional
        Frame indexed by string ids with a ``"label"`` column and, optionally,
        a ``"manual_review"`` column. Defaults to ``data[dataset]``, i.e. the
        notebook-level dict indexed by the current value of the global
        ``dataset`` loop variable.

    Returns
    -------
    bool
        ``False`` when the item is absent from ``reference`` or is flagged
        ``manual_review``; otherwise whether the two labels differ.
    """
    if reference is None:
        reference = data[dataset]

    # Normalise the key once: the lookups below use the string form, so the
    # membership test has to use it as well.
    key = str(row.name)
    if key not in reference.index:
        return False

    current = reference.loc[key]

    # `== True` (not truthiness) so a missing / NaN flag counts as "not reviewed".
    if "manual_review" in reference.columns and current["manual_review"] == True:
        return False

    return row["label"] != current["label"]

def fix_few_tokens(row):
    """Return the row's ``few_tokens`` flag unless another flag already applies.

    If any of ``only_url``, ``unrelated_content`` or ``null`` is set on the row
    the result is ``False``; otherwise the row's own ``few_tokens`` value is
    returned unchanged. A flag that the row does not carry is treated as unset,
    so frames lacking one of these columns no longer raise ``KeyError``.
    """
    overriding_flags = ("only_url", "unrelated_content", "null")

    if any(row.get(flag, False) for flag in overriding_flags):
        return False

    return row["few_tokens"]

# NOTE: the loop variable must stay named `dataset`, because check_labels reads
# it as a global to pick the reference frame.
for dataset in data:
    raw = raw_data[dataset]  # same object as raw_data[dataset]; edited in place

    # Every column to the right of "text_no_url" is handled as a review flag.
    review_idx = raw.columns.to_list().index("text_no_url") + 1
    flag_columns = raw.columns[review_idx:]
    raw[flag_columns] = raw[flag_columns].fillna(False)

    # axis=1 (row-wise) is the documented spelling; axis=True only worked
    # because True == 1.
    raw["google_fact_check"] = raw.apply(check_labels, axis=1)
    if "few_tokens" in raw.columns:
        raw["few_tokens"] = raw.apply(fix_few_tokens, axis=1)

    # Persist the normalised flags back to the "original" parquet file.
    raw.to_parquet(f'../data/parquet/original/{dataset}.parquet')

# NOTE(review): 3600 looks like the number of "fake" rows before this filter has
# been applied, which would make the removal run only once — confirm.
if data["Fake.br"].value_counts("label")["fake"] == 3600:
    # Rows flagged `label_align` are dropped; so are rows that have no entry in
    # the raw frame at all (default True).
    to_remove = data["Fake.br"].apply(
        lambda row: raw_data["Fake.br"]["label_align"].get(row.name, True), axis=1
    )
    data["Fake.br"] = data["Fake.br"][~to_remove]

In [5]:
# Column names used in the fact-check frames -> names used in the refactored
# frames ("label" keeps its name).
rename_cols = {
    "query": "initial_query",
    "claim": "claim_query",
    "results": "google_search_results",
    'claim_results': "google_fact_check_results",
    "label": "label"
}

for dataset in data:
    if set(data[dataset].columns).intersection(rename_cols.keys()) != {"label"}:
        # The frame still carries the old column names: rebuild it from the raw
        # data plus the renamed derived columns.
        derived_info = data[dataset][list(rename_cols.keys())].rename(columns=rename_cols)

        # Columns right of "text_no_url" are review flags. A row is dropped when
        # any flag is set, except the two flags that only record a relabelling.
        remove_idx = raw_data[dataset].columns.to_list().index("text_no_url") + 1
        remove_cols = [
            col for col in raw_data[dataset].columns[remove_idx:]
            if col not in ("manual_review", "google_fact_check")
        ]

        not_removed = ~raw_data[dataset][remove_cols].any(axis=1)

        new_data = raw_data[dataset][not_removed]

        # Keep only the original (non-flag) columns; the label is taken from
        # the derived columns instead.
        old_columns = new_data.columns[:remove_idx].to_list()
        old_columns.remove("label")

        new_data = new_data[old_columns]
        new_data.index = [str(idx) for idx in new_data.index]

        # Number of rows the rebuilt frame has to end up with.
        expected_rows = new_data.shape[0]

        refactor_data = pd.concat([new_data, derived_info], axis=1)
        # .copy(): the filtered frame gets new columns assigned right below.
        refactor_data = refactor_data[refactor_data["text"].notna()].copy()
        refactor_data["near_duplicates"] = near_duplicates[dataset]

        # Fall back to the raw label for rows without a derived label.
        refactor_data["label"] = refactor_data.apply(
            lambda row: row.label if pd.notna(row.label) else raw_data[dataset].label[row.name],
            axis=1,
        )
    else:
        # Already in the refactored layout; nothing to rebuild.
        refactor_data = data[dataset]
        expected_rows = refactor_data.shape[0]

    if fix[dataset] is not None:
        # The fix file must cover exactly the rows that miss the derived columns.
        assert fix[dataset].index.equals(refactor_data[refactor_data["initial_query"].isna()].index)

        for new_column in rename_cols.values():
            if new_column == "label":
                continue

            # A float cell is how a missing value shows up here; those cells are
            # filled from the fix frame. `column=` binds the current loop value.
            refactor_data[new_column] = refactor_data.apply(
                lambda row, column=new_column: row[column]
                if not isinstance(row[column], float)
                else fix[dataset][column].get(row.name),
                axis=1,
            )

    # Compare against the count captured for *this* dataset. The earlier check
    # read `new_data`, which is undefined (or left over from a previous dataset)
    # whenever the else-branch above runs.
    assert refactor_data.shape[0] == expected_rows
    assert refactor_data["text"].notna().all()
    assert refactor_data["label"].notna().all()

    data[dataset] = refactor_data

In [7]:
# One single-row frame per dataset: how many rows have each review flag set.
per_dataset_counts = []

for dataset in data:
    raw_columns = raw_data[dataset].columns
    # Review flags are the columns located after "text_no_url".
    review_idx = list(raw_columns).index("text_no_url") + 1
    review_columns = raw_columns[review_idx:]
    review = raw_data[dataset].loc[:, review_columns].sum().to_frame(dataset).T
    per_dataset_counts.append(review)

# Flags missing from a dataset count as 0; afterwards rows are flags and
# columns are datasets.
reviews = pd.concat(per_dataset_counts).fillna(0).astype(int).T
reviews["total"] = reviews.sum(axis=1)

# Most frequent flag first; "total" stays in `reviews` and is only hidden in
# the displayed table.
reviews = reviews.sort_values("total", ascending=False)
reviews.drop(columns="total")

Unnamed: 0,COVID19.BR,Fake.br,MuMiN-PT
only_url,501,0,0
few_tokens,302,0,0
unrelated_content,82,0,0
label_align,0,40,0
google_fact_check,23,0,4
contradicts,20,0,0
url_duplicated,0,17,0
not_pt,8,0,0
no_full_text,0,4,0
manual_review,3,0,0


In [8]:
def _collapse_rows(table, rows, name):
    """Replace the listed index rows of ``table`` by one row with their column-wise sum.

    Rows that are not present in ``table`` are skipped, since selecting or
    dropping a list containing a missing label raises ``KeyError``. The summed
    row is appended at the bottom under the index label ``name``.
    """
    present = [row for row in rows if row in table.index]
    collapsed = table.loc[present].sum().to_frame(name).T
    return pd.concat([table, collapsed]).drop(index=present)


review_article = reviews.drop(columns="total").copy()

# Flags merged into a single "auto" row.
auto = ["only_url", "few_tokens", "null", "duplicated"]
review_article = _collapse_rows(review_article, auto, "auto")

# Flags merged into a single "Fake.br" row.
fakebr = ["no_full_text", "url_duplicated", "label_align"]
review_article = _collapse_rows(review_article, fakebr, "Fake.br")

print(
    review_article.to_latex(
        bold_rows=True, escape=True, multicolumn_format="c",
    )
)

\begin{tabular}{lrrr}
\toprule
 & COVID19.BR & Fake.br & MuMiN-PT \\
\midrule
\textbf{unrelated\_content} & 82 & 0 & 0 \\
\textbf{google\_fact\_check} & 23 & 0 & 4 \\
\textbf{contradicts} & 20 & 0 & 0 \\
\textbf{not\_pt} & 8 & 0 & 0 \\
\textbf{manual\_review} & 3 & 0 & 0 \\
\textbf{auto} & 804 & 1 & 0 \\
\textbf{Fake.br} & 0 & 61 & 0 \\
\bottomrule
\end{tabular}



In [9]:
# Columns (and their order) of the cleaned split files.
show_columns = [
    "text_no_url", "old_split", "new_split", "label", "initial_query", "claim_query",
    "google_search_results", "google_fact_check_results", "near_duplicates", "text_urls", "metadata",
]

split_raw_data = dict()
split_data = dict()
for dataset in ["Fake.br", "COVID19.BR", "MuMiN-PT"]:
    review_idx = raw_data[dataset].columns.to_list().index("text_no_url") + 1
    review_columns = raw_data[dataset].columns[review_idx:].to_list()

    # Rows with any review flag set are pinned to the training split.
    changed = raw_data[dataset][review_columns].any(axis=1)

    split_raw_data[dataset] = raw_data[dataset][["text_no_url", "label"]].copy()
    split_raw_data[dataset]["old_split"] = raw_data[dataset].get("split")
    split_raw_data[dataset]["new_split"] = np.where(changed, "train", None)

    # Share of the *unflagged* rows that must go to train so that train is 80%
    # of all rows once the flagged rows are counted in.
    n_rows = split_raw_data[dataset].shape[0]
    n_changed = changed.sum()
    new_percent = (0.8 * n_rows - n_changed) / (n_rows - n_changed)
    if not 0 < new_percent < 1:
        raise ValueError(
            f"{dataset}: cannot reach an 80% train share with {n_changed} flagged rows out of {n_rows}"
        )

    if dataset == "Fake.br":
        # Labels look like "<label>_<number>"; the split is drawn over the
        # numbers so rows sharing a number always land in the same split.
        unchanged_index = split_raw_data[dataset][~changed].index
        not_changed_idx = [idx.split("_")[1] for idx in unchanged_index]

        # sorted(): set iteration order depends on string hashing, so without
        # it the split would differ between kernels even with a fixed seed.
        pair_ids = sorted(set(not_changed_idx))
        _, test_dev_pair = train_test_split(pair_ids, train_size=new_percent, random_state=SEED)
        dev_pair, test_pair = train_test_split(test_dev_pair, test_size=0.5, random_state=SEED)

        # Sets give constant-time membership tests in the comprehension below.
        dev_pair, test_pair = set(dev_pair), set(test_pair)

        split_raw_data[dataset].loc[~changed, "new_split"] = [
            "test" if item in test_pair else "dev" if item in dev_pair else "train"
            for item in not_changed_idx
        ]
    else:
        unchanged_labels = split_raw_data[dataset][~changed]["label"]
        not_changed_range = range(unchanged_labels.shape[0])

        # Label-stratified draw over the positions of the unflagged rows.
        _, test_dev = train_test_split(
            not_changed_range, train_size=new_percent, stratify=unchanged_labels, random_state=SEED
        )
        dev, test = train_test_split(
            test_dev, test_size=0.5, stratify=unchanged_labels.iloc[test_dev], random_state=SEED
        )
        dev, test = set(dev), set(test)

        split_raw_data[dataset].loc[~changed, "new_split"] = [
            "test" if item in test else "dev" if item in dev else "train"
            for item in not_changed_range
        ]

    split_info = split_raw_data[dataset].drop(columns=["label"])
    split_info.index = split_info.index.map(str)

    # Attach the split information to the cleaned frame. "text_no_url" comes
    # from the raw frame, so the cleaned frame's own copy is dropped first.
    split_data[dataset] = (
        data[dataset]
        .drop(columns=["text", "text_no_url"])
        .join(split_info)[show_columns]
    )

    # Validate before anything is written to disk.
    # "text_no_url" is not checked: those assertions were commented out in this
    # cell (NOTE(review): presumably because nulls are expected — confirm).
    assert split_raw_data[dataset]["new_split"].notna().all()
    assert split_raw_data[dataset]["label"].notna().all()
    assert split_data[dataset]["label"].notna().all()

    # Split shares in the raw frame and in the cleaned frame.
    # NOTE(review): the cleaned frame can show a smaller train share than the
    # raw one, presumably because flagged rows are pinned to train and are then
    # missing from `data` — confirm this is intended.
    print(dataset)
    print(split_raw_data[dataset]["new_split"].value_counts()/split_raw_data[dataset].shape[0])
    print(split_data[dataset]["new_split"].value_counts()/split_data[dataset].shape[0])
    print("-"*12)

    split_raw_data[dataset].to_parquet(f"{data_dir}/../split/{dataset}_raw.parquet")
    split_data[dataset].to_parquet(f"{data_dir}/../split/{dataset}.parquet")
   

Fake.br
new_split
train    0.8
test     0.1
dev      0.1
Name: count, dtype: float64
new_split
train    0.798883
test     0.100559
dev      0.100559
Name: count, dtype: float64
------------
COVID19.BR
new_split
train    0.799931
test     0.100034
dev      0.100034
Name: count, dtype: float64
new_split
train    0.708103
test     0.145949
dev      0.145949
Name: count, dtype: float64
------------
MuMiN-PT
new_split
train    0.799858
test     0.100427
dev      0.099715
Name: count, dtype: float64
new_split
train    0.799858
test     0.100427
dev      0.099715
Name: count, dtype: float64
------------
