In [None]:
import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import string
import os

# Tokenizer models used by word_tokenize / sent_tokenize.
# NLTK >= 3.8.2 loads the Punkt data from the "punkt_tab" package instead of the
# pickled "punkt" one, so fetch both to keep the tokenizers working on any version.
nltk.download('punkt')
nltk.download('punkt_tab')

import pandas as pd

In [None]:
def load_debunks(path_raw="../../data/raw/"):
    """Loads a dataframe containing the debunk articles from EuvsDisinfo.

    Every ``*.json`` file directly inside ``path_raw`` is read, the records stored
    under its ``"disinfoCases"`` key are expanded into rows, and the rows of all
    files are concatenated.

    Args:
        path_raw: Directory holding the raw JSON dumps. Defaults to the location
            used by this notebook.

    Returns:
        A DataFrame with one row per unique case, a fresh 0..n-1 index, and the
        original ``id`` column renamed to ``debunk_id``.

    Raises:
        ValueError: If ``path_raw`` contains no ``.json`` file.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the row order and
    # the record kept by drop_duplicates (the first one seen) reproducible.
    for fname in sorted(os.listdir(path_raw)):
        if not fname.endswith(".json"):
            continue
        raw = pd.read_json(os.path.join(path_raw, fname))
        frames.append(pd.DataFrame(raw["disinfoCases"].tolist()))

    if not frames:
        # pd.concat([]) would raise a ValueError too, but without saying where we looked.
        raise ValueError(f"No .json files found in {path_raw!r}")

    return (
        pd.concat(frames, ignore_index=True)
        .drop_duplicates(subset="id")
        .reset_index(drop=True)
        .rename({"id": "debunk_id"}, axis=1)
    )

In [None]:
# Load the full EUvsDisinfo CSV; its `article_url` and `article_publisher` columns
# are used further down to look up the URLs extracted from the debunk texts.
df = pd.read_csv('../../data/euvsdisinfo_full.csv')
df

In [None]:
# Build the debunk table from the raw JSON dumps and display it.
debunks_df = load_debunks()
debunks_df

In [None]:
def remove_punctuation(input_string):
    """Return ``input_string`` with every ASCII punctuation character deleted.

    Characters are removed, not replaced by spaces, so "pro-kremlin" becomes
    "prokremlin". Non-ASCII punctuation (curly quotes, dashes, ...) is left alone.
    """
    # A translation table mapping a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return input_string.translate(deletions)

def html_to_text(html_string):
    """Parse ``html_string`` with the built-in HTML parser and return only its text."""
    return BeautifulSoup(html_string, "html.parser").get_text()

def get_ngrams(text, n):
    """Tokenize ``text`` into words and return its n-grams as space-joined strings."""
    words = word_tokenize(text)
    return list(map(' '.join, ngrams(words, n)))

def get_top_ngrams(text, n, top_n):
    """Return the ``top_n`` most frequent n-grams of ``text`` as (n-gram, count) pairs."""
    return FreqDist(get_ngrams(text, n)).most_common(top_n)

# Normalise every disproof (case-fold -> strip HTML -> strip punctuation) and merge
# them into one corpus string for n-gram counting.
cleaned_disproofs = (
    debunks_df["disproof"]
    .str.casefold()
    .map(html_to_text)
    .map(remove_punctuation)
)
input_text = " ".join(cleaned_disproofs)

# 50 most frequent n-grams for n = 1..4.
top_unigrams, top_bigrams, top_trigrams, top_4grams = (
    get_top_ngrams(input_text, size, 50) for size in (1, 2, 3, 4)
)

for heading, ranking in (
    ("Top 50 Unigrams:", top_unigrams),
    ("\nTop 50 Bigrams:", top_bigrams),
    ("\nTop 50 Trigrams:", top_trigrams),
    ("\nTop 50 4-grams:", top_4grams),
):
    print(heading)
    print(ranking)


In [None]:
# Show the top 4-grams as a two-column table: the n-gram text and its count.
pd.DataFrame(top_4grams, columns=["4gram", "count"])

In [None]:
# Show the top trigrams as a two-column table: the n-gram text and its count.
pd.DataFrame(top_trigrams, columns=["3gram", "count"])

In [None]:
# Phrases used to select sentences from the disproof texts (see the filtering loop
# further down). Each rule is tested as a plain substring of a lower-cased,
# tag-stripped, punctuation-stripped sentence, which is why the rules are written
# without hyphens (e.g. "prokremlin"). Four-word phrases come first, followed by
# three-word ones.
# NOTE(review): presumably hand-picked from the top 4-gram / trigram tables above — confirm.
# NOTE(review): matching is by substring, so any four-word rule that contains one of
# the three-word rules can never change the outcome.
filter_rules = [
    "recurring prokremlin disinformation narrative",
    "prokremlin disinformation narrative about",
    "disinformation narrative about the",
    "see other examples of",
    "a recurring prokremlin disinformation",
    "this is a recurring",
    "disinformation cases alleging that",
    "similar cases claiming that",
    "prokremlin disinformation narratives about",
    "recurring prokremlin disinformation narratives",
    "read more about the",
    "read similar cases claiming",
    "is a recurring prokremlin",
    "other examples of similar",
    "recurring prokremlin narrative about",
    "a recurring prokremlin narrative",
    "a recurring disinformation narrative",
    "earlier disinformation cases alleging",
    "see earlier disinformation cases",
    "disinformation narratives about the",
    "recurring prokremlin disinformation",
    "prokremlin disinformation narrative",
    "disinformation narrative about",
    "a recurring prokremlin",
    "see other examples",
    "prokremlin disinformation narratives",
    "recurring prokremlin narrative",
    "other examples of",
    "disinformation narratives about",
    "is a recurring",
]

In [None]:
# Split each raw disproof text into a list of sentences and store the lists in a
# new `sentences` column (the HTML markup is still present at this point).
debunks_df['sentences'] = debunks_df['disproof'].map(sent_tokenize)
debunks_df['sentences']

In [None]:
# Flatten the per-debunk sentence lists into one list. explode() turns an empty
# sentence list into NaN (a float), which would crash on `.lower()` below, so those
# rows are dropped before iterating.
sentences = debunks_df["sentences"].explode().dropna().tolist()

# Keep the original (still HTML) sentence whenever its cleaned form - lower-cased,
# tags stripped, punctuation removed - contains at least one filter rule.
filtered_sentences = []
for sentence in sentences:
    clean_sentence = remove_punctuation(html_to_text(sentence.lower()))
    if any(rule in clean_sentence for rule in filter_rules):
        filtered_sentences.append(sentence)



In [None]:
# Inspect the sentences that matched at least one filter rule.
filtered_sentences

In [None]:
# Collect the link targets found in the matched sentences. `href=True` restricts
# the search to anchors that actually carry an href attribute, so the list holds
# only strings instead of None entries for bare <a> tags.
urls = [
    anchor["href"]
    for sentence in filtered_sentences
    for anchor in BeautifulSoup(sentence, "html.parser").find_all("a", href=True)
]
urls

In [None]:
# Rows of the full dataset whose article URL is one of the extracted links.
matched_urls = df.loc[df["article_url"].isin(urls)]
matched_urls

In [None]:
# The 50 publishers that occur most often among the matched rows.
matched_urls["article_publisher"].value_counts().head(50)

In [None]:
# Publisher names to keep: matched rows whose `article_publisher` is in this list
# are excluded from the removal set built in the next cell.
# NOTE(review): `isin` compares values exactly and case-sensitively — confirm that
# `article_publisher` is stored in precisely this lower-case form.
trustworthy = [
    "bbc",
    "reuters",
    "the guardian",
    "dw.com",
    "radiofreeeurope/radioliberty",
    "washington post",
    "cnn",
    "ap news",
    "euronews",
    "politico",
    "npr",
    "new york times",
    "france 24",
    "polygraph.info"
]

In [None]:
# Matched rows whose publisher is not on the trustworthy list.
to_remove_df = matched_urls.loc[~matched_urls["article_publisher"].isin(trustworthy)]
to_remove_df