In [5]:
import pandas as pd
from IPython.display import display
import os
import json
import spacy
from spacy.tokens import Doc
from spacy.lang.pt.examples import sentences 
from newspaper import Article, Source

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt 
from spacy.lang.pt.stop_words import STOP_WORDS

from fuzzywuzzy import fuzz

import requests
from bs4 import BeautifulSoup
import time



In [6]:
nlp = spacy.load('pt_core_news_sm') # commented out once it's loaded

punctuations = list(punctuation)

### Newspapers, stopwords, and words to filter out of keywords

In [7]:
# Newspapers to search

newsp = ['cmjornal.pt/', 
         'dn.pt/',
         'expresso.pt/',
         'folhanacional.pt/',
         'jn.pt/',
         'ionline.sapo.pt/',   
         'sol.sapo.pt/',
         'observador.pt/',
         'publico.pt/',
         'sabado.pt/',
         'sapo.pt/',
         'visao.pt/',
         ]

not_keywords = ["/multimedia", "/videos", 
                "/famosos", "/celebridades", 
                "/tecnologia", "/boa-vida", 
                "/tendencias", "/desporto",
                "/maissobre", "/ciencia",
                "/cinema", "/cultura",
                "/cidades", "/dinheiro",
                "/blogues", "/podcasts",
                "/vida", "/artes",
                "/iniciativas", "/colunistas/andre",
                "/tribuna"]

stop_words = {'comprida', 'lá', 'mesmo', 'ela', 'de', 'quero', 'vão', 'nesta', 'em', 'com',
              'tais', 'neste', 'obrigado', 'bom', 'for', 'vai', 'fazer', 'tens', 'alguns', 'todos',
              'tu', 'muito', 'estás', 'deve', 'novas', 'todo', 'foste', 'pode', 'pontos', 'pegar',
              'quieto', 'seria', 'eles', 'coisa', 'faz', 'sou', 'daquela', 'tua', 'suas', 'teu',
              'cuja', 'esses', 'vindo', 'aquele', 'um', 'certamente', 'essa', 'no', 'vens', 'também',
              'maiorias', 'fostes', 'tanta', 'elas', 'desse', 'diante', 'pelas', 'nove', 'quinta', 'maior',
              'do', 'quando', 'aqueles', 'desta', 'sabe', 'ambas', 'que', 'tentar', 'estes', 'próximo',
              'povo', 'sem', 'uns', 'tão', 'além', 'geral', 'porquanto', 'fora', 'vêm', 'tivemos',
              'depois', 'ir', 'ora', 'tarde', 'nessa', 'da', 'nós', 'ver', 'tiveram', 'tem',
              'cada', 'exemplo', 'número', 'meus', 'tuas', 'conhecido', 'cento', 'devem', 'para', 'fomos',
              'à', 'favor', 'quanto', 'teve', 'apenas', 'este', 'vais', 'mas', 'querem', 'lugar',
              'vinte', 'quê', 'após', 'apoio', 'mal', 'quarta', 'grande', 'adeus', 'pelos', 'parece',
              'dezanove', 'ligado', 'sob', 'estou', 'ao', 'final', 'talvez', 'demais', 'aos', 'agora',
              'você', 'nem', 'maioria', 'vez', 'treze', 'menor', 'temos', 'possível', 'ambos', 'não',
              'isto', 'dezoito', 'tempo', 'estas', 'área', 'nenhuma', 'fui', 'estará', 'cujo', 'faço',
              'bem', 'logo', 'quais', 'onde', 'menos', 'aquela', 'quatro', 'outras', 'pouca', 'dar',
              'dezasseis', 'último', 'valor', 'é', 'antes', 'aqui', 'mais', 'sétimo', 'dizem', 'estão',
              'veja', 'dezassete', 'ele', 'tivestes', 'tive', 'quarto', 'nova', 'bastante', 'pois', 'apoia',
              'outra', 'teus', 'caminho', 'fazeis', 'lado', 'somente', 'esteve', 'fazemos', 'nesse', 'dessa',
              'numa', 'qualquer', 'momento', 'assim', 'esse', 'boa', 'pôde', 'duas', 'longe', 'meio',
              'saber', 'és', 'ponto', 'daquele', 'estava', 'irá', 'perto', 'num', 'aí', 'umas',
              'nossas', 'falta', 'o', 'dão', 'quer', 'custa', 'tipo', 'sexto', 'vinda', 'segundo',
              'sétima', 'ter', 'seu', 'poder', 'enquanto', 'algo', 'sois', 'se', 'poderá', 'grandes',
              'naquele', 'dá', 'novos', 'tanto', 'cedo', 'vossas', 'cima', 'posição', 'era', 'na',
              'ser', 'vossos', 'seis', 'terceira', 'fim', 'onze', 'disso', 'tentei', 'iniciar', 'minha',
              'apontar', 'estiveram', 'quem', 'fará', 'até', 'sexta', 'zero', 'pouco', 'então', 'sei',
              'vossa', 'ainda', 'nuns', 'parte', 'sua', 'a', 'mês', 'acerca', 'terceiro', 'primeiro',
              'estar', 'oitava', 'grupo', 'porquê', 'contra', 'me', 'forma', 'dizer', 'sete', 'toda',
              'dois', 'portanto', 'estado', 'podia', 'vos', 'máximo', 'vocês', 'comprido', 'tenho', 'baixo',
              'somos', 'através', 'são', 'quinto', 'nada', 'segunda', 'outros', 'sempre', 'vosso', 'certeza',
              'debaixo', 'oitavo', 'lhe', 'dos', 'mil', 'têm', 'desde', 'nível', 'nos', 'usar',
              'primeira', 'fazes', 'fazia', 'tiveste', 'deste', 'corrente', 'oito', 'vós', 'doze', 'das',
              'sistema', 'inclusive', 'já', 'vem', 'contudo', 'minhas', 'cinco', 'estivestes', 'naquela', 'nas',
              'vários', 'tal', 'fazem', 'tentaram', 'posso', 'estiveste', 'direita', 'isso', 'embora', 'pela',
              'uma', 'breve', 'nossa', 'números', 'nossos', 'essas', 'dentro', 'deverá', 'partir', 'próxima',
              'eventual', 'conselho', 'às', 'esta', 'foi', 'só', 'te', 'possivelmente', 'obrigada', 'relação',
              'meses', 'três', 'põem', 'quieta', 'fez', 'ali', 'aquilo', 'sobre', 'qual', 'atrás',
              'estive', 'está', 'sim', 'estivemos', 'todas', 'cá', 'ou', 'muitos', 'porque', 'foram',
              'algumas', 'pelo', 'por', 'tente', 'questão', 'nosso', 'local', 'novo', 'nunca', 'como',
              'meu', 'tudo', 'porém', 'as', 'vezes', 'usa', 'os', 'diz', 'e', 'dez',
              'ademais', 'seus', 'conhecida', 'ontem', 'quinze', 'põe', 'des', 'entre', 'eu', 'catorze',
              'próprio', 'tendes', 'aquelas', 'podem', 'inicio', 'puderam', 'de o', 'correio', ' ', 'manhã',
              'a o', 'comentário', 'conteúdo', 'espaço', 'leitor', '', '”','“'}

In [8]:
def extract_data_from_json(folder_path):
    """
    Extracts 'url' and 'timestamp' from all JSON files in the specified folder and stores them in a DataFrame.
    
    :param folder_path: Path to the folder containing JSON files.
    :return: A pandas DataFrame with columns 'url' and 'timestamp'.
    """
    data = []

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a JSON file
        if file_name.endswith('.json'):
            file_path = os.path.join(folder_path, file_name)
            # Open and load the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    json_data = json.load(file)
                    # Check if the data is a list of dictionaries
                    if isinstance(json_data, list):
                        for entry in json_data:
                            # Extract 'url' and 'timestamp' if available
                            if 'url' in entry and 'timestamp' in entry:
                                data.append({
                                    'url': entry['url'],
                                    'timestamp': entry['timestamp']
                                })
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON file {file_name}: {e}")

    # Convert collected data into a DataFrame
    df = pd.DataFrame(data, columns=['url', 'timestamp'])
    return df


In [9]:
df = extract_data_from_json("/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/cdx_results_json_files/data")
# df = extract_data_from_json("/Users/hannes_seidl/Desktop/UA_Aveiro/WS24:25/FDS/FDS_Project/cdx_results_json_files/data") # Hannes filepath

In [10]:
# Use this command to avoid running the code above
df = pd.read_csv('full_dataset.csv')

In [11]:
def filter_links(df, keywords):
    """
    Filters out rows from the DataFrame by assessing each keyword individually.
    Returns the result as a dictionary.
    
    :param df: A pandas DataFrame with at least 'url' and 'timestamp' columns.
    :param keywords: A list of keywords to filter out.
    :return: A dictionary with 'url' and 'timestamp' for rows that do not contain any of the keywords.
    """
    filtered_df = df.copy()

    # Iterate over each keyword and filter rows
    for keyword in keywords:
        filtered_df = filtered_df[~filtered_df['url'].str.contains(keyword, case=False, na=False)]
    
    # Convert the filtered DataFrame to a dictionary
    result_dict = pd.DataFrame(filtered_df)
    return result_dict


In [12]:
# Removing all the unwanted links, by filtering out by tags in the links
df_clean = filter_links(df, not_keywords)

In [13]:
# Function to process the title
def title_input(df): 
    processed_texts = []  # Initialize inside the function

    # Extract the last part of each URL, handling NaN values
    last_part = df['url'].str.rsplit('/').str[-1]  # Extract the last part of the URL
    last_part = last_part.fillna("")  # Replace NaN values with an empty string

    # Handle cases where the last part is empty
    last_part[last_part == ""] = df['url'].str.rsplit('/').str[-2].fillna("")  # Use the second-to-last part if last is empty
    
    for part in last_part:
        if "-" in part:
            parts = part.rsplit('-')[0:-1]  # Split by '-' and remove the last element
            sentence = " ".join(parts)  # Join parts to form a sentence
            processed_sentence = nlp(sentence)  # Process with SpaCy
            processed_texts.append(" ".join(token.text for token in processed_sentence))
        else:
            processed_texts.append("")  # Append an empty string if no processing was done

    # Use .loc to avoid SettingWithCopyWarning
    df = df.copy()  # Create a copy to avoid SettingWithCopyWarning if df is a slice
    df.loc[:, 'processed_url_text'] = processed_texts

    # Update the original DataFrame dictionary with the filtered DataFrame
    df_updated = df
    
    return df

### Applying the function title_input to df_clean, in order to have the titles as strings for analysis

In [14]:
df_clean = title_input(df_clean)

In [15]:
# Function to find the newspaper name in the URL
def find_newspaper(url):
    for newspaper in newsp:
        if newspaper in url:
            return newspaper
    return None  # Return None if no newspaper is found

In [16]:
df_clean['newspaper'] = df_clean['url'].apply(find_newspaper)

In [17]:
df_clean.head()

Unnamed: 0,url,timestamp,processed_url_text,newspaper
0,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200426174855,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
1,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200427174619,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
2,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200428171922,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
3,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200429172316,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
4,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200430181128,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/


### Function to create 'arquivo' links, to then store in specific columns

In [18]:
def combine_url_and_timestamp(row):
    """
    Combines the base URL with the 'url' and 'timestamp' columns from a DataFrame row.
    
    :param row: A row from a pandas DataFrame.
    :return: A combined URL string.
    """
    base_url = "https://arquivo.pt/noFrame/replay/"
    return f"{base_url}{row['timestamp']}id_/{row['url']}"

In [19]:
df_clean['arquivo_url'] = df_clean.apply(combine_url_and_timestamp, axis=1)

In [20]:
# Save the DataFrame to a CSV file, stored with arquivo link
df_clean.to_csv('full_ds_title_clean.csv', index=False, encoding='utf-8')

In [21]:
# Function to filter the dataframe, for links with "chega" and "andre ventura"
def filter_dataframe(df, text_column="processed_url_text"):
    # List to keep track of row indices that meet the criteria
    indices_to_keep = []

    # Iterate over each row in the DataFrame to access both the index and text
    for index, row in df.iterrows():
        text = row[text_column]
        
        # Skip if the text is NaN
        if pd.isna(text):
            continue
        
        doc = nlp(text)

        # Check if "chega" appears as a noun in the document
        is_chega_noun = any(token.text.lower() == "chega" and token.pos_ == "NOUN" for token in doc)

        # Check if both "andre" and "ventura" appear in the document
        contains_andre_ventura = "andre" in text.lower() and "ventura" in text.lower()

        # If either condition is met, keep the row index
        if is_chega_noun or contains_andre_ventura:
            indices_to_keep.append(index)

    # Filter the DataFrame to only include rows that meet the criteria
    df = df.loc[indices_to_keep].reset_index(drop=True)
    
    return df


### Applying the function to assess the stored title to distinguish between 'chega' noun and 'chega' verb

In [22]:
df_work = filter_dataframe(df_clean)

In [23]:
# Save the DataFrame to a CSV file
df_work.to_csv('working_dataset.csv', index=False, encoding='utf-8')

In [24]:
# Start from this point
df = pd.read_csv('working_dataset.csv')

### Functions to scrape links (url and arquivo url in case is needed) using newspaper package

In [25]:
# Function to scrape and parse an article
def scrape_article(row):
    """
    Fetches and parses the article content from the given URL.

    :param url: URL of the article.
    :return: A dictionary with the article's title, authors, publish date, and text.
    """
    article = Article(row['url'])
    article.download()  # Download the article content
    article.parse()     # Parse the downloaded content

    return {
        "title": article.title,
        "authors": article.authors,
        "publish_date": article.publish_date,
        "text": article.text
    }

#### Troubleshooting a link that is not read using newspaper package. Likely need to use beautiful soup in these cases

In [26]:
# Function to scrape and parse an article
def scrape_article2(str):
    """
    Fetches and parses the article content from the given URL.

    :param url: URL of the article.
    :return: A dictionary with the article's title, authors, publish date, and text.
    """
    article = Article(str)
    article.download()  # Download the article content
    article.parse()     # Parse the downloaded content

    return {
        "title": article.title,
        "authors": article.authors,
        "publish_date": article.publish_date,
        "text": article.text
    }

In [27]:
import requests
from bs4 import BeautifulSoup
import time

def get_paragraphs(url):
    """
    Fetches and retrieves all <p> elements from a newspaper page using BeautifulSoup.

    :param url: URL of the newspaper page.
    :return: A list of text content from all <p> elements on the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        # Send an HTTP GET request to the newspaper URL
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses
        
        # Introduce a small delay to ensure content is fully loaded
        time.sleep(2)

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'lxml')

        # Extract all <p> elements from the entire page
        paragraphs = soup.find_all('p')
        paragraph_texts = [p.get_text(strip=True) for p in paragraphs]

        return paragraph_texts

    except requests.RequestException as e:
        print(f"Error fetching the page at {url}: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred for URL {url}: {e}")
        return []

#Example usage:
url = "https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html"
paragraphs = get_paragraphs(url)
for para in paragraphs:
    print(para)


In [28]:
paragraphs

[]

In [29]:
def scrape_archive_page(row):
    """
    Fetches and parses content from an archived web page, ensuring the final archived URL is used.
    
    :param url: URL of the archived page.
    :param timestamp: Timestamp of the archived URL (from the Arquivo.pt dataset).
    :return: A dictionary with the article's title, authors, publish date, and text.
    """
    try:
        # Set up Selenium WebDriver
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # Run in headless mode (no GUI)
        options.add_argument("--disable-gpu")  # Disable GPU for performance
        options.add_argument("--no-sandbox")  # Required for some server environments
        options.add_argument("--blink-settings=imagesEnabled=false")  # Disable images
        options.page_load_strategy = "eager"  # Stop loading after DOMContentLoaded

        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(30)  # Increase page load timeout to 30 seconds

        # Load the initial URL
        driver.get(row['url'])

        # Wait for the page to load fully
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))  # Wait up to 20 seconds
        )

        # Get the final archived URL after all redirects
        final_archived_url = driver.current_url
        print(f"Final archived URL: {final_archived_url}")

        # If the final URL points to a live site, reconstruct the Arquivo.pt link
        if "arquivo.pt" not in final_archived_url:
            final_archived_url = f"https://arquivo.pt/noFrame/replay/{row['timestamp']}/{final_archived_url}"
            print(f"Reconstructed archived URL: {final_archived_url}")

        # Close the driver
        driver.quit()

        # Use newspaper3k to extract content from the reconstructed archived URL
        article = Article(final_archived_url)
        article.download()  # Download content from the final archived URL
        article.parse()  # Parse the downloaded content

        return {
            "title": article.title,
            "authors": article.authors,
            "publish_date": article.publish_date,
            "text": article.text
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return {
            "title": None,
            "authors": None,
            "publish_date": None,
            "text": None
        }

### Function using the article and arquivo link, and organized to avoid scraping the same article multiple times

In [30]:
def scrape_with_duplicates_handling(df):
    """
    Scrapes data from a dataset while avoiding duplicates.
    If a row is processed successfully, subsequent duplicates are skipped.
    
    :param df: Input dataframe with 'url' and other necessary columns.
    :return: A new dataframe with the scraped data.
    """
    processed_urls = set()  # To track processed URLs
    results = []  # To store the scraped results
    
    for _, row in df.iterrows():
        url = row['url']  # Extract the URL from the current row
        
        if url in processed_urls:
            print(f"Skipping duplicate URL: {url}")
            continue  # Skip duplicates
        
        # Attempt to scrape the data
        try:
            scraped_data = scrape_article(row)  # Try live article scraping
            if not scraped_data['title']:  # If scrape_article fails to retrieve meaningful data
                raise ValueError("Empty title from scrape_article")
        except Exception as e:
            print(f"scrape_article failed for URL {url}: {e}")
            try:
                scraped_data = scrape_archive_page(row)  # Fall back to archived page scraping
            except Exception as archive_error:
                print(f"scrape_archive_page also failed for URL {url}: {archive_error}")
                scraped_data = {
                    "title": None,
                    "authors": None,
                    "publish_date": None,
                    "text": None
                }
        
        # If scraping succeeds, add URL to processed and save data
        processed_urls.add(url)
        results.append({**row.to_dict(), **scraped_data})  # Combine original row with scraped data
    
    # Create a new dataframe with the results
    return pd.DataFrame(results)

In [31]:
# Apply the function to the dataset
new_df = scrape_with_duplicates_handling(df)

# Save the new dataset to a file (optional)
new_df.to_csv("scraped_data.csv", index=False)

scraped_df = new_df = pd.read_csv('scraped_data.csv')

scraped_df.head()

Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/andre-ventura-do-chega-quer-camaras-nas-fardas-dos-policias
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovada-proposta-do-chega-para-divulgar-financiamentos-publicos-a-fundacoes-e-associacoes
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovado-voto-do-chega-para-condenar-agressoes-a-professora-e-assistente-em-setubal
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovado-voto-do-chega-para-condenar-agressoes-a-professora-e-assistente-em-setubal
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovado-voto-do-chega-para-condenar-agressoes-a-professora-e-assistente-em-setubal
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovado-voto-do-chega-para-condenar-agressoes-a-professora-e-assistente-em-setubal
Skipping duplicate URL: https://www.cmjornal.pt/politica/amp/aprovado-voto-do-chega-para-condenar-agressoes-a-professora-e-assistente-em-setubal
Sk

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x000000010072fac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000100728314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001001904b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001001898c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x000000010017b9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x000000010017d3b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x000000010017bda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x000000010017b604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000102b27ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000102b20314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001025884b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001025818c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001025739f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001025753b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x0000000102573da4 cxxbridge1$string$len + 5496
7   chromedriver                        0x0000000102573604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000104673ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x000000010466c314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001040d44b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001040cd8c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001040bf9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001040c13b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x00000001040bfda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x00000001040bf604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000103593ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x000000010358c314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x0000000102ff44b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000102fed8c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x0000000102fdf9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x0000000102fe13b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x0000000102fdfda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x0000000102fdf604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x000000010548bac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000105484314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x0000000104eec4b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104ee58c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x0000000104ed79f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x0000000104ed93b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x0000000104ed7da4 cxxbridge1$string$len + 5496
7   chromedriver                        0x0000000104ed7604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000102a5bac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000102a54314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001024bc4b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001024b58c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001024a79f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001024a93b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x00000001024a7da4 cxxbridge1$string$len + 5496
7   chromedriver                        0x00000001024a7604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x000000010529fac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000105298314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x0000000104d004b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104cf98c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x0000000104ceb9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x0000000104ced3b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x0000000104cebda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x0000000104ceb604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x000000010076fac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000100768314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001001d04b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001001c98c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001001bb9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001001bd3b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x00000001001bbda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x00000001001bb604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Final archived URL: https://www.sapo.pt/noticias/atualidade/hoje-comeca-um-novo-dia-para-o-chega-andre_6150bf9cd1bccf29fd85374b
Reconstructed archived URL: https://arquivo.pt/noFrame/replay/20210926203554/https://www.sapo.pt/noticias/atualidade/hoje-comeca-um-novo-dia-para-o-chega-andre_6150bf9cd1bccf29fd85374b
Skipping duplicate URL: https://www.sapo.pt/amp/noticias/atualidade/ii-convencao-nacional-do-chega-entre-19-e-20-_5f29a6f3b10aaa4517e9669d
scrape_article failed for URL https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791: Article `download()` failed with HTTPSConnectionPool(host='www.sapo.pt', port=443): Max retries exceeded with url: /amp/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x12371d220>, 'Connection to www.sapo.pt timed out. (connect timeout=7)')) on URL https://www.sapo.pt/amp/noticias

There was an error managing chromedriver (error decoding response body); using driver found in the cache


Final archived URL: https://www.sapo.pt/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791
Reconstructed archived URL: https://arquivo.pt/noFrame/replay/20201226220018/https://www.sapo.pt/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791
Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Article `download()` failed with HTTPSConnectionPool(host='arquivo.pt', port=443): Read timed out. (read timeout=7) on URL https://arquivo.pt/noFrame/replay/20201226220018/https://www.sapo.pt/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791
Skipping duplicate URL: https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-acusa-costa-de-imitar-marcelo-_5fe73bc0353f6d5d07541791
scrape_article failed for URL https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-diz-que-orcamento-do-est

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000101383ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x000000010137c314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x0000000100de44b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000100ddd8c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x0000000100dcf9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x0000000100dd13b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x0000000100dcfda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x0000000100dcf604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Final archived URL: https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-diz-que-portugal-nao-pode_5f8b94a75deed5242bc4a0d4
Reconstructed archived URL: https://arquivo.pt/noFrame/replay/20201019001416/https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-diz-que-portugal-nao-pode_5f8b94a75deed5242bc4a0d4
Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Article `download()` failed with HTTPSConnectionPool(host='arquivo.pt', port=443): Max retries exceeded with url: /noFrame/replay/20201019001416/https://www.sapo.pt/amp/noticias/atualidade/lider-do-chega-diz-que-portugal-nao-pode_5f8b94a75deed5242bc4a0d4 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x123733dc0>: Failed to resolve 'arquivo.pt' ([Errno 8] nodename nor servname provided, or not known)")) on URL https://arquivo.pt/noFrame/replay/20201019001416/https://www.sapo.pt/amp/noticias/atualidade

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x0000000100fabac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000100fa4314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x0000000100a0c4b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000100a058c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001009f79f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001009f93b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x00000001009f7da4 cxxbridge1$string$len + 5496
7   chromedriver                        0x00000001009f7604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x00000001047d3ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x00000001047cc314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042344b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x000000010422d8c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x000000010421f9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001042213b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x000000010421fda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x000000010421f604 cxxbridge1$string$len + 3544
8   chromedriver                        0

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


Error processing URL https://www.jn.pt/nacional/costa-responde-a-cavaco-perigoso-e-a-direita-democratica-condicionar-se-pelo-chega-15853782.html: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=131.0.6778.85)
Stacktrace:
0   chromedriver                        0x000000010277fac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000102778314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001021e04b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x00000001021d98c8 cxxbridge1$string$len + 61596
4   chromedriver                        0x00000001021cb9f8 cxxbridge1$string$len + 4556
5   chromedriver                        0x00000001021cd3b0 cxxbridge1$string$len + 11140
6   chromedriver                        0x00000001021cbda4 cxxbridge1$string$len + 5496
7   chromedriver                        0x00000001021cb604 cxxbridge1$string$len + 3544
8   chromedriver                        0

Unnamed: 0,url,timestamp,processed_url_text,newspaper,arquivo_url,title,authors,publish_date,text
0,https://www.cmjornal.pt/politica/amp/andre-ven...,20200124190851,andre ventura do chega quer camaras nas fardas...,cmjornal.pt/,https://arquivo.pt/noFrame/replay/202001241908...,André Ventura do Chega quer câmaras nas fardas...,[],,O partido Chega apresentou um projeto de resol...
1,https://www.cmjornal.pt/politica/amp/aprovada-...,20200203181044,aprovada proposta do chega para divulgar finan...,cmjornal.pt/,https://arquivo.pt/noFrame/replay/202002031810...,Aprovada proposta do Chega para divulgar finan...,[],,A proposta do Chega para que seja pública a li...
2,https://www.cmjornal.pt/politica/amp/aprovado-...,20200206190139,aprovado voto do chega para condenar agressoes...,cmjornal.pt/,https://arquivo.pt/noFrame/replay/202002061901...,Aprovado voto do Chega para condenar agressões...,[],,O parlamento aprovou esta quinta-feira um voto...
3,https://www.cmjornal.pt/politica/amp/be-diz-qu...,20200205010909,be diz que nao ha gente seria na bancada do ch...,cmjornal.pt/,https://arquivo.pt/noFrame/replay/202002050109...,,,,
4,https://www.cmjornal.pt/politica/amp/comissao-...,20200227012044,comissao diz que projeto de castracao quimica ...,cmjornal.pt/,https://arquivo.pt/noFrame/replay/202002270120...,Comissão diz que projeto de castração química ...,[],,A Comissão de Assuntos Constitucionais decidiu...


### Function to remove instances that have a similarity above 50 %, using the fuzzy_wuzzy package

This came as a need to remove several links with very short text that were jeoperdizing the EDA

In [32]:
# To store removed instances
removed_instances = []

# Function to find and remove similar rows
def remove_similar_entries(df, threshold=50):
    indices_to_remove = set()
    for i, text1 in enumerate(df['text']):
        for j, text2 in enumerate(df['text']):
            if i != j and j not in indices_to_remove:
                similarity = fuzz.ratio(str(text1), str(text2))
                if similarity >= threshold:
                    indices_to_remove.add(j)
                    removed_instances.append({"Index": j, "Text": text2, "Similarity (%)": similarity})
    # Drop the rows from the DataFrame
    df = df.drop(list(indices_to_remove))
    return df

# Apply the function
cleaned_df = remove_similar_entries(new_df)
print(cleaned_df)

# Save the cleaned dataset
cleaned_df.to_csv("cleaned_dataset.csv", index=False)

# Print removed instances
removed_df = pd.DataFrame(removed_instances)
#removed_instances_df.to_csv("removed_instances.csv", index=False)

removed_df.drop_duplicates(subset=['Text'])

In [None]:
removed_df = pd.read_csv('/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/archive/removed_instances.csv')

removed_df.drop_duplicates(subset=['Text'])

# Assuming 'Index' is a column in removed_df and contains the indices you need from scraped_df
indices_to_retrieve = removed_df['Index']

# Retrieve the rows from scraped_df using the indices in removed_df
retrieved_df = scraped_df.loc[indices_to_retrieve]

# Reset index if needed
retrieved_df.reset_index(drop=True, inplace=True)

retrieved_df.to_csv("retrieved_dataset.csv", index=False)

In [None]:
# Start from this point
cleaned_df = pd.read_csv('cleaned_dataset.csv')

removed_df = pd.read_csv('removed_instances.csv')
print(removed_df)

In [None]:
combined_df = pd.concat([cleaned_df, retrieved_df], ignore_index=True)

In [None]:
combined_df.to_csv("working_dataset.csv", index=False)


### Lemmatization of the final dataframe with scraped reviews

In [None]:
# Empty dictionary to introduce the tokens
lemmas_dict = {}

# Loop through each review along with its index
for index, row in cleaned_df.iterrows():
    text = row['text']  # Access the 'text' column
    if text is None or not isinstance(text, str):
        print(f"Skipping index {index} because the review is None")
        continue
    doc = nlp(text)
    lemmas = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in doc ]
    lemmas = [ word for word in lemmas if word not in stop_words and word not in punctuations ]
    lemmas = [word for word in lemmas if word not in {"em o", "por o", "", "“", "”"}]  # Additional filter
    lemmas_dict[index] = lemmas  # Store lemmas in the dictionary with index as key

# Convert dictionary to a series and assign as a new column in the DataFrame
combined_df['Lemmas'] = pd.Series(lemmas_dict)


In [None]:
# Ensure all values in 'Lemmas' column are lists
combined_df['Lemmas'] = combined_df['Lemmas'].apply(lambda x: x if isinstance(x, list) else [])

# Word Count
def w_counter(words):
    word_freq = Counter(words)
    common_words = word_freq.most_common()
    word_freq_dict = {"Word": [word for word, freq in common_words], "Frequency": [freq for word, freq in common_words]}
    return word_freq_dict

all_words = [word for sublist in combined_df['Lemmas'] for word in sublist]

word_freq = pd.DataFrame(w_counter(all_words))

# Select the top 10 words by frequency
for index, row in top_10_words.iterrows():
    word = row[0]  # Assuming the word is in the first column of the DataFrame
    frequency = row[1]  # Assuming the frequency is in the second column
    print(f'"{word}": {frequency}')
print(top_10_words)
# Plot the top 10 words
plt.figure(figsize=(10, 6))
plt.bar(top_10_words['Word'], top_10_words['Frequency'], color='skyblue')
plt.title("Top 10 Most Frequently Used Words")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def extract_ngrams(texts, n=2):
    """
    Extract n-grams (bigrams or trigrams) from a list of texts.
    
    Parameters:
    texts (list): List of sentences or reviews to extract n-grams from.
    n (int): Number of words in each n-gram (e.g., 2 for bigrams, 3 for trigrams).
    
    Returns:
    Counter: Frequency count of n-grams.
    """
    ngrams = []
    for doc in nlp.pipe(texts):  # Process each text in the list with SpaCy
        tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
        # ngrams += zip(*[tokens[i:] for i in range(n)])  # Create n-grams

        # This way we ensure the n grams are all of the specified length
        ngrams += [ngram for ngram in zip(*[tokens[i:] for i in range(n)]) if len(ngram) == n] 
    
    return Counter(ngrams)

# Example usage with bigrams and trigrams

combined_df['Lemma_String'] = combined_df['Lemmas'].apply(lambda lemmas: " ".join(lemmas))

# Extract bigrams
bigram_counts = extract_ngrams(combined_df['Lemma_String'], n=2)
print("Top 10 Bigrams:", bigram_counts.most_common(10))

# Extract trigrams
trigram_counts = extract_ngrams(combined_df['Lemma_String'], n=3)
print("Top 10 Trigrams:", trigram_counts.most_common(10))


In [None]:
print("Top 10 Bigrams:", bigram_counts.most_common(50))


print("Top 10 Trigrams:", trigram_counts.most_common(50))

Bear in mind that a lot of the bigrams and trigrams aren't relevant (they're associated with subscription, reading, and a lot of them have the same number of hits, which means it might make sense to remove them to see what is beneath.)

In [None]:
least_common_bigrams = sorted(bigram_counts.items(), key=lambda x: x[1])[:50]
print("Least 50 Common Bigrams:", least_common_bigrams)

# Display the least common trigrams
least_common_trigrams = sorted(trigram_counts.items(), key=lambda x: x[1])[:50]
print("Least 50 Common Trigrams:", least_common_trigrams)