In [1]:
import pandas as pd
from IPython.display import display
import os
import json
import spacy
from spacy.tokens import Doc
from spacy.lang.pt.examples import sentences 
from newspaper import Article, Source

from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt 
from spacy.lang.pt.stop_words import STOP_WORDS

import requests
from bs4 import BeautifulSoup
import time

#spacy.cli.download("pt_core_news_sm")

In [2]:
# Load the small Portuguese spaCy pipeline. The model has to be installed already
# (the imports cell keeps a commented-out spacy.cli.download call for that).
nlp = spacy.load('pt_core_news_sm') # NOTE(review): this line is active; the earlier "commented out" remark did not match the code

# Characters of string.punctuation as a list
punctuations = list(punctuation)

In [3]:
# Domain fragments of the newspapers to search.
# Order matters for first-match lookups: the more specific *.sapo.pt entries
# are listed before the generic 'sapo.pt/'.
newsp = [
    'cmjornal.pt/',
    'dn.pt/',
    'expresso.pt/',
    'folhanacional.pt/',
    'jn.pt/',
    'ionline.sapo.pt/',
    'sol.sapo.pt/',
    'observador.pt/',
    'publico.pt/',
    'sabado.pt/',
    'sapo.pt/',
    'visao.pt/',
]

# URL path fragments that mark sections to exclude from the dataset.
not_keywords = [
    "/multimedia",
    "/videos",
    "/famosos",
    "/celebridades",
    "/tecnologia",
    "/boa-vida",
    "/tendencias",
    "/desporto",
    "/maissobre",
    "/ciencia",
    "/cinema",
    "/cultura",
    "/cidades",
    "/dinheiro",
    "/blogues",
    "/podcasts",
    "/vida",
    "/artes",
    "/iniciativas",
    "/colunistas/andre",
    "/tribuna",
]


In [4]:
def extract_data_from_json(folder_path):
    """
    Collect 'url' and 'timestamp' pairs from every JSON file in a folder.

    Only files whose name ends in '.json' are read. A file contributes rows
    only when its top-level value is a list; within that list, only dict
    entries that have both a 'url' and a 'timestamp' key are kept. Anything
    else (other top-level types, non-dict entries, entries missing a key) is
    skipped silently. Files that are not valid JSON are reported with a
    printed message and skipped.

    :param folder_path: Path to the folder containing JSON files.
    :return: A pandas DataFrame with columns 'url' and 'timestamp'. Rows follow
             the alphabetical order of the file names, then the order of the
             entries inside each file.
    """
    records = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON file {file_name}: {e}")
                continue

        if not isinstance(json_data, list):
            continue

        for entry in json_data:
            # Guard against non-dict entries (numbers, None, strings): the bare
            # "'url' in entry" test would raise or give a false positive on them.
            if isinstance(entry, dict) and 'url' in entry and 'timestamp' in entry:
                records.append({
                    'url': entry['url'],
                    'timestamp': entry['timestamp']
                })

    # Explicit columns keep the schema stable even when no rows were found
    return pd.DataFrame(records, columns=['url', 'timestamp'])


In [42]:
# Folder holding the CDX result JSON files. Set the CDX_JSON_DIR environment
# variable to point at another copy; the fallback is the path this notebook
# was originally run with.
cdx_json_dir = os.environ.get(
    "CDX_JSON_DIR",
    "/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/cdx_results_json_files/data",
)
df = extract_data_from_json(cdx_json_dir)

In [6]:
# Shortcut: load the dataset from 'full_dataset.csv' instead of rebuilding it
# from the JSON files with the cell above.
# NOTE(review): both cells bind `df`, so whichever ran last decides what the
# following cells see.
df = pd.read_csv('full_dataset.csv')

In [5]:
def filter_links(df, keywords):
    """
    Drop the rows whose URL contains any of the given keywords.

    Keywords are matched as plain substrings (not regular expressions) and
    without regard to case. Rows with a missing URL never match and are kept.

    :param df: A pandas DataFrame with at least a 'url' column.
    :param keywords: An iterable of substrings; a row is removed when its URL contains any of them.
    :return: A new pandas DataFrame (original index labels preserved) with the
             rows that contain none of the keywords. The input is not modified.
    """
    urls = df['url']
    excluded = pd.Series(False, index=df.index)

    # Build a single exclusion mask instead of re-filtering the frame per keyword.
    for keyword in keywords:
        # regex=False: keywords are literal path fragments, so characters such
        # as '.' or '+' must not be interpreted as pattern syntax.
        excluded = excluded | urls.str.contains(keyword, case=False, na=False, regex=False)

    return df[~excluded].copy()


In [7]:
# Remove the rows whose URL contains one of the excluded section fragments in not_keywords
df_clean = filter_links(df, not_keywords)

In [8]:
# Function to turn the slug at the end of each URL into space-separated text
def title_input(df, nlp_model=None):
    """
    Add a 'processed_url_text' column derived from the last path segment of each URL.

    For every row the last '/'-separated segment of 'url' is taken (or the
    second-to-last one when the URL ends with '/'). If that segment contains a
    '-', it is split on '-', its final piece is dropped, and the remaining
    pieces are tokenized and re-joined with single spaces. Segments without a
    '-' (and missing URLs) yield an empty string.

    NOTE(review): the final piece is dropped unconditionally, presumably to
    remove a trailing article id. For slugs that carry no such id this removes
    the last real word (the sample output in this notebook shows a title ending
    in "por") -- confirm whether that is intended.

    :param df: A pandas DataFrame with a 'url' column.
    :param nlp_model: Optional spaCy pipeline (or any callable returning objects
                      with a ``.text`` attribute). Defaults to the module-level ``nlp``.
    :return: A copy of ``df`` with the extra 'processed_url_text' column; the
             input frame is not modified.
    """
    if nlp_model is None:
        nlp_model = nlp

    # Only the token texts are needed, so use the tokenizer alone when the
    # pipeline exposes it rather than running every pipeline component.
    tokenize = getattr(nlp_model, "make_doc", nlp_model)

    url_parts = df['url'].str.rsplit('/')
    last_part = url_parts.str[-1].fillna("")
    # URLs ending in '/' have an empty last segment: fall back to the one before it
    second_to_last = url_parts.str[-2].fillna("")
    last_part = last_part.where(last_part != "", second_to_last)

    # The same slug can occur on many rows, so each distinct slug is tokenized once.
    text_by_slug = {}
    processed_texts = []
    for part in last_part:
        if "-" not in part:
            processed_texts.append("")
            continue
        if part not in text_by_slug:
            words = part.rsplit('-')[0:-1]  # split on '-' and drop the final piece
            sentence = " ".join(words)
            text_by_slug[part] = " ".join(token.text for token in tokenize(sentence))
        processed_texts.append(text_by_slug[part])

    # Work on a copy so the caller's frame (possibly a slice) is left untouched
    result = df.copy()
    result['processed_url_text'] = processed_texts
    return result

In [9]:
# Add the 'processed_url_text' column; title_input returns a new frame, which replaces df_clean
df_clean = title_input(df_clean)

In [11]:
# Function to find the newspaper name in the URL
def find_newspaper(url, newspapers=None):
    """
    Return the first newspaper fragment that occurs in the URL.

    :param url: The URL to inspect. Non-string values (e.g. NaN or None from a
                DataFrame column) are treated as "no match".
    :param newspapers: Optional iterable of domain fragments to look for, tried
                       in order. Defaults to the module-level ``newsp`` list.
    :return: The first matching fragment, or None if there is no match.
    """
    # Missing values reach this function as float NaN / None when it is used
    # with Series.apply; "fragment in url" would raise TypeError on them.
    if not isinstance(url, str):
        return None

    if newspapers is None:
        newspapers = newsp

    for newspaper in newspapers:
        if newspaper in url:
            return newspaper
    return None  # Return None if no newspaper is found

In [14]:
# Tag each row with the first newsp entry found in its URL (None when nothing matches)
df_clean['newspaper'] = df_clean['url'].apply(find_newspaper)

In [17]:
# Preview the first rows to check the new 'processed_url_text' and 'newspaper' columns
df_clean.head()

Unnamed: 0,url,timestamp,processed_url_text,newspaper
0,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200426174855,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
1,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200427174619,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
2,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200428171922,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
3,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200429172316,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/
4,https://www.cmjornal.pt/c-studio/especiais-c-s...,20200430181128,apoio domiciliario chega a 2200 pessoas por,cmjornal.pt/


In [18]:
def combine_url_and_timestamp(row):
    """
    Build the Arquivo.pt wayback address for one row.

    The result is the wayback prefix, followed by the row's 'timestamp',
    a '/' and the row's 'url'.

    :param row: A row from a pandas DataFrame (any mapping with 'timestamp' and 'url' works).
    :return: The combined URL as a string.
    """
    wayback_template = "https://arquivo.pt/wayback/" + "{}/{}"
    snapshot = row['timestamp']
    original_url = row['url']
    return wayback_template.format(snapshot, original_url)

In [19]:
# Build the Arquivo.pt wayback address for every row from its 'timestamp' and 'url'
df_clean['arquivo_url'] = df_clean.apply(combine_url_and_timestamp, axis=1)

In [5]:
# Function to filter the dataframe, for links with "chega" and "andre ventura"
def filter_dataframe(df, text_column="processed_url_text", nlp_model=None):
    """
    Keep the rows whose text mentions the party "chega" or "andre ventura".

    A row is kept when either
    * the words "andre" and "ventura" both occur in the text as whole,
      whitespace-separated words (so "alexandre" or "aventura" do not count), or
    * the pipeline tags a token whose lower-cased text is "chega" as a NOUN.

    Missing or non-string texts are dropped.

    :param df: A pandas DataFrame.
    :param text_column: Name of the column holding the text to inspect.
    :param nlp_model: Optional callable returning tokens with ``.text`` and
                      ``.pos_``. Defaults to the module-level spaCy pipeline ``nlp``.
    :return: A new DataFrame with the matching rows, in their original order,
             re-indexed from 0.
    """
    if nlp_model is None:
        nlp_model = nlp

    # The same text can occur on many rows; decide once per distinct text.
    verdict_by_text = {}
    positions_to_keep = []

    for position, text in enumerate(df[text_column]):
        # Covers NaN/None as well as any other non-text value
        if not isinstance(text, str):
            continue

        if text not in verdict_by_text:
            lowered = text.lower()
            words = set(lowered.split())
            keep = "andre" in words and "ventura" in words

            # The pipeline is only needed to tell the noun "chega" from the verb,
            # and a "chega" token can only exist if the substring is present.
            if not keep and "chega" in lowered:
                keep = any(
                    token.text.lower() == "chega" and token.pos_ == "NOUN"
                    for token in nlp_model(text)
                )
            verdict_by_text[text] = keep

        if verdict_by_text[text]:
            positions_to_keep.append(position)

    # Select by position: label-based selection would return extra rows when
    # the frame has repeated index labels.
    return df.iloc[positions_to_keep].reset_index(drop=True)


In [6]:
# Working subset: rows of df_clean that filter_dataframe keeps (mentions of "chega" / "andre ventura")
df_work = filter_dataframe(df_clean)

In [20]:
# Save the full cleaned DataFrame to 'full_ds_title_clean.csv' (UTF-8, index not written)
df_clean.to_csv('full_ds_title_clean.csv', index=False, encoding='utf-8')

In [4]:
# Restart point: reload the cleaned dataset from the CSV written by the cell above,
# so the earlier processing cells do not have to be re-run.
df_clean = pd.read_csv('full_ds_title_clean.csv')

In [None]:
# Save the working subset to 'working_dataset.csv' (UTF-8, index not written).
# NOTE(review): df_work is only bound by the filter_dataframe cell further up;
# that cell must have been run in this session.
df_work.to_csv('working_dataset.csv', index=False, encoding='utf-8')

In [9]:
# Download one article and pull out its main fields
def scrape_article(url):
    """
    Download the page at ``url`` with newspaper's Article and parse it.

    :param url: URL of the article.
    :return: A dictionary with the keys 'title', 'authors', 'publish_date'
             and 'text', taken from the parsed article.
    """
    parsed = Article(url)
    parsed.download()  # fetch the page
    parsed.parse()     # extract the fields from the fetched page

    wanted_fields = ("title", "authors", "publish_date", "text")
    return {field: getattr(parsed, field) for field in wanted_fields}

# Example usage: scrape a single article and show a short summary of it
article_url = "https://www.publico.pt/2020/12/31/politica/noticia/andre-ventura-alega-estar-impedido-campanha-alerta-risco-perda-mandato-1944739"
article_data = scrape_article(article_url)

summary_lines = [
    f"Title: {article_data['title']}",
    f"Authors: {article_data['authors']}",
    f"Publish Date: {article_data['publish_date']}",
    f"Text: {article_data['text'][:500]}",  # only the first 500 characters of the article
]
print("\n".join(summary_lines))

Title: André Ventura alega estar impedido de fazer campanha e alerta para risco de perda de mandato
Authors: ['Sofia Rodrigues']
Publish Date: 2020-12-31 00:00:00
Text: O deputado e candidato presidencial do Chega, André Ventura, interpôs junto do Supremo Tribunal Administrativo (STA) uma acção de intimação de comportamento ao presidente da Assembleia da República (AR), Eduardo Ferro Rodrigues, para que permita a suspensão do seu mandato parlamentar e a substituição por outro elemento do partido nas próximas semanas.

No processo a que o PÚBLICO teve acesso, o candidato à Presidência da República diz que está “impedido de poder exercer o seu direito a fazer cam


In [None]:
# Function to scrape the content of an Arquivo.pt link
# NOTE(review): the next cell defines scrape_article_content again; when the
# cells run in order the later definition is the one in effect. Consider
# keeping a single definition.
def scrape_article_content(url, timeout=50):
    """
    Download a page and return the text of all its <p> elements.

    :param url: Address of the page to fetch.
    :param timeout: Seconds to wait for the server, passed to requests.get (default 50).
    :return: The paragraph texts joined by single spaces, or None when the
             request fails or the page has no non-blank paragraph text.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

    # Parse the raw bytes instead of response.text: when the HTTP header names
    # no charset, requests decodes text as ISO-8859-1, which garbles accented
    # characters on UTF-8 pages. Only trust the header encoding when it was
    # actually declared; otherwise let BeautifulSoup detect it from the document.
    content_type = response.headers.get('content-type', '')
    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'lxml', from_encoding=declared_encoding)

    # Find all <p> tags and concatenate their text
    paragraphs = soup.find_all('p')
    article_text = " ".join(p.get_text() for p in paragraphs)

    # Return the article text or None if no <p> text was found
    return article_text if article_text.strip() else None

In [None]:
# Function to scrape the content of an Arquivo.pt link
# NOTE(review): scrape_article_content is also defined in the previous cell;
# this later definition is the one in effect when the cells run in order.
# Consider keeping a single definition.
def scrape_article_content(url, timeout=50):
    """
    Download a page and return the text of all its <p> elements.

    :param url: Address of the page to fetch.
    :param timeout: Seconds to wait for the server, passed to requests.get (default 50).
    :return: The paragraph texts joined by single spaces, or None when the
             request fails or the page has no non-blank paragraph text.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

    # Parse the raw bytes instead of response.text: when the HTTP header names
    # no charset, requests decodes text as ISO-8859-1, which garbles accented
    # characters on UTF-8 pages. Only trust the header encoding when it was
    # actually declared; otherwise let BeautifulSoup detect it from the document.
    content_type = response.headers.get('content-type', '')
    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'lxml', from_encoding=declared_encoding)

    # Find all <p> tags and concatenate their text
    paragraphs = soup.find_all('p')
    article_text = " ".join(p.get_text() for p in paragraphs)

    # Return the article text or None if no <p> text was found
    return article_text if article_text.strip() else None

updated_dfs = {}

# Pause after every request so the archive server is not hit in a tight loop
REQUEST_DELAY_SECONDS = 1

# NOTE(review): `years`, `filtered_dfs` and `generate_arquivo_link` are not
# defined in any cell visible in this notebook; they must exist in the session
# before this cell runs. generate_arquivo_link presumably does the same job as
# combine_url_and_timestamp -- confirm.

# Process each filtered DataFrame and update with Arquivo links and article content
for y in years:
    df = filtered_dfs[f"filtered_df_{y}"]

    # Generate the Arquivo link for each row
    df['arquivo_url'] = df.apply(generate_arquivo_link, axis=1)

    # Scrape the archived snapshot (the Arquivo link built above), not the live
    # 'url', waiting between requests to avoid overloading the server.
    article_texts = []
    for arquivo_url in df['arquivo_url']:
        article_texts.append(scrape_article_content(arquivo_url))
        time.sleep(REQUEST_DELAY_SECONDS)
    df['article_text'] = article_texts

    # Store the updated DataFrame under the same key
    updated_dfs[f"filtered_df_{y}"] = df

In [None]:
# Lemma lists per article, keyed by the DataFrame index label of the row
lemmas_dict = {}

stop_words = STOP_WORDS

articles_2020_2021 = updated_dfs['filtered_df_2020-2021']

# Iterate with the index labels (not positions): the result is assigned back as
# a Series, and pandas aligns that assignment on index labels.
for index, article_text in articles_2020_2021['article_text'].items():
    # Rows without text (None, NaN, or any non-string) cannot be processed
    if not isinstance(article_text, str):
        print(f"Skipping index {index} because the review is None")
        continue

    doc = nlp(article_text)
    lemmas = []
    for token in doc:
        # Drop punctuation and whitespace tokens via spaCy's own flags; the
        # ASCII-only `punctuations` list misses quotes/dashes such as “ ” —.
        if token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        # Skip empty strings, stop words and bare ASCII punctuation
        if lemma and lemma not in stop_words and lemma not in punctuations:
            lemmas.append(lemma)
    lemmas_dict[index] = lemmas

# Convert the dictionary to a Series (one list per row label) and attach it as
# a new column; rows that were skipped get a missing value.
articles_2020_2021['Lemmas'] = pd.Series(lemmas_dict, dtype=object)


In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it; it has no effect here.
updated_dfs['filtered_df_2020-2021']['Lemmas']
# Keep only the rows that have article text, then print the whole resulting frame
filtered_df = updated_dfs['filtered_df_2020-2021'].dropna(subset=['article_text'])
print(filtered_df)

In [None]:
# Word frequencies, most frequent first
def w_counter(words):
    """
    Count how often each word occurs.

    :param words: An iterable of hashable items (typically strings).
    :return: A dict with two parallel lists, "Word" and "Frequency", ordered
             from the most to the least frequent word.
    """
    ranked_words = []
    ranked_counts = []
    for term, count in Counter(words).most_common():
        ranked_words.append(term)
        ranked_counts.append(count)
    return {"Word": ranked_words, "Frequency": ranked_counts}

# Flatten the per-article lemma lists into one long list of words
all_words = []
for lemma_list in filtered_df['Lemmas']:
    all_words.extend(lemma_list)

# Frequency table: one row per word, most frequent first
word_counts = w_counter(all_words)
word_freq = pd.DataFrame(word_counts)

# The table is already sorted by frequency, so the first 10 rows are the top 10
top_10_words = word_freq.iloc[:10]

# Bar chart of the top 10 words
plt.figure(figsize=(10, 6))
plt.bar(top_10_words['Word'], top_10_words['Frequency'], color='skyblue')
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.title("Top 10 Most Frequently Used Words")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Print the complete word-frequency table: the row and column display limits
# are lifted only inside this with-block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(word_freq)