## Initial analysis of the collected data (number of counts, period for the timestamps)

In [3]:
import pandas as pd
from IPython.display import display
import spacy
from spacy.tokens import Doc
#spacy.cli.download("pt_core_news_sm")
from spacy.lang.pt.examples import sentences 


In [4]:
nlp = spacy.load('pt_core_news_sm')

## Count the number of occurrences of each entry and remove the duplicates

In [5]:
# Create a DataFrame to store the data analysed

df = pd.DataFrame({
    "Newspaper": [],
    "Number of occurences": [],
    "Period": []
    })


# Newspapers to search

newsp = ['cmjornal.pt/', 
         'dn.pt/',
         'expresso.pt/',
         'folhanacional.pt/',
         'jn.pt/',
         'ionline.sapo.pt/',   
         'sol.sapo.pt/',
         'observador.pt/',
         'publico.pt/',
         'sabado.pt/',
         'sapo.pt/',
         'visao.pt/',
         ]

# Years of the analysis

years = ['2019', '2020-2021', '2022-2024']

# Counting the number of occurrences for each of the newspapers in the defined periods

for y in years:
    with open("/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/cdx_results_json_files/cdx_results_"+y+".json", "r") as f:
        data = f.read()
        for i in newsp:
                total = data.count(i)
                df.loc[len(df)] = [i, total, y]

# Count the number of occurrences of each item in the URL column and save it to the dataframe w/o the timestamp

yearly_data = {}
yearly_data_no_dupl = {}
df_no_dupl = {}

for y in years:
    # Read the dataframe, count the number of URLs, merge the count to origina DF and then remove all duplicates based on URL
    df = pd.read_json("/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/cdx_results_json_files/cdx_results_"+y+".json")

    # Count the repeated urls in each of the .json files
    url_count = df.groupby(df['url']).size().reset_index(name = 'url count')

    # Merge the column (list) of counted values per url into the original DF
    yearly_data[f"df_{y}"] = df.merge(url_count, on = 'url', how = 'left')
    
    # Add the dataframes into a new dictionary
    yearly_data_no_dupl[f"df_{y}"] = yearly_data[f"df_{y}"].drop_duplicates(subset = ['url'], keep = 'first')

yearly_data_no_dupl['df_2019'].tail()

Unnamed: 0,url,timestamp,status,url count
13877,https://www.sapo.pt/prime/article/fc-porto-che...,1970-08-22 16:33:44.171433,200,127
13878,https://www.sapo.pt/prime/article/fc-porto-che...,1970-08-22 16:33:44.175956,200,1
13895,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.011304,200,2
13896,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.011308,200,2
13898,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.021135,200,2


## Separate the title from the rest of the URL to enable text analysis

In [6]:
# Testing the methods for splitting text w/in URL, and analyzing the word 'chega'

x = "http://www.sapo.pt/prime/article/fc-porto-chega-ao-classico-na-luz-ensombrado-_5d60e7d99474b37d1ce81d86//"

x1 = "https://www.cmjornal.pt/famosos/amp/irma-de-luciana-abreu-chega-de-fazer-sofrer-a-nossa-mae"

x2 = "O presidente chega ao local."

doc = nlp(x2)

y_sent = []

# The method rsplit splits the url string taking "-" as the delimiter. [1:-1] removes the first and last instance

last_part = x1.rsplit('/')
print(last_part)

for i in last_part:
    if "-" in i:
        y = i.rsplit('-')[0:-1]
        print(y)
        y_sent = nlp(" ".join(y))

for token in nlp.get_pipe("morphologizer")(y_sent):
    print(f"Word: {token.text}, POS: {token.pos_}, Detailed Tag: {token.tag_}")

for token in nlp.get_pipe("morphologizer")(doc):
    print(f"Word: {token.text}, POS: {token.pos_}, Detailed Tag: {token.tag_}")

['https:', '', 'www.cmjornal.pt', 'famosos', 'amp', 'irma-de-luciana-abreu-chega-de-fazer-sofrer-a-nossa-mae']
['irma', 'de', 'luciana', 'abreu', 'chega', 'de', 'fazer', 'sofrer', 'a', 'nossa']
Word: irma, POS: VERB, Detailed Tag: VERB
Word: de, POS: ADP, Detailed Tag: ADP
Word: luciana, POS: NOUN, Detailed Tag: NOUN
Word: abreu, POS: VERB, Detailed Tag: VERB
Word: chega, POS: NOUN, Detailed Tag: NOUN
Word: de, POS: SCONJ, Detailed Tag: SCONJ
Word: fazer, POS: VERB, Detailed Tag: VERB
Word: sofrer, POS: VERB, Detailed Tag: VERB
Word: a, POS: DET, Detailed Tag: DET
Word: nossa, POS: DET, Detailed Tag: DET
Word: O, POS: DET, Detailed Tag: DET
Word: presidente, POS: NOUN, Detailed Tag: NOUN
Word: chega, POS: VERB, Detailed Tag: VERB
Word: ao, POS: ADP, Detailed Tag: ADP
Word: local, POS: NOUN, Detailed Tag: NOUN
Word: ., POS: PUNCT, Detailed Tag: PUNCT


In [None]:
# Function to process the title
def title_input(df, year): 
    processed_texts = []  # Initialize inside the function

    # Extract the last part of each URL and process it with SpaCy
    last_part = df['url'].str.rsplit('/').str[-1]
    for part in last_part:
        if "-" in part:
            parts = part.rsplit('-')[0:-1]  # Split by '-' and remove the last element
            sentence = " ".join(parts)  # Join parts to form a sentence
            processed_sentence = nlp(sentence)  # Process with SpaCy
            processed_texts.append(" ".join(token.text for token in processed_sentence))
        else:
            processed_texts.append("")  # Append an empty string if no processing was done

    # Use .loc to avoid SettingWithCopyWarning
    df = df.copy()  # Create a copy to avoid SettingWithCopyWarning if df is a slice
    df.loc[:, 'processed_url_text'] = processed_texts
    
    # Drop duplicates based on the 'processed_url_text' column within the df DataFrame
    df = df.drop_duplicates(subset=['processed_url_text'], keep='first').reset_index(drop=True)
    
    # Update the original DataFrame dictionary with the filtered DataFrame
    yearly_data_no_dupl[f"df_{year}"] = df
    
    return df


In [None]:
# Function to filter the dataframe, for links with "chega" and "andre ventura"
def filter_dataframe(df, text_column="processed_url_text"):
    # List to keep track of row indices that meet the criteria
    indices_to_keep = []

    # Iterate over each row in the DataFrame to access both the index and text
    for index, row in df.iterrows():
        text = row[text_column]
        
        # Skip if the text is NaN
        if pd.isna(text):
            continue
        
        doc = nlp(text)

        # Check if "chega" appears as a noun in the document
        is_chega_noun = any(token.text.lower() == "chega" and token.pos_ == "NOUN" for token in doc)

        # Check if both "andre" and "ventura" appear in the document
        contains_andre_ventura = "andre" in text.lower() and "ventura" in text.lower()

        # If either condition is met, keep the row index
        if is_chega_noun or contains_andre_ventura:
            indices_to_keep.append(index)

    # Filter the DataFrame to only include rows that meet the criteria
    df = df.loc[indices_to_keep].reset_index(drop=True)
    
    return df


In [None]:
# Create a dictionary to store the filtered dataframes

filtered_dfs = {}
# Apply the title_input and filter_dataframe functions
for y in years:
    yearly_data_no_dupl[f"df_{y}"] = title_input(yearly_data_no_dupl[f"df_{y}"], y)
    filtered_dfs[f"filtered_df_{y}"] = filter_dataframe(yearly_data_no_dupl[f"df_{y}"])
    filtered_dfs[f"filtered_df_{y}"] = filtered_dfs[f"filtered_df_{y}"].rename(columns={"processed_url_text": "title"})

print(filtered_dfs['filtered_df_2019'])

In [13]:
# Function to find the newspaper name in the URL
def find_newspaper(url):
    for newspaper in newsp:
        if newspaper in url:
            return newspaper
    return None  # Return None if no newspaper is found

# Create the 'newspaper' column
for y in years:
    filtered_dfs[f"filtered_df_{y}"]['newspaper'] = filtered_dfs[f"filtered_df_{y}"]['url'].apply(find_newspaper)

filtered_dfs['filtered_df_2019']

Unnamed: 0,url,timestamp,status,url count,title,newspaper
0,https://www.cmjornal.pt/famosos/amp/irma-de-lu...,1970-08-22 16:23:46.194334,200,1,irma de luciana abreu chega de fazer sofrer a ...,cmjornal.pt/
1,https://www.cmjornal.pt/famosos/detalhe/irma-d...,1970-08-22 16:23:46.204109,200,1,irma de luciana abreu chega de fazer sofrer a ...,cmjornal.pt/
2,https://www.cmjornal.pt/opiniao/colunistas/edu...,1970-08-22 16:36:58.180706,200,2,rap promotor do chega de,cmjornal.pt/
3,https://www.cmjornal.pt/politica/amp/andre-ven...,1970-08-22 16:37:08.191131,200,6,andre ventura diz chega vai impedir extrema di...,cmjornal.pt/
4,https://www.cmjornal.pt/politica/amp/andre-ven...,1970-08-22 16:37:06.194953,200,2,andre ventura do chega nao vai a posse do gove...,cmjornal.pt/
...,...,...,...,...,...,...
83,https://ionline.sapo.pt/artigo/679333/projeto-...,1970-08-22 16:40:07.181559,200,1,projeto de castracao quimica do chega ja foi e...,ionline.sapo.pt/
84,https://www.sapo.pt/noticias/amp/nacional/depu...,1970-08-22 16:38:41.195155,200,1,deputado do chega e recebido com aplausos,sapo.pt/
85,https://www.sapo.pt/noticias/motores/skoda-kos...,1970-08-22 16:22:04.183553,200,3,skoda kosmiq o suv urbano da skoda chega,sapo.pt/
86,https://www.sapo.pt/noticias/nacional/listas-d...,1970-08-22 16:33:25.172133,200,1,listas do chega vao integrar elementos,sapo.pt/
