## Initial analysis of the collected data (occurrence counts per newspaper and the period covered by the timestamps)

In [5]:
import re
from pathlib import Path

import pandas as pd
import spacy
from IPython.display import display
from spacy.lang.pt.examples import sentences
from spacy.tokens import Doc
#spacy.cli.download("pt_core_news_sm")


In [6]:
# Load the Portuguese spaCy pipeline once; the resulting `nlp` object is reused
# by the URL-title analysis further down. The 'pt_core_news_sm' package must
# already be installed (the import cell keeps a commented-out
# spacy.cli.download call for it).
nlp = spacy.load('pt_core_news_sm')

## Count the number of occurrences of each entry and remove the duplicates

In [7]:
# Build (1) a per-newspaper / per-period occurrence summary from the raw CDX
# result files and (2) one DataFrame per period annotated with how often each
# URL appears, plus a de-duplicated variant of each.

# Single place to configure where the cdx_results_<period>.json files live.
CDX_DIR = Path("/Users/joaop.cardoso/MestradoCD/FCD/FDS_Project/cdx_results_json_files")


def cdx_path(period):
    """Return the path of the CDX results file for ``period`` (e.g. ``'2019'``)."""
    return CDX_DIR / f"cdx_results_{period}.json"


def count_domain_occurrences(text, domains):
    """Count how often each domain string occurs in ``text``.

    A plain ``text.count(domain)`` credits every hit on a longer listed domain
    to its shorter suffix as well (``'ionline.sapo.pt/'`` contains
    ``'sapo.pt/'``), which inflates the shorter domain's total. Here each
    occurrence is credited only to the longest listed domain matching at that
    position, so the per-domain totals do not overlap.

    Parameters
    ----------
    text : str
        Raw text to search (the unparsed content of a results file).
    domains : list of str
        Domain strings to count.

    Returns
    -------
    dict
        Maps every entry of ``domains`` to its number of occurrences.
    """
    counts = {}
    for domain in domains:
        # Prefixes that turn this domain into a longer listed one,
        # e.g. 'ionline.' and 'sol.' for 'sapo.pt/'.
        longer_prefixes = [
            other[: -len(domain)]
            for other in domains
            if other != domain and other.endswith(domain)
        ]
        # One fixed-width negative lookbehind per prefix rejects matches that
        # are really the tail of a longer listed domain.
        pattern = "".join(f"(?<!{re.escape(prefix)})" for prefix in longer_prefixes)
        pattern += re.escape(domain)
        counts[domain] = len(re.findall(pattern, text))
    return counts


# Newspapers to search

newsp = ['cmjornal.pt/',
         'dn.pt/',
         'expresso.pt/',
         'folhanacional.pt/',
         'jn.pt/',
         'ionline.sapo.pt/',
         'sol.sapo.pt/',
         'observador.pt/',
         'publico.pt/',
         'sabado.pt/',
         'sapo.pt/',
         'visao.pt/',
         ]

# Periods of the analysis

years = ['2019', '2020-2021', '2022-2024']

# Count the occurrences of each newspaper in the raw text of every period file.
# The rows are collected first and the DataFrame is built once, instead of being
# grown row by row.

occurrence_records = []
for period in years:
    with open(cdx_path(period), "r", encoding="utf-8") as f:
        raw_text = f.read()
    hits = count_domain_occurrences(raw_text, newsp)
    occurrence_records.extend(
        # Column names (including the spelling of "occurences") are kept as-is.
        {"Newspaper": domain, "Number of occurences": hits[domain], "Period": period}
        for domain in newsp
    )

# Kept under its own name so the per-period frames loaded below cannot
# overwrite it; inspect it in a separate cell.
occurrence_summary = pd.DataFrame(
    occurrence_records,
    columns=["Newspaper", "Number of occurences", "Period"],
)

# Count how often each URL appears in every period file and keep both the full
# frame (with the count attached) and a version with one row per URL.

yearly_data = {}
yearly_data_no_dupl = {}
df_no_dupl = {}  # not used in this cell; kept in case later cells reference it

for period in years:
    # NOTE(review): the captured output for the 2019 file shows timestamps in
    # 1970, which suggests the raw timestamp values are being misinterpreted
    # when the file is read — confirm the raw format and parse it explicitly.
    df = pd.read_json(cdx_path(period))

    # Number of rows sharing each URL in this period's file.
    url_count = df.groupby('url').size().reset_index(name='url count')

    # Attach the per-URL count to every row of the original frame.
    yearly_data[f"df_{period}"] = df.merge(url_count, on='url', how='left')

    # One row per URL: keep the first row seen for each URL.
    yearly_data_no_dupl[f"df_{period}"] = yearly_data[f"df_{period}"].drop_duplicates(subset=['url'], keep='first')

yearly_data_no_dupl['df_2019'].tail()

Unnamed: 0,url,timestamp,status,url count
13877,https://www.sapo.pt/prime/article/fc-porto-che...,1970-08-22 16:33:44.171433,200,127
13878,https://www.sapo.pt/prime/article/fc-porto-che...,1970-08-22 16:33:44.175956,200,1
13895,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.011304,200,2
13896,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.011308,200,2
13898,http://www.sapo.pt/prime/article/fc-porto-cheg...,1970-08-22 16:35:02.021135,200,2


## Separate the title from the rest of the URL to enable text analysis

In [8]:
# Testing the methods for splitting text w/in URL, and analyzing the word 'chega'

x = "http://www.sapo.pt/prime/article/fc-porto-chega-ao-classico-na-luz-ensombrado-_5d60e7d99474b37d1ce81d86//"

x1 = "https://www.sapo.pt/noticias/nacional/ventura-cabeca-de-lista-do-chega-em-lisboa-e-_5d18ca3ad9048e7d170acddf"

x2 = "O presidente chega ao local."

# Break the URL into its "/"-separated segments.
last_part = x1.split('/')
print(last_part)

# A segment containing "-" is treated as the article slug. Splitting it on "-"
# and dropping the final piece ([0:-1]) removes the trailing "_<id>" token seen
# in the sample URLs while keeping every title word. If several segments contain
# "-", the last one wins.
title_words = []
for segment in last_part:
    if "-" in segment:
        title_words = segment.split('-')[0:-1]
        print(title_words)

# Run the spaCy pipeline once on the rebuilt title. This already includes the
# morphologizer, so the tokens can be read directly from the resulting Doc.
# With no hyphenated segment this yields an empty Doc instead of failing.
y_sent = nlp(" ".join(title_words))

print(y_sent)
for token in y_sent:
    print(f"Word: {token.text}, POS: {token.pos_}, Detailed Tag: {token.tag_}")


['https:', '', 'www.sapo.pt', 'noticias', 'nacional', 'ventura-cabeca-de-lista-do-chega-em-lisboa-e-_5d18ca3ad9048e7d170acddf']
['ventura', 'cabeca', 'de', 'lista', 'do', 'chega', 'em', 'lisboa', 'e']
ventura cabeca de lista do chega em lisboa e
Word: ventura, POS: NOUN, Detailed Tag: NOUN
Word: cabeca, POS: ADJ, Detailed Tag: ADJ
Word: de, POS: ADP, Detailed Tag: ADP
Word: lista, POS: NOUN, Detailed Tag: NOUN
Word: do, POS: ADP, Detailed Tag: ADP
Word: chega, POS: NOUN, Detailed Tag: NOUN
Word: em, POS: ADP, Detailed Tag: ADP
Word: lisboa, POS: PROPN, Detailed Tag: PROPN
Word: e, POS: CCONJ, Detailed Tag: CCONJ
