# Preprocessing

In [28]:
import glob
import pickle
import pandas as pd
import re
import dateparser

In [29]:
# Combine the per-outlet pickles (one list of lists per file) into a single dataframe.
# Each pickle is indexed as [0]=urls, [1]=titles, [2]=texts, [3]=dates; the file name
# up to the first "." becomes the outlet label stored in "page".
# NOTE: pickle.load can execute arbitrary code -- only run this on trusted scrape output.
columns = ['page', 'date', 'title', 'text', 'url']
frames = []
# sorted() makes the row order reproducible across machines/filesystems
for file in sorted(glob.glob("*.pkl")):
    with open(file, "rb") as f:
        articles = pickle.load(f)
    name = file.split(".")[0]
    frames.append(pd.DataFrame({"page": name, "title": articles[1],
                                "text": articles[2], "date": articles[3],
                                "url": articles[0]}))

# Concatenate once instead of growing the frame inside the loop: DataFrame.append
# was removed in pandas 2.0 and repeated appends copy the data on every iteration.
if frames:
    df = pd.concat(frames, ignore_index=True)[columns]
else:
    # no pickles found: keep the empty, correctly-shaped frame the later cells expect
    df = pd.DataFrame(columns=columns)

### Preliminary cleanup: outlet names and dates

In [30]:
# Collapse the per-section labels (anything starting with one of these prefixes)
# into a single outlet name per news source.
for outlet in ('Canal13', '100% Noticias', 'Confidencial', 'Radio Corporacion'):
    has_prefix = df["page"].str.startswith(outlet)
    df.loc[has_prefix, "page"] = outlet

In [31]:
# For these outlets the date is taken from the yyyy/mm/dd fragment of the article URL.
# NOTE(review): if the outlet-unification cell has already run, no row is labelled
# 'Canal13_economia' any more (all 'Canal13*' labels become 'Canal13'), so the first
# entry presumably matches nothing -- confirm the intended cell order / label.
url_date_regex = r'(\d\d\d\d/\d\d/\d\d)'
for outlet in ('Canal13_economia', 'Canal6', 'Radio 800'):
    rows = df["page"] == outlet
    df.loc[rows, "date"] = df.loc[rows, "url"].str.extract(url_date_regex).to_numpy()

In [32]:
# Normalise the "date" column to datetimes, outlet by outlet.
# The parsed values are written straight back through a boolean mask on `df`
# instead of through intermediate slices (df10/df14/df100): assigning into a
# slice of `df` raises SettingWithCopyWarning and may or may not touch a copy.

# Canal10: drop the "de " connector and the English weekday names, then let pandas parse
is_canal10 = df["page"] == "Canal10"
canal10_dates = (
    df.loc[is_canal10, "date"]
    .str.replace('de ', '', regex=True)
    .str.replace(r'Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday', '', regex=True)
)
df.loc[is_canal10, "date"] = pd.to_datetime(canal10_dates).to_list()

# Canal14: parsed with dateparser, because pandas cannot handle the Spanish dates.
# 100% Noticias goes through the same parser.
for outlet in ("Canal14", "100% Noticias"):
    is_outlet = df["page"] == outlet
    df.loc[is_outlet, "date"] = df.loc[is_outlet, "date"].apply(dateparser.parse).to_list()

# canal2, canal4, confidencial, radio corporacion, radio nicaragua, radio primerissima are already in datetime format

# Convert the rest to datetime. Plain column assignment replaces the column, so it
# really ends up with a datetime dtype; `df.loc[:, "date"] = ...` writes in place
# on pandas >= 2.0 and would leave the column as object dtype.
df["date"] = pd.to_datetime(df["date"])

  date_obj = stz.localize(date_obj)


In [33]:
# Persist the combined frame to the working directory (the index is written as the first column).
output_path = "dataset.csv"
df.to_csv(output_path)

### Cleaning Boilerplate

### Descriptive statistics

In [473]:
# How many articles were collected for each outlet
articles_per_outlet = df.groupby('page').size()
articles_per_outlet

page
100% Noticias           18675
Canal10                 12332
Canal13                 45992
Canal14                  6689
Canal2                   2636
Canal4                  15650
Canal6                   5219
Confidencial             8112
Radio 800                 805
Radio Corporacion        9428
Radio Nicaragua         21610
Radio la Primerisima    14982
dtype: int64