# Preprocessing

In [None]:
import glob
import os
import pickle
import re

import dateparser
import pandas as pd
import spacy

In [None]:
# Load the raw dataset that the following cells normalise.
raw_dataset_path = "dataset.csv"
df = pd.read_csv(raw_dataset_path)

### Preliminary steps: unify outlet names and parse dates

# Some outlets show up under several page labels: every label that starts with
# one of these names is collapsed onto the bare outlet name.
for outlet_name in ("Canal13", "100% Noticias", "Confidencial", "Radio Corporacion"):
    has_outlet_prefix = df["page"].str.startswith(outlet_name)
    df.loc[has_outlet_prefix, "page"] = outlet_name

# For canal13, canal 6 and radio 800 the date is read from the article URL,
# which contains a dddd/dd/dd digit group (presumably YYYY/MM/DD).
url_date_pattern = r'(\d\d\d\d/\d\d/\d\d)'
for url_dated_outlet in ('Canal13', 'Canal6', 'Radio 800'):
    from_outlet = df["page"] == url_dated_outlet
    url_dates = df.loc[from_outlet, "url"].str.extract(url_date_pattern)
    # .values drops the index, so the extracted column is written back by position
    df.loc[from_outlet, ["date"]] = url_dates.values

# convert canal10 date to datetime
# Work on an explicit copy of the Canal10 rows: assigning into a bare
# `df.loc[mask]` selection is what triggers pandas' SettingWithCopyWarning.
is_canal10 = df["page"] == "Canal10"
df10 = df.loc[is_canal10].copy()
# remove the filler "de " (a plain substring, so no regex is needed)
df10["date"] = df10["date"].str.replace("de ", "", regex=False)
# remove weekday names
df10["date"] = df10["date"].str.replace(
    r"Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday", "", regex=True
)
df10["date"] = pd.to_datetime(df10["date"])
# to_list() drops the index, so the parsed dates are written back by position
df.loc[is_canal10, "date"] = df10["date"].to_list()

# convert canal14 date to datetime
# dateparser is used here because pandas cannot handle the spanish dates
# Work on an explicit copy of the Canal14 rows: assigning into a bare
# `df.loc[mask]` selection is what triggers pandas' SettingWithCopyWarning.
is_canal14 = df["page"] == "Canal14"
df14 = df.loc[is_canal14].copy()
df14["date"] = df14["date"].apply(
    lambda raw_date: dateparser.parse(raw_date, settings={'STRICT_PARSING': True})
)
# to_list() drops the index, so the parsed dates are written back by position
df.loc[is_canal14, "date"] = df14["date"].to_list()

# 100 % noticias
# Work on an explicit copy of the rows: filling/assigning in place on a bare
# `df.loc[mask]` selection is what triggers pandas' SettingWithCopyWarning.
is_100_noticias = df["page"] == "100% Noticias"
df100 = df.loc[is_100_noticias].copy()
# convert nan in date column to a blank string so that dateparser works
df100["date"] = df100["date"].fillna(" ")
df100["date"] = df100["date"].apply(
    lambda raw_date: dateparser.parse(raw_date, settings={'STRICT_PARSING': True})
)
# to_list() drops the index, so the parsed dates are written back by position
df.loc[is_100_noticias, "date"] = df100["date"].to_list()

# canal2, canal4, confidencial, radio corporacion, radio nicaragua and
# radio primerissima already come in datetime format, so they get no
# outlet-specific handling

# parse whatever is now in the date column into datetimes
parsed_dates = pd.to_datetime(df["date"])
df.loc[:, "date"] = parsed_dates
# derive the year variable from the parsed dates
df["year"] = pd.DatetimeIndex(df["date"]).year

# write the dataset with normalised dates for the cleaning stage
df.to_csv("dataset1.csv", index=False)

## Cleaning Data

### Pre-Tokenization Cleanup

In [None]:
# Reload the dataset written at the end of the date-normalisation step.
dated_dataset_path = "dataset1.csv"
df = pd.read_csv(dated_dataset_path)

In [None]:
# flag the articles that do not have any text in them
has_no_text = df["text"].isna()

print("Number of articles without text per outlet")
print(df[has_no_text].groupby('page').size())

# Quite a few articles have no text. A random selection of them was checked by
# hand to rule out a crawler mistake: they are all articles that only contain
# videos, which matches the fact that the outlets in this list are mostly TV
# and radio stations.

print("\nNumber of articles without text per year")
print(df[has_no_text].groupby('year').size())

# the models need text to work, so rows without text are discarded
df = df.loc[~has_no_text].reset_index(drop=True)

# removing boilerplate, news agency sources, hyperlinks etc.
#
# Maps a regular expression to its replacement text; the dict is handed to
# DataFrame.replace(..., regex=True), so every key is interpreted as a regex.
# Patterns that contain regex escapes are raw strings so that sequences such as
# \s are not treated as (invalid) Python string escapes.
repl_dict = {
    # newlines, tabs etc.
    r"\t|\n|\r|\xa0": " ",
    # whitespace
    r"\s{2,}": " ",
    # hashtags
    "#": "",
    # source in some articles
    r'Fuente: El 19 Digital|Fuente: TN8': "",
    # all hyperlinks
    r"http\S+": "",
    # canal4 boilerplate
    r"Canal 4 Noticias[\s\S]+Canal 4 Nicaragua. Todos los derechos reservados": "",
    r"Comparte[.\s]*?esto:[.\s]*?Tweet[.\s]*?WhatsApp[.\s]*?Telegram": "",
    "LEER TAMBIÉN": "",
    "Leer más:": "",
    "AMPLIACIÓN EN BREVE…": "",
    # news agency
    # The parentheses must be escaped: an unescaped "(EFE)" is a capture group
    # that deletes "EFE" anywhere (even inside words) and leaves "()" behind.
    r"\(EFE\)": "",
    # source mentioned at end of article
    r"Con información de\:+$": "",
    # copyright stuff
    "© 100% Noticias ¡Con primicias a toda hora!": "",
    "© Getty Images": "",
    r"©\s?[Vv]iva [Nn]icaragua,? (Canal 13 )?(Previous Next)?": "",
    "© AFP": "",
    "© AP": "",
    "© creative commons": "",
    "© El 19 Digital": "",
    "© Consejo de Comunicación y Ciudadanía": "",
    "© Juventud Presidente": "",
    "© Ministerio de Gobernación": "",
    # copyrights for photographers
    r"\w+?\s+?\w+?\s+?©": "",
    # article suggestions
    "Te recomendamos:": "",
    "Quizás te interesa:": "",
    "Lee Aquí:": "",
    # twitter links
    r"pic\.twitter\.com.+?\d{4}": "",
    r"—.+?\(@.+?\).+?\d{4}": "",
    # source information
    r"Con información de:.+$": "",
}

# run the same cleanup patterns over the article text and the title
cleanup_by_column = {column: repl_dict for column in ("text", "title")}
df.replace(cleanup_by_column, regex=True, inplace=True)
df

# write the cleaned dataset that the tokenization stage reads
df.to_csv("dataset_token_ready.csv", index=False)

### Some Descriptives

In [None]:
# how many articles each outlet contributes
articles_per_outlet = df.groupby('page').size()
articles_per_outlet

## Tokenization

In [None]:
# keep only the text column and write it to its own csv file
# (note: `df` is a Series from here on, not a DataFrame)
token_ready = pd.read_csv("dataset_token_ready.csv")
df = token_ready["text"]

df.to_csv("texts.csv", index=False)

In [None]:
# load text as generator
def _iter_csv_lines(path, encoding="utf-8"):
    """Yield the lines of `path` one by one and close the file afterwards.

    The file is opened on first iteration, not when the generator is created.
    The encoding is passed explicitly because pandas writes csv files as UTF-8
    by default, while a bare open() falls back to the platform encoding and
    can garble accented characters.
    """
    with open(path, encoding=encoding) as handle:
        yield from handle


# yields every line of the file, including the header line with the column title
csv_gen = _iter_csv_lines("texts.csv")

In [None]:
# read the cleaned texts into a plain python list
token_ready_texts = pd.read_csv("dataset_token_ready.csv")["text"]
textlist = token_ready_texts.to_list()

In [None]:
# pipeline components that are switched off when loading and running the model
disabled_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "ner"]

# load spanish language model
nlp = spacy.load('es_core_news_md', disable=disabled_components)
# add stopwords
nlp.Defaults.stop_words |= {"a", "y", "o"}
# reload language model to incorporate new stopwords
nlp = spacy.load('es_core_news_md', disable=disabled_components)

# create the output directory up front, so the long tokenization run
# cannot fail at the very end because "tokens/" is missing
os.makedirs("tokens", exist_ok=True)

tokenlist = []
# go through rows of article texts
for i, doc in enumerate(nlp.pipe(csv_gen, disable=disabled_components, n_process=4)):
    # append lists of lemmatised tokens to tokenlist
    # (stop words and tokens that are not purely alphabetic are skipped)
    tokenlist.append([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])
    # progress report every 10000 rows
    if i % 10000 == 0 and i != 0:
        print(i)
# remove first entry, which is the column title (csv_gen yields the header line too)
tokenlist = tokenlist[1:]
# save to pickle
with open("tokens/tokens.pkl", "wb") as f:
    pickle.dump(tokenlist, f)

In [25]:
# attach the tokens to the dataframe; useful for the later structural topic
# model and all subsampling

# read the pickled token lists back in
with open("tokens/tokens.pkl", "rb") as token_file:
    tokenlist = pickle.load(token_file)

# add the token lists as a new "tokens" column of the cleaned dataset
df = pd.read_csv("dataset_token_ready.csv").assign(tokens=tokenlist)

# store the result as pickle (csv does not work, because it cannot save the
# lists of tokens as cell entries)
df.to_pickle("dataset_with_tokens.pkl")