# First Data Exploration

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import zipfile
from unidecode import unidecode

In [None]:
# Maps each collection name (the zip file stem under "zips/") to the news
# agency label stored in the "newsagency" column. Several collections can
# belong to the same agency (e.g. AFP1/AFP2 -> AFP), so the mapping is written
# agency-first and flattened into {collection: agency} in declaration order.
# For quick experiments, restrict it to one collection, e.g. {"Belga": "Belga"}.
na_dict = {
    collection: agency
    for agency, collections in {
        "AFP": ["AFP1", "AFP2"],
        "ANP": ["ANP"],
        "ANSA": ["ANSA"],
        "AP": ["AP1", "AP2"],
        "APA": ["APA"],
        "Associated Press": ["Associated_Press"],
        "ATS": ["ATS1"],
        "Belga": ["Belga"],
        "BTA": ["BTA"],
        "CTK": ["CTK"],
        "DDP-DAPD": ["DDP-DAPD"],
        "DNB": ["DNB"],
        "Domei": ["Domei"],
        "DPA": ["DPA"],
        "Europapress": ["Europapress"],
        "Extel": ["Extel"],
        "Havas": ["Havas", "Havasagentur"],
        "Interfax": ["Interfax"],
        "PAP": ["PAP"],
        "Reuters": ["Reuter", "Reuters", "reutersche"],
        "SPK": ["SPK"],
        "Stefani": ["Stefani"],
        "Tanjug": ["Tanjug"],
        "TASS": ["TASS"],
        "Telunion": ["Telunion"],
        "TT": ["TT-Sweden"],
        "UP-UPI": ["UPI"],
        "Wolff": ["Wolff"],
    }.items()
    for collection in collections
}

# Columns read from every collection CSV: everything except "type" (empty)
# and the last column (only a title, indicating the size of the collection).
cols = [
    "uid",
    "language",
    "title",
    "size",
    "country",
    "newspaper",
    "issue",
    "pages",
    "nb_pages",
    "relevance",
    "year",
    "is_on_front",
    "date",
    "persons_mentioned",
    "locations_mentioned",
    "content",
    "access_right",
    "content_provider",
    "is_content_available",
    "collections",
]

In [None]:
#build full collection in one dataframe
# Each collection is read into its own frame and everything is concatenated
# once at the end: calling pd.concat inside the loop re-copies all previously
# loaded rows on every iteration and starts from an empty DataFrame.
collection_frames = []

for na_collection, na_name in na_dict.items():
    #import next news agency content
    path = "zips/" + na_collection + ".zip"
    collection = pd.read_csv(path, compression="zip", sep=";", usecols=cols)
    #save name of news agency
    collection["newsagency"] = na_name
    collection_frames.append(collection)

# pd.concat raises on an empty list, so fall back to an empty frame
# (the value the original loop produced when na_dict was empty).
df = pd.concat(collection_frames) if collection_frames else pd.DataFrame()

### Disambiguation of "Mixed" collection

In [None]:
# Maps a token searched for (case-insensitively) in the transliterated article
# content of the "Mixed" collection to the agencies that article is credited to.
# Presumably the odd spellings ("atsjafp", "aplddp", ...) are OCR renderings of
# combined bylines such as "ats/afp" -- TODO confirm.
# NOTE(review): "DDP" is used here while na_dict labels that agency
# "DDP-DAPD"; confirm whether both labels should be unified.
# The key "aplddp" was listed twice with the same value; the duplicate is removed.
mixed_nas = {
    "afpreuters": ["AFP", "Reuters"],
    "afpreuter": ["AFP", "Reuters"],
    "atsafp": ["ATS", "AFP"],
    "atsreuters": ["ATS", "Reuters"],
    "atsreuter": ["ATS", "Reuters"],
    "atsjafp": ["ATS", "AFP"],
    "atsap": ["ATS", "AP"],
    "aplddp": ["AP", "DDP"],
    "aplafp": ["AP", "AFP"],
    "afplap": ["AFP", "AP"],
    "dpalafp": ["DPA", "AFP"],
    "atsjreuter": ["ATS", "Reuters"],
    "atsfafp": ["ATS", "AFP"],
    "ddplap": ["DDP", "AP"],
    "aplsda": ["AP", "ATS"],
    "sdalafp": ["ATS", "AFP"],
    "atsjred": ["ATS"],
    "atsred": ["ATS"],
}

In [None]:
len_before = len(df)

#import Mixed.zip file
mixed = pd.read_csv("zips/Mixed.zip", compression="zip", sep=";", usecols=cols)
mixed = mixed[mixed["content"].notna()]

#normalize content column in a copy of Mixed (used for matching only;
#the rows added to df keep the original, untransliterated content)
mixed_no_acc = mixed.copy()
mixed_no_acc["content"] = mixed_no_acc["content"].apply(unidecode)

# Collect one frame per search token and concatenate once afterwards,
# instead of growing df with pd.concat on every iteration.
mixed_frames = []
for mixed_word, mixed_na_list in mixed_nas.items():
    #get all entries with the mixed_word in it (literal, case-insensitive match)
    is_match = mixed_no_acc["content"].str.contains(mixed_word, case=False, na=False, regex=False)
    matches = mixed[is_match]

    #first store mixed_na_list in a new column, then create one row per na entry (via explode)
    matches = matches.assign(newsagency=[mixed_na_list] * len(matches))
    matches = matches.explode("newsagency", ignore_index=True)
    mixed_frames.append(matches)

#rows produced before de-duplication
count = sum(len(frame) for frame in mixed_frames)

#concatenate to existing df and delete duplicates
# drop_duplicates runs over the whole frame, so duplicates already present in
# df before this cell are removed as well and lower the last number printed.
df = pd.concat([df, *mixed_frames]).drop_duplicates()

print(f"#articles in Mixed.zip: {len(mixed)}, #rows stored (with duplicates): {count},\n"
      f"#rows stored (without duplicates): {len(df)-len_before}")

## Add useful columns

In [None]:
#Pauline
def get_decade(year):
    """Return the first year of the decade containing ``year`` as an int.

    Example: 1955 -> 1950.
    """
    decade_index = year // 10
    return int(decade_index * 10)

#add decade column
# get_decade already returns an int, so it is applied directly to each year.
df["decade"] = df["year"].apply(get_decade)

## Missing values

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Rows whose article text is missing; kept in a separate frame because a later
# cell compares it per agency against the rows that do have content.
no_content = df[df["content"].isnull()]
# The share is formatted as a percentage to match its label
# (it was previously printed as a raw fraction).
print(f"#articles without content: {len(no_content)}, percentage: {len(no_content)/len(df):.2%}")
no_content.head()

In [None]:
# Keep only the rows that have article text, then re-check the null counts.
df = df.dropna(subset=["content"])
print("After deleting articles without content:")
df.isna().sum()

In [None]:
# Per-agency row counts, split by whether the article text is available.
# In notebook order, df has already been filtered to rows with content here,
# while no_content holds the rows that were removed.
has_content = df.groupby("newsagency")["uid"].count().rename("has_content")
missing_content = no_content.groupby("newsagency")["uid"].count().rename("no_content")

#join with no_content counts which are also grouped by newsagencies
# Outer join: an agency whose rows all lack content only appears in
# missing_content and would be silently dropped by a left join.
Newsagencies = has_content.to_frame().join(missing_content, how="outer")
Newsagencies = Newsagencies.fillna(0)

#column: share of entries without content (fraction between 0 and 1)
Newsagencies["perc_no_content"] = Newsagencies["no_content"] / (
    Newsagencies["has_content"] + Newsagencies["no_content"]
)
Newsagencies["perc_no_content"].plot.bar()

In [None]:
# Show the per-agency table: has_content, no_content and perc_no_content.
Newsagencies

## Several news agency mentions per article

In [None]:
# One entry per uid holding the list of agencies that uid is stored under.
na_df = df.loc[:, ["uid", "newsagency"]]
agencies_by_uid = na_df.groupby("uid")["newsagency"]
na_df_grouped = agencies_by_uid.apply(list)


In [None]:
# Number of distinct article uids in df.
# The print statement previously referenced an undefined name `unique`,
# which raised NameError; it now uses len_unique.
len_unique = df["uid"].nunique()
# NOTE(review): the printed ratio is distinct uids over rows of df, which is
# not exactly the share of articles found in a single collection -- confirm
# which metric is intended.
print(f"unique: {len_unique}, percentage of articles only contained in one collection: {len_unique/len(df):.2%}")

## Distributions per decade

In [None]:
# Rows: decades, columns: news agencies, cells: number of (non-null) uids.
na_per_decade = df.pivot_table(
    index="decade",
    columns="newsagency",
    values="uid",
    aggfunc="count",
)

In [None]:
# One line per agency over all decades.
na_per_decade.plot(
    kind="line",
    figsize=(16, 8),
    title="News Agency content per decade and agency",
)
plt.show()

In [None]:
# Same line chart restricted to the first 16 decade rows (positional slice).
# NOTE(review): the title states 1750-1920; confirm that the first 16 rows of
# the pivot table actually correspond to that range.
early_decades = na_per_decade.head(16)
early_decades.plot(
    kind="line",
    figsize=(16, 8),
    title="News Agency content per decade and agency, 1750-1920",
)
plt.show()

In [None]:
# Stacked bars: total volume per decade, split by agency.
na_per_decade.plot(
    kind="bar",
    stacked=True,
    figsize=(16, 8),
    title="News Agency content per decade",
)
plt.show()