## Newsguard description

First, let's load the files.
Let's start with one of them.


In [1]:
import os
import pandas as pd

In [2]:
folder_path = "../data/newsguard_biyearly/"
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

In [3]:
file=csv_files[0]
df = pd.read_csv(os.path.join(folder_path, file))
df.name=file

In [4]:
df.head(3)

Unnamed: 0,UUID,Domain,Parent Domain,Rating,Score,Last Updated,Country,Language,Does not repeatedly publish false content,Gathers and presents information responsibly,...,Twitter,YouTube,Instagram,Snapchat,LinkedIn,Pinterest,Main Point of Contact (name),Contact Phone Number,Contact Email,Contact Mail Form URL
0,35f341f9-d2c2-4d8f-9a00-910725e9647b,cbc.ca,,T,100.0,1532095397020,US,en,Yes,Yes,...,,,,,,,,,,
1,0c0e5e84-0b5c-47ff-bff4-94e987771e16,bitraped.com,naturalnews.com,N,12.5,1551286266563,US,en,No,No,...,,,,,,,,,,
2,3ee37011-ba1f-4435-93d3-c72b59972b92,vogue.com,,T,87.5,1532095401115,US,en,Yes,Yes,...,@voguemagazine,youtube.com/user/Americanvogue,@voguemagazine,@voguemagazine,,,,,,


In [5]:
df.columns

Index(['UUID', 'Domain', 'Parent Domain', 'Rating', 'Score', 'Last Updated',
       'Country', 'Language', 'Does not repeatedly publish false content',
       'Gathers and presents information responsibly',
       'Regularly corrects or clarifies errors',
       'Handles the difference between news and opinion responsibly',
       'Avoids deceptive headlines',
       'Website discloses ownership and financing',
       'Clearly labels advertising',
       'Reveals who's in charge, including any possible conflicts of interest',
       'The site provides names of content creators, along with either contact or biographical information',
       'Brand Name', 'Wikipedia', 'Type of Content', 'Medium',
       'Print Publications', 'Scope of Coverage', 'Designated Market Area',
       'Paywall', 'Opinion/Advocacy Journalism', 'Orientation', 'Owner',
       'Type of Owner', 'Facebook', 'Twitter', 'YouTube', 'Instagram',
       'Snapchat', 'LinkedIn', 'Pinterest', 'Main Point of Contact (name)',


In [6]:
df.columns.nunique()

40

In [7]:
df.Domain.nunique()

2622

In [8]:
df["Parent Domain"].nunique()

67

In [9]:
df.name

'metadata-2019030100.csv'

In [10]:
df_total = pd.DataFrame()
for csv in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv))
    df.name=csv
    print(df.shape)
    # let's add the date from the name of the file
    # taking name of the file, removing the extension and "metadata-"
    df["file_date"]=csv.split(".")[0].split("-")[1]
    # make file_date a datetime, with the format YYYYMMDD00
    df["file_date"]=pd.to_datetime(df["file_date"], format="%Y%m%d00")
    
    df_total = pd.concat([df_total, df], axis=0)

(2626, 40)
(7131, 46)
(6770, 44)
(4029, 40)
(7664, 46)
(4885, 41)
(9926, 47)
(9430, 46)
(8661, 46)


The number of columns changes over time: Which ones were removed or added?

In [11]:
# for all the files, compare the columns
columns_years = {}
for csv in csv_files:
    df=pd.read_csv(os.path.join(folder_path, csv))
    columns_years[csv.split(".")[0].split("-")[1]]=df.columns # saving a dict with the year/month as key and the columns as value to compare them
# substracting the columns common to all the files
always_present_columns = set.intersection(*map(set, columns_years.values()))
# removing the columns that are always present from the columns of each file
for year in columns_years.keys():
    columns_years[year]=list(set(columns_years[year])-always_present_columns)
# lets show the remaining columns over the years
for year in columns_years.keys():
    columns_years[year].sort()
    print(year, columns_years[year])

2019030100 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Wikipedia', 'YouTube']
2021030100 ['Android App', 'Flags', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2020090100 ['Flags', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'myths']
2019090100 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Wikipedia', 'YouTube']
2021090100 ['Android App', 'Flags', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2020030100 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'YouTube']
2023030100 ['Android App', 'BrandGuard Segments', 'Communities Served', 'Contact Mail Form URL.1', 'False Narratives', 'Flags', 'Original Reporting', 'Other', 'Process Information', 'Targeted Audience', 'Topics', 'Youtube', 'iO

In [12]:
len(always_present_columns)

34

In [13]:
df_total.columns

Index(['UUID', 'Domain', 'Parent Domain', 'Rating', 'Score', 'Last Updated',
       'Country', 'Language', 'Does not repeatedly publish false content',
       'Gathers and presents information responsibly',
       'Regularly corrects or clarifies errors',
       'Handles the difference between news and opinion responsibly',
       'Avoids deceptive headlines',
       'Website discloses ownership and financing',
       'Clearly labels advertising',
       'Reveals who's in charge, including any possible conflicts of interest',
       'The site provides names of content creators, along with either contact or biographical information',
       'Brand Name', 'Wikipedia', 'Type of Content', 'Medium',
       'Print Publications', 'Scope of Coverage', 'Designated Market Area',
       'Paywall', 'Opinion/Advocacy Journalism', 'Orientation', 'Owner',
       'Type of Owner', 'Facebook', 'Twitter', 'YouTube', 'Instagram',
       'Snapchat', 'LinkedIn', 'Pinterest', 'Main Point of Contact (name)',


In [14]:
df_total.head()

Unnamed: 0,UUID,Domain,Parent Domain,Rating,Score,Last Updated,Country,Language,Does not repeatedly publish false content,Gathers and presents information responsibly,...,iOS App,Android App,myths,Original Reporting,Targeted Audience,Contact Mail Form URL.1,Other,False Narratives,Communities Served,BrandGuard Segments
0,35f341f9-d2c2-4d8f-9a00-910725e9647b,cbc.ca,,T,100.0,1532095397020,US,en,Yes,Yes,...,,,,,,,,,,
1,0c0e5e84-0b5c-47ff-bff4-94e987771e16,bitraped.com,naturalnews.com,N,12.5,1551286266563,US,en,No,No,...,,,,,,,,,,
2,3ee37011-ba1f-4435-93d3-c72b59972b92,vogue.com,,T,87.5,1532095401115,US,en,Yes,Yes,...,,,,,,,,,,
3,33b3a070-62e2-42f5-b814-600784eb2402,foodscience.news,naturalnews.com,N,12.5,1551286316472,US,en,No,No,...,,,,,,,,,,
4,95d6acf5-a5cc-4cea-b34f-2431000b7391,stripes.com,,T,100.0,1532095405883,US,en,Yes,Yes,...,,,,,,,,,,


In [15]:
# save the dataframe to csv
df_total.to_csv("../data/newsguard_biyearly.csv", index=False)