## Newsguard file exploration and merge

First, let's load the files.
Let's start with one of them.


In [1]:
import os
import pandas as pd

In [2]:
folder_path = "/data/NewsGuard/full-metadata"
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

In [3]:
file=csv_files[0]
df = pd.read_csv(os.path.join(folder_path, file))
df.name=file

In [4]:
df.head(3)

Unnamed: 0,UUID,Domain,Parent Domain,Rating,Score,Last Updated,Country,Language,Does not repeatedly publish false content,Gathers and presents information responsibly,...,Facebook,Twitter,Youtube,Instagram,Snapchat,LinkedIn,Pinterest,iOS App,Android App,myths
0,512f05a7-0ce8-4c4a-b3f3-983d008cb008,cleveland.com,,T,92.5,1662082072620,US,en,Yes,Yes,...,https://www.facebook.com/clevelandcom/,https://twitter.com/clevelanddotcom?ref_src=tw...,https://www.youtube.com/channel/UCJkc1COQO0WiZ...,https://www.instagram.com/clevelanddotcom/?hl=en,,https://www.linkedin.com/company/cleveland-com/,https://www.pinterest.com/clevelanddotcom/,https://apps.apple.com/us/app/cleveland-com/id...,https://play.google.com/store/apps/details?id=...,
1,3ba8f2f4-3282-4e28-b59e-bf76511e1454,wctrib.com,,T,100.0,1662079570072,US,en,Yes,Yes,...,https://www.facebook.com/pg/westcentraltribune...,https://twitter.com/wctrib,https://www.youtube.com/channel/UC4Xi9zt3WlBv4...,https://www.instagram.com/wctrib/,,https://www.linkedin.com/company/west-central-...,,https://apps.apple.com/us/app/west-central-tri...,https://play.google.com/store/apps/details?id=...,
2,f3ac5288-5dcd-4e49-8063-89331fab2c47,capegazette.com,,T,100.0,1662075996485,US,en,Yes,Yes,...,https://www.facebook.com/CapeGazette/,https://twitter.com/CapeGazette,https://www.youtube.com/channel/UC3H52GV5a5imE...,https://www.instagram.com/capegazette,,https://www.linkedin.com/company/cape-gazette/,https://www.pinterest.com/capegazette/,https://apps.apple.com/us/app/cape-gazette-eed...,https://apkpure.com/cape-gazette-eedition/com....,


In [5]:
df.columns

Index(['UUID', 'Domain', 'Parent Domain', 'Rating', 'Score', 'Last Updated',
       'Country', 'Language', 'Does not repeatedly publish false content',
       'Gathers and presents information responsibly',
       'Regularly corrects or clarifies errors',
       'Handles the difference between news and opinion responsibly',
       'Avoids deceptive headlines',
       'Website discloses ownership and financing',
       'Clearly labels advertising',
       'Reveals who's in charge, including any possible conflicts of interest',
       'The site provides names of content creators, along with either contact or biographical information',
       'Brand Name', 'Wikipedia', 'Topics', 'Type of Content', 'Medium',
       'Print Publications', 'Scope of Coverage', 'Designated Market Area',
       'Paywall', 'Opinion/Advocacy Journalism', 'Orientation', 'Owner',
       'Type of Owner', 'Flags', 'Process Information',
       'Main Point of Contact (name)', 'Contact Phone Number', 'Contact Email',
 

In [6]:
df.columns.nunique()

46

In [7]:
df.Domain.nunique()

8234

In [8]:
df["Parent Domain"].nunique()

317

In [9]:
df.name

'metadata-2022090201.csv'

In [12]:
df_total = pd.DataFrame()
for csv in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv))
    df.name=csv
    print(df.shape)
    # let's add the date from the name of the file
    # taking name of the file, removing the extension and "metadata-"
    df["file_date"]=csv.split(".")[0].split("-")[1]
    # make file_date a datetime, with the format YYYYMMDD00
    df["file_date"]=pd.to_datetime(df["file_date"], format="%Y%m%d%H")
    
    df_total = pd.concat([df_total, df], axis=0)

(9431, 46)
(2628, 40)
(9318, 46)
(2698, 40)
(9577, 47)
(2836, 40)
(9721, 47)
(3328, 40)
(10029, 47)
(3535, 40)
(10178, 47)
(3813, 40)
(10230, 47)
(4029, 40)
(10432, 47)
(4204, 41)
(10544, 47)
(4375, 41)
(10898, 47)
(4472, 41)
(11058, 47)
(4664, 41)
(11132, 47)
(4825, 41)
(4885, 41)
(4982, 41)
(5075, 41)
(5113, 41)
(5350, 41)
(5651, 43)
(6772, 44)
(6839, 44)
(6921, 44)
(6877, 44)
(7044, 46)
(7085, 46)
(7132, 46)
(7200, 46)
(7258, 46)
(7324, 46)
(7431, 46)
(7488, 46)
(7662, 46)
(7838, 46)
(8152, 46)
(8211, 46)
(8239, 46)
(8569, 46)
(8712, 46)
(8845, 46)
(5232, 46)
(8929, 46)
(9057, 46)
(9793, 44)
(9852, 47)
(9894, 47)
(9928, 47)


The number of columns changes over time: Which ones were removed or added?

In [13]:
# for all the files, compare the columns
columns_years = {}
for csv in csv_files:
    df=pd.read_csv(os.path.join(folder_path, csv))
    columns_years[csv.split(".")[0].split("-")[1]]=df.columns # saving a dict with the year/month as key and the columns as value to compare them
# substracting the columns common to all the files
always_present_columns = set.intersection(*map(set, columns_years.values()))
# removing the columns that are always present from the columns of each file
for year in columns_years.keys():
    columns_years[year]=list(set(columns_years[year])-always_present_columns)
# lets show the remaining columns over the years
for year in columns_years.keys():
    columns_years[year].sort()
    print(year, columns_years[year])

2022090201 ['Android App', 'Flags', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2019030201 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Wikipedia', 'YouTube']
2022080201 ['Android App', 'Flags', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2019040201 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Wikipedia', 'YouTube']
2022100201 ['Android App', 'Flags', 'Other', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2019050201 ['Paywall', 'Pinterest', 'Scope of Coverage', 'Snapchat', 'Wikipedia', 'YouTube']
2022110201 ['Android App', 'Flags', 'Other', 'Paywall', 'Pinterest', 'Process Information', 'Scope of Coverage', 'Snapchat', 'Topics', 'Wikipedia', 'Youtube', 'iOS App', 'myths']
2019060201 ['Paywall', 

In [14]:
len(always_present_columns)

34

In [15]:
df_total.columns

Index(['UUID', 'Domain', 'Parent Domain', 'Rating', 'Score', 'Last Updated',
       'Country', 'Language', 'Does not repeatedly publish false content',
       'Gathers and presents information responsibly',
       'Regularly corrects or clarifies errors',
       'Handles the difference between news and opinion responsibly',
       'Avoids deceptive headlines',
       'Website discloses ownership and financing',
       'Clearly labels advertising',
       'Reveals who's in charge, including any possible conflicts of interest',
       'The site provides names of content creators, along with either contact or biographical information',
       'Brand Name', 'Wikipedia', 'Topics', 'Type of Content', 'Medium',
       'Print Publications', 'Scope of Coverage', 'Designated Market Area',
       'Paywall', 'Opinion/Advocacy Journalism', 'Orientation', 'Owner',
       'Type of Owner', 'Flags', 'Process Information',
       'Main Point of Contact (name)', 'Contact Phone Number', 'Contact Email',
 

In [16]:
df_total.head()

Unnamed: 0,UUID,Domain,Parent Domain,Rating,Score,Last Updated,Country,Language,Does not repeatedly publish false content,Gathers and presents information responsibly,...,file_date,YouTube,Other,Original Reporting,Targeted Audience,Contact Mail Form URL.1,False Narratives,Communities Served,BrandGuard Segments,OTHER
0,512f05a7-0ce8-4c4a-b3f3-983d008cb008,cleveland.com,,T,92.5,1662082072620,US,en,Yes,Yes,...,2022-09-02 01:00:00,,,,,,,,,
1,3ba8f2f4-3282-4e28-b59e-bf76511e1454,wctrib.com,,T,100.0,1662079570072,US,en,Yes,Yes,...,2022-09-02 01:00:00,,,,,,,,,
2,f3ac5288-5dcd-4e49-8063-89331fab2c47,capegazette.com,,T,100.0,1662075996485,US,en,Yes,Yes,...,2022-09-02 01:00:00,,,,,,,,,
3,14a2dd02-8394-44f8-be2a-5173f9d72823,thenewstack.io,,T,92.5,1662070911483,US,en,Yes,Yes,...,2022-09-02 01:00:00,,,,,,,,,
4,e583654c-2acb-4a77-9b66-6aacab382dd6,dailysceptic.org,,N,37.5,1662070807918,GB,en,No,No,...,2022-09-02 01:00:00,,,,,,,,,


In [17]:
df_total["Domain"].nunique()

10337

In [19]:
# save the dataframe to csv
file="newsguard_merged.csv.gz"
df_total.to_csv(os.path.join(folder_path, file), compression="gzip", index=False)