In [1]:
import os
import warnings
from os.path import join

import pandas as pd
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [2]:
# Input/output locations. The defaults are the original hard-coded paths;
# set NEWSGUARD_DATA_DIR / NEWSGUARD_TABLES_DIR in the environment to run the
# notebook on another machine without editing this cell.
DIR = os.environ.get("NEWSGUARD_DATA_DIR", "/data/NewsGuard/")
TABLES = os.environ.get(
    "NEWSGUARD_TABLES_DIR",
    "/home/jluehring/newsguard/newsguard-review-paper/tables/",
)

In [3]:
# NewsGuard data, sampled only from the middle of the month.
# `file_date` is parsed into datetimes at load time.
df_15 = pd.read_csv(
    join(DIR, "newsguard_mid_month.csv"),
    low_memory=False,
    parse_dates=["file_date"],
)

In [4]:
# GOND domain list; report how many distinct domains it contains.
gond = pd.read_csv("../GONDv1_domains.csv")
print(gond["domain"].nunique(dropna=True))

1147


In [5]:
# Absolute number of GOND entries per outlet type.
gond["type"].value_counts()

type
legacy press                561
digital-born news outlet    206
commercial broadcaster      116
hyperpartisan news           94
public broadcaster           92
tabloid newspaper            78
Name: count, dtype: int64

In [6]:
# Reduce both lists to one row per domain before joining.
# NOTE(review): the first row per domain in file order is kept; if the
# NewsGuard CSV holds several `file_date` snapshots, confirm this is the
# snapshot the analysis is meant to use.
df15_unique = df_15.drop_duplicates(subset="Domain")
gond_unqiue = gond.drop_duplicates(subset="domain")

# Inner join: only domains present in both GOND and NewsGuard survive.
overlap = gond_unqiue.merge(
    df15_unique,
    how="inner",
    left_on="domain",
    right_on="Domain",
)

len(overlap)

552

In [7]:
# Keep only the columns of interest and expose GOND's `type` as "GOND Type".
# Built as one non-mutating chain so no column is assigned on a frame sliced
# from `overlap` (the pattern that raises SettingWithCopyWarning). The new
# column is appended last and `type` is dropped, so the column order is the
# same as with the previous assign-then-drop approach.
overlap_interest = (
    overlap[["domain", "type", "language",
             "Domain", "Country", "Language", "Score", "Rating"]]
    .assign(**{"GOND Type": lambda frame: frame["type"]})
    .drop(columns="type")
)

In [8]:
# Mean NewsGuard score per GOND type, rounded to one decimal.
overlap_type_score = (
    overlap_interest
    .groupby("GOND Type")["Score"]
    .mean()
    .round(1)
    .reset_index(name="NewsGuard Score")
)

# Number of overlapping domains per GOND type.
overlap_type_n = (
    overlap_interest
    .groupby("GOND Type")
    .size()
    .reset_index(name="Overlap")
)

# Share of each GOND type among the overlapping domains, in percent.
overlap_type_perc = (
    overlap_interest["GOND Type"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .reset_index()
)

# Combine the three per-type summaries into one table keyed on "GOND Type".
overlap_type_sum = pd.merge(
    pd.merge(overlap_type_score, overlap_type_n, on="GOND Type"),
    overlap_type_perc,
    on="GOND Type",
)

overlap_type_sum

Unnamed: 0,GOND Type,NewsGuard Score,Overlap,proportion
0,commercial broadcaster,89.4,58,10.5
1,digital-born news outlet,84.5,66,12.0
2,hyperpartisan news,43.0,56,10.1
3,legacy press,91.0,292,52.9
4,public broadcaster,89.0,40,7.2
5,tabloid newspaper,84.6,40,7.2


In [9]:
# Save the per-type overlap summary as a LaTeX table.

# Rename the share column to a LaTeX percent sign. The backslash is written
# as "\\%" because "\%" is an invalid Python escape sequence (it triggers a
# warning today and is slated to become an error); the resulting string is
# identical.
overlap_type_sum = overlap_type_sum\
    .rename(columns={"proportion": "\\%"})

# escape=False keeps the "\%" header verbatim in the generated LaTeX.
overlap_type_sum_latex = overlap_type_sum\
    .to_latex(
        index=False,
        escape=False,
        caption="Overlap between GOND and NewsGuard.",
        label="tab:overlap_lists",
        position='H',
        column_format=' X c c c ',
        longtable=False,
        header=True,
        na_rep='---',
        float_format="{:.1f}".format,
        bold_rows=True)

# Switch the environment from tabular to tabularx (full text width) so the
# "X" column in column_format is valid.
overlap_type_sum_latex = \
    overlap_type_sum_latex\
        .replace("\\begin{tabular}{", "\\begin{tabularx}{\\textwidth}{")
overlap_type_sum_latex = \
    overlap_type_sum_latex\
        .replace("\\end{tabular}", "\\end{tabularx}")

# Append a table note directly below the bottom rule.
overlap_type_sum_latex = \
    overlap_type_sum_latex\
        .replace("\\bottomrule",
                 "\\bottomrule\n"
                 "\\multicolumn{3}{l}{\\textit{Note.} As of July 15th, 2024.}"
                 )

# Explicit encoding so the written file does not depend on the platform
# default.
with open(join(TABLES, "overlap_lists.tex"), 'w', encoding="utf-8") as f:
    f.write(overlap_type_sum_latex)

  .rename(columns={"proportion": "\%"})


In [10]:
# GOND entries whose language is "de", one row per domain.
gond_de = (
    gond
    .loc[gond["language"].eq("de")]
    .drop_duplicates(subset="domain")
)
len(gond_de)

573

In [11]:
# Overlap between GOND and NewsGuard, restricted to rows with language "de".
df15_de = (
    df_15
    .loc[df_15["Language"].eq("de")]
    .drop_duplicates(subset="Domain")
)

# Inner join keeps only domains found in both "de" subsets.
overlap_de = gond_de.merge(
    df15_de,
    how="inner",
    left_on="domain",
    right_on="Domain",
)
len(overlap_de)

244

In [12]:
# Number of overlapping "de" domains per GOND type.
overlap_de_type = (
    overlap_de
    .groupby("type")
    .size()
    .reset_index(name="Count")
)

overlap_de_type

Unnamed: 0,type,Count
0,commercial broadcaster,4
1,digital-born news outlet,20
2,hyperpartisan news,24
3,legacy press,163
4,public broadcaster,18
5,tabloid newspaper,15


In [13]:
# Per type of source, how many domains overlap (all languages)?
(
    overlap
    .groupby("type")
    .size()
    .reset_index(name="Count")
)

Unnamed: 0,type,Count
0,commercial broadcaster,58
1,digital-born news outlet,66
2,hyperpartisan news,56
3,legacy press,292
4,public broadcaster,40
5,tabloid newspaper,40


In [14]:
# NewsGuard "de" sources that do not appear in the GOND "de" list.
# To inspect them, call df15_de_unique["Domain"].to_list().
df15_de_unique = df15_de.loc[
    lambda frame: ~frame["Domain"].isin(gond_de["domain"])
]

In [15]:
# GOND "de" sources that do not appear in the NewsGuard "de" list.
# To inspect them, call gond_de_unique["domain"].to_list().
gond_de_unique = gond_de.loc[
    lambda frame: ~frame["domain"].isin(df15_de["Domain"])
]