# Creating sets of labelled news sources

I will create labelled news sources based on reliability, political leaning, media type, and geographical location.

In [1]:
import json
import csv
import os

In [26]:
import tldextract

In [43]:
from collections import Counter, defaultdict

## Reliability

Using News Guard

In [3]:
newsguard = json.load(open("new_newsguard_rated.json", "r", encoding="utf-8"))

In [6]:
reliability = dict()
for ns,rate in newsguard:
    if rate <= 60.0:
        reliability[ns] = "unreliable"
    else:
        reliability[ns] = "reliable"

In [8]:
json.dump(reliability, open("reliability.json", "w", encoding="utf-8"))

## Political Leaning

Using Media Bias Fact Check

In [9]:
pd1 = os.path.split(os.getcwd())[0]
mbfc_path = os.path.join(pd1, "data/mbfc")

In [10]:
lf = json.load(open(os.path.join(mbfc_path, "mbfc_left.json")))
lc = json.load(open(os.path.join(mbfc_path, "mbfc_leftcenter.json")))

rh = json.load(open(os.path.join(mbfc_path, "mbfc_right.json")))
rc = json.load(open(os.path.join(mbfc_path, "mbfc_right-center.json")))

In [11]:
ns_fixed_ = json.load(open("ns_fixed_.json", "r", encoding="utf-8"))

In [12]:
left = list(set(ns_fixed_) & (set([k[1] for k in lf]) | set([k[1] for k in lc])))
len(left)

158

In [13]:
right = (set(ns_fixed_) & (set([k[1] for k in rh]) | set([k[1] for k in rc])))
len(right)

84

In [15]:
political_leaning = dict()

for l in left:
    political_leaning[l] = "left"

for r in right:
    political_leaning[r] = "right"

In [17]:
json.dump(political_leaning, open("political_leaning.json", "w", encoding="utf-8"))

## Media Type

In [18]:
muckrack_path = os.path.join(pd1, "data/muckrack")

In [28]:
new_muckrack = json.load(open(os.path.join(muckrack_path, "all_new_muckrack.json")))
len(new_muckrack)

594687

In [23]:
old_muckrack = json.load(open(os.path.join(muckrack_path, "all_old_muckrack.json")))

In [33]:
# let us clean both new and old muckrack
clean_new_muckrack = dict()

for nsm in new_muckrack:
    extracted = tldextract.extract(nsm)
    subdomain, domain, suffix = extracted
    # add both versions of domain.suffix and subdomain.domain.suffix
    full = ""
    # with subdomain
    if len(subdomain) > 0:
        #print(f"{subdomain}.{domain}.{suffix}")
        full = f"{subdomain}.{domain}.{suffix}"
        if len(full) > 0:
            clean_new_muckrack[full.replace("www.","").strip('/')] = new_muckrack[nsm]
    # without subdomain
    full = f"{domain}.{suffix}"
    if len(full) > 0:
        clean_new_muckrack[full.replace("www.","").strip('/')] = new_muckrack[nsm]
len(clean_new_muckrack)

450591

In [34]:
clean_new_muckrack["tribunnews.com"]

[{'source_name': 'Banjarmasin Post',
  'location': 'Banjarmasin, Kalimantan, ID',
  'media_type': 'Newspaper'}]

In [35]:
len(set(clean_new_muckrack.keys()) & set(ns_fixed_))

2464

In [36]:
len(ns_fixed_)

2639

### According to new muck rack

In [40]:
media_type = dict()

for k in ns_fixed_:
    media_type[k] = clean_new_muckrack[k][0]['media_type'] if k in clean_new_muckrack else ''

In [41]:
media_type

{'crimeonline.com': 'Podcast',
 'cornell.edu': 'Podcast',
 'wealthsimple.com': '',
 'newscientist.com': 'Podcast',
 'insider.com': 'Online/Digital',
 'kcci.com': 'Television',
 'hackernoon.com': '',
 'thebetterindia.com': 'Online/Digital',
 'wcpo.com': 'Podcast',
 'theculturetrip.com': '',
 'king5.com': 'Television',
 'runnersworld.com': 'Magazine',
 'crimereads.com': '',
 'businessinsider.com': 'Online/Digital',
 'futurism.com': 'Online/Digital',
 'moneymorning.com': 'Online/Digital',
 'applytojob.com': '',
 'montgomeryadvertiser.com': 'Newspaper',
 'mlbtraderumors.com': 'Online/Digital',
 'gizchina.com': '',
 'journaltimes.com': 'Newspaper',
 'thegrio.com': 'Online/Digital',
 'sfgate.com': 'Newsletter (Digital)',
 'motortrend.com': 'Magazine',
 'thehindu.com': '',
 'uber.com': 'Blog',
 'lasvegassun.com': 'Newspaper',
 'metro.co.uk': 'Newsletter (Digital)',
 'musicradar.com': 'Magazine',
 'californiaglobe.com': '',
 'nsf.gov': 'Podcast',
 'cnn.com': 'Newsletter (Digital)',
 'americanp

In [42]:
len(set(media_type.values()))

23

In [44]:
Counter(media_type.values())

Counter({'Podcast': 626,
         '': 563,
         'Online/Digital': 538,
         'Television': 317,
         'Magazine': 162,
         'Newspaper': 270,
         'Newsletter (Digital)': 26,
         'Blog': 46,
         'Journal': 16,
         'Newsletter (Print)': 4,
         'Radio Program': 4,
         'Radio': 15,
         'Television Program': 13,
         'Newswire/News Agency': 19,
         'Media Company': 4,
         'Non-profit': 3,
         'Corporate Newsroom': 1,
         'OTT/Streaming': 2,
         'Press release/News aggregator': 1,
         'Broadcaster': 2,
         'Podcast Network': 2,
         'Research Company/Group': 4,
         'Picture Agency': 1})

In [46]:
json.dump(media_type, open("media_type.json", "w", encoding="utf-8"))

### According to old muck rack


In [47]:
clean_old_muckrack = dict()

for nsm in old_muckrack:
    extracted = tldextract.extract(nsm)
    subdomain, domain, suffix = extracted
    # add both versions of domain.suffix and subdomain.domain.suffix
    full = ""
    # with subdomain
    if len(subdomain) > 0:
        #print(f"{subdomain}.{domain}.{suffix}")
        full = f"{subdomain}.{domain}.{suffix}"
        if len(full) > 0:
            clean_old_muckrack[full.replace("www.","").strip('/')] = old_muckrack[nsm]
    # without subdomain
    full = f"{domain}.{suffix}"
    if len(full) > 0:
        clean_old_muckrack[full.replace("www.","").strip('/')] = old_muckrack[nsm]
len(clean_old_muckrack)

67782

In [50]:
media_type_current = dict()

for k in ns_fixed_:
    media_type_current[k] = clean_old_muckrack[k][0]['instanceof'] if k in clean_old_muckrack else ''

In [51]:
media_type_current

{'crimeonline.com': 'Online/Digital',
 'cornell.edu': 'Podcast',
 'wealthsimple.com': 'Online/Digital',
 'newscientist.com': 'Podcast',
 'insider.com': 'Online/Digital',
 'kcci.com': 'Television',
 'hackernoon.com': 'Blog',
 'thebetterindia.com': 'Online/Digital',
 'wcpo.com': 'Television',
 'theculturetrip.com': 'Online/Digital',
 'king5.com': 'Television',
 'runnersworld.com': 'Magazine',
 'crimereads.com': 'Online/Digital',
 'businessinsider.com': 'Financial/Market news, Online/Digital',
 'futurism.com': 'Online/Digital',
 'moneymorning.com': 'Online/Digital',
 'applytojob.com': '',
 'montgomeryadvertiser.com': 'Newspaper',
 'mlbtraderumors.com': 'Online/Digital',
 'gizchina.com': 'Media Company, Online/Digital',
 'journaltimes.com': 'Newspaper',
 'thegrio.com': 'Online/Digital',
 'sfgate.com': 'Online/Digital',
 'motortrend.com': 'Podcast, Television Program',
 'thehindu.com': 'Online/Digital',
 'uber.com': 'Blog',
 'lasvegassun.com': 'Newspaper',
 'metro.co.uk': 'Newspaper',
 'mus

In [52]:
json.dump(media_type_current, open("media_type_current.json", "w", encoding="utf-8"))

## Country

In [54]:
country = dict()

for k in ns_fixed_:
    country[k] = clean_old_muckrack[k][0]['country'] if k in clean_old_muckrack else ''

In [55]:
Counter(country.values())

Counter({'': 2639})