# Preliminary look at subreddits for 6k news sources

In [2]:
!pip install zstandard

Collecting zstandard
  Downloading zstandard-0.16.0-cp39-cp39-win_amd64.whl (733 kB)
Installing collected packages: zstandard
Successfully installed zstandard-0.16.0


In [28]:
import json
import pandas as pd
import zstandard as zstd
import io

from collections import defaultdict, Counter
from urllib.parse import urlparse
import re
import datetime, time
import tldextract

In [29]:
# Full timestamp first, then only its HH:MM:SS part (characters 11..18 of
# the default string form), which is the slice used for progress messages.
current = datetime.datetime.now()
print(current)
clock_text = str(datetime.datetime.now())
print(clock_text[11:19])

2022-02-10 00:02:29.862281
00:02:29


In [30]:
# Read the gm_intersection JSON file into `news_sources`.
with open("D:\\Wellesley\\F21\\thesis\\data\\gm_intersection.json", "r") as gm_handle:
    news_sources = json.loads(gm_handle.read())

In [31]:
len(news_sources)

42477

In [32]:
news_sources[:10]

['websterprogresstimes.com',
 'cordeledispatch.com',
 'k12.wv.us',
 'ukconstructionmedia.co.uk',
 'dylanpaulus.com',
 'arktimes.com',
 'asiafoodjournal.com',
 'corydontimes.com',
 'stuttgartdailyleader.com',
 'artrockermagazine.com']

## Using `gmm` instead

Since there are *a lot* of news sources in `gm_intersection` (the intersection of GDELT and Muck Rack), let's instead use `gmm_intersection`, which is the intersection of GDELT, Muck Rack, and MBFC.

In [33]:
# Read the gmm_intersection JSON file into `gmm_news_sources`.
with open("D:\\Wellesley\\F21\\thesis\\data\\gmm_intersection.json", "r") as gmm_handle:
    gmm_news_sources = json.loads(gmm_handle.read())

In [34]:
len(gmm_news_sources)

1631

## Open reddit data from April 2021

Reference: https://arxiv.org/pdf/2001.08435.pdf

Example of an entry of data:

```
{
    'all_awardings': [], 
    'allow_live_comments': False, 
    'archived': False, 
    'author': 'elanglohablante9805', 
    'author_created_utc': 1609519842, 
    'author_flair_background_color': '#ffb000', 
    'author_flair_css_class': None, 
    'author_flair_richtext': [], 
    'author_flair_template_id': '4f908eaa-9664-11ea-a567-0ed46a42aec3', 
    'author_flair_text': 'Historiador 📜 | 80-Day Streak 🔥', 
    'author_flair_text_color': 'dark', 
    'author_flair_type': 'text', 
    'author_fullname': 't2_9lr431i4', 
    'author_patreon_flair': False, 
    'author_premium': False, 
    'can_gild': True, 
    'category': None, 
    'content_categories': None, 
    'contest_mode': False, 
    'created_utc': 1617235201, 
    'discussion_type': None, 
    'distinguished': None, 
    'domain': 'self.WriteStreakES', 
    'edited': False, 
    'gilded': 0, 
    'gildings': {}, 
    'hidden': False, 
    'hide_score': False, 
    'id': 'mhj2hj', 
    'is_created_from_ads_ui': False, 
    'is_crosspostable': True, 
    'is_meta': False, 
    'is_original_content': False, 
    'is_reddit_media_domain': False, 
    'is_robot_indexable': True, 
    'is_self': True, 
    'is_video': False, 
    'link_flair_background_color': '', 
    'link_flair_css_class': None, 
    'link_flair_richtext': [], 
    'link_flair_text': None, 
    'link_flair_text_color': 'dark', 
    'link_flair_type': 'text', 
    'locked': False,
    'media': None, 
    'media_embed': {}, 
    'media_only': False, 
    'name': 't3_mhj2hj', 
    'no_follow': True, 
    'num_comments': 2, 
    'num_crossposts': 0, 
    'over_18': False, 
    'parent_whitelist_status': None, 
    'permalink': '/r/WriteStreakES/comments/mhj2hj/streak_90_ha_llegado_la_primavera/', 
    'pinned': False, 
    'pwls': None, 
    'quarantine': False, 
    'removed_by_category': None, 
    'retrieved_utc': 1623447663, 
    'score': 1, 
    'secure_media': None, 
    'secure_media_embed': {}, 
    'selftext': 'Los pájaros están cantando, las hierbas verdes están brotando, y tengo alergias.  Esto es la temporada de las alergias.  Estornudo cada mañana cuando me despierto, y otra vez si voy afuera.  Necesito tomar medicina cada día, pero no funciona tan bien. \n\nPor fuera, las lomas son bonitas porque son verdes y los robles tienen hojas nuevas.  Por el fin de semana,  hago caminatas pero cuando regreso a casa, necesito ducharme para remover el polen.\n\nCuando me jubile, voy a viajar al desierto cada año por toda la primavera.  No me gustaría quedarme aquí.', 
    'send_replies': True, 
    'spoiler': False, 
    'stickied': False, 
    'subreddit': 'WriteStreakES', 
    'subreddit_id': 't5_2eamt5', 
    'subreddit_subscribers': 2205, 
    'subreddit_type': 'public', 
    'suggested_sort': None, 
    'thumbnail': 'self', 
    'thumbnail_height': None, 
    'thumbnail_width': None, 
    'title': 'Streak 90: Ha llegado la primavera', 
    'top_awarded_type': None, 
    'total_awards_received': 0, 
    'treatment_tags': [], 
    'upvote_ratio': 1.0, 
    'url': 'https://www.reddit.com/r/WriteStreakES/comments/mhj2hj/streak_90_ha_llegado_la_primavera/', 
    'whitelist_status': None, 'wls': None}

```

In [35]:
# Shared decompressor; the window-size cap is 2**31 bytes (= 2147483648).
dctx = zstd.ZstdDecompressor(max_window_size=2 ** 31)

In [36]:
# Compiled once instead of on every call. Written as a raw string so that
# `\w`, `\d`, `\+` and `\.` are regex escapes rather than invalid Python
# string escapes (which emit a DeprecationWarning/SyntaxWarning).
# The pattern itself is unchanged. NOTE(review): inside the character class,
# `\+-=` is a *range* from '+' to '=' and therefore also admits ',' and '<';
# that is kept as-is so previously produced counts stay reproducible.
_URL_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)')


def findURLs(phrase):
    """Return all http/https URLs found in ``phrase``, in order of appearance.

    Only URLs that start with an explicit ``http:`` or ``https:`` scheme are
    matched; bare hosts such as ``nytimes.com/page`` are not.

    Parameters
    ----------
    phrase : str or None
        Text to scan. ``None`` and the empty string yield an empty list.

    Returns
    -------
    list of str
        The matched URL strings.
    """
    if not phrase:
        return []
    # Group 1 wraps the whole pattern, so it is the complete URL.
    return [match.group(1) for match in _URL_REGEX.finditer(phrase)]

In [37]:
# try out
findURLs("does this find https://lol.com or nytimes.com/2021/10/19/us/politics/trump-border.html")

['https://lol.com']

In [40]:
def get_hostname(url, uri_type='both'):
    """Return the candidate host names for ``url`` as a set.

    The set contains the registered domain (``domain.suffix``) and, when the
    URL has a subdomain other than leading ``www`` labels, the full host
    (``subdomain.domain.suffix``) with those ``www`` labels removed.

    Parameters
    ----------
    url : str
        URL or bare host to analyse.
    uri_type : str
        Unused; kept so existing callers remain valid.

    Returns
    -------
    set of str
        Empty when tldextract finds no domain or no public suffix.
    """
    extracted = tldextract.extract(url)
    # Attribute access rather than tuple unpacking: the number of fields in
    # ExtractResult differs between tldextract releases, and newer releases
    # do not return a tuple at all.
    subdomain = extracted.subdomain
    domain = extracted.domain
    suffix = extracted.suffix

    hostnames = set()
    # Without both parts there is no usable host name (previously an empty
    # domain produced a bogus ".suffix" entry).
    if not domain or not suffix:
        return hostnames

    # The registered domain is added verbatim; stripping "www." here used to
    # reduce a host such as "www.com" to just "com".
    registered = f"{domain}.{suffix}"
    hostnames.add(registered)

    # Drop leading "www" labels from the subdomain, then add the full host
    # only if something other than "www" remains.
    labels = subdomain.split(".") if subdomain else []
    while labels and labels[0] == "www":
        labels.pop(0)
    if labels:
        hostnames.add(".".join(labels + [registered]))
    return hostnames

In [41]:
# function try out
print(get_hostname("https://www.nytimes.com"))
print(get_hostname("http://www.aiaia.nytimes.com/add"))
print(get_hostname("www.nytimes.com/additional"))

{'nytimes.com'}
{'nytimes.com', 'aiaia.nytimes.com'}
{'nytimes.com'}


In [42]:
"realtor.com" in news_sources

True

In [43]:
# One .zst dump per month, 2021-01 through 2021-06.
zst_files = ["RS_2021-{:02d}.zst".format(month) for month in range(1, 7)]
# zst_files = ["RS_2021-05.zst", "RS_2021-06.zst"]
# Directory holding the dumps (drive letter D on the ThinkPad).
zst_filepath = "E:/thesis_data/"

In [57]:
# subreddit name -> subreddit id, filled in while the dumps are streamed
subreddit_srid = {}

# starts out empty; shown below to confirm the reset
posts_with_urls = []
posts_with_urls

[]

In [58]:
import itertools
# Scratch check: flattening one level of nesting works for lists, tuples
# and dicts alike (a dict contributes its keys).
x = [[], ['foo'], ['bar', 'baz'], ['quux'], ("tup_1", "tup_2"), {1:"one", 2:"two"}]
flattened = list(itertools.chain.from_iterable(x))
print(flattened)
# equivalent comprehension: [element for sub in x for element in sub]

['foo', 'bar', 'baz', 'quux', 'tup_1', 'tup_2', 1, 2]


In [None]:
print("start time:", datetime.datetime.now())

# Membership is tested once per extracted host name; a set makes that test
# O(1) even if `gmm_news_sources` was loaded as a list.
gmm_source_set = set(gmm_news_sources)

counter = 0
for zst_file in zst_files[1:]:  # NOTE: the slice skips the first entry of `zst_files`
    month_tag = zst_file[3:10]  # e.g. "RS_2021-02.zst" -> "2021-02"
    # news source -> subreddit -> link count, and the inverse mapping.
    # Both are rebuilt for every file, so each output covers one month.
    ns_subreddit = defaultdict(Counter)
    subreddit_ns = defaultdict(Counter)
    print("***** Start processing for {} *****".format(zst_file))
    with open(zst_filepath+zst_file, 'rb') as ifh:
        # The library's default read size is used here; the former
        # read_size=2 pulled the compressed input two bytes at a time.
        with dctx.stream_reader(ifh) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            for d in text_stream:
                line = json.loads(d)
                subreddit, subreddit_id = line['subreddit'], line['subreddit_id']
                if subreddit not in subreddit_srid:
                    subreddit_srid[subreddit] = subreddit_id
                # `or ''` keeps a missing or null field from crashing the run.
                URLs = findURLs(line.get('url') or '') + findURLs(line.get('selftext') or '')
                # A single URL can yield two host names (with and without
                # subdomain); count every occurrence within this post.
                host_counts = Counter(host for url in URLs for host in get_hostname(url))
                for host, hits in host_counts.items():
                    if host in gmm_source_set: # instead of the full news_sources
                        ns_subreddit[host][subreddit] += hits
                        subreddit_ns[subreddit][host] += hits
                counter += 1
                if counter%500000 == 0:
                    print("processed {} by {}".format(counter, str(datetime.datetime.now())[11:19]))

    print("-------------------------------- Done reading, will write files now --------------------------------")
    # Report the total before `counter` is reset below; otherwise the
    # per-month post count is lost.
    print("total posts read for {}: {}".format(zst_file, counter))

    # write into files separated by months
    with open("ns_subreddit_{}.json".format(month_tag), "w", encoding="utf-8") as outfile:
        json.dump(ns_subreddit, outfile, indent=4)

    with open("subreddit_ns_{}.json".format(month_tag), "w", encoding = "utf-8") as outfile1:
        json.dump(subreddit_ns, outfile1, indent=4)

    # `subreddit_srid` is not reset between files, so each month's file holds
    # every subreddit seen so far in this run.
    with open ("subreddit_srid_{}.json".format(month_tag), "w", encoding = "utf-8") as outfile_srid:
        json.dump(subreddit_srid, outfile_srid, indent=4)

    posts_with_urls = list()
    counter = 0

    print("----------------------------------------------------------------------------------------")
    print("-------------------------------- Done processing for {} --------------------------------".format(zst_file))
    print("----------------------------------------------------------------------------------------")

print("finish time:", datetime.datetime.now())

start time: 2022-02-10 00:08:52.617949
***** Start processing for RS_2021-02.zst *****
processed 500000 by 00:10:15
processed 1000000 by 00:11:32
processed 1500000 by 00:12:48
processed 2000000 by 00:14:08
processed 2500000 by 00:15:28
processed 3000000 by 00:16:39
processed 3500000 by 00:17:49
processed 4000000 by 00:19:00
processed 4500000 by 00:20:09
processed 5000000 by 00:21:19
processed 5500000 by 00:22:31
processed 6000000 by 00:23:43
processed 6500000 by 00:24:51
processed 7000000 by 00:26:04
processed 7500000 by 00:27:17
processed 8000000 by 00:28:32
processed 8500000 by 00:29:44
processed 9000000 by 00:30:58
processed 9500000 by 00:32:11
processed 10000000 by 00:33:22
processed 10500000 by 00:34:38
processed 11000000 by 00:35:50
processed 11500000 by 00:37:07
processed 12000000 by 00:38:21
processed 12500000 by 00:39:33
processed 13000000 by 00:40:47
processed 13500000 by 00:42:01
processed 14000000 by 00:43:10
processed 14500000 by 00:44:19
processed 15000000 by 00:45:25
pro

In [None]:
"tunein.com" in news_sources

In [19]:
# with open ("subreddit_srid_{}.json".format(zst_file[3:10]), "w", encoding = "utf-8") as infile_srid:
#     json.dump(subreddit_srid, infile_srid)
        
# with open("D:/thesis_data/posts_with_urls_{}.json".format(zst_file[3:10]), "w", encoding = "utf-8") as outfile2:
#     json.dump(posts_with_urls, outfile2)

In [18]:
zst_file[3:10]

'2021-03'

In [17]:
counter

0

Number of posts read from 
1. January 2021:
2. February 2021: 31,161,912
3. March 2021: 33,006,103
4. April 2021:
5. May 2021: >36M
6. June 2021: >34M

In [24]:
# Save the subreddit -> id mapping under the 2021-04 tag.
srid_path = "subreddit_srid_{}.json".format("2021-04")
with open(srid_path, "w", encoding = "utf-8") as srid_out:
    json.dump(subreddit_srid, srid_out)

In [21]:
len(posts_with_urls)

588719

In [112]:
len(ns_subreddit)

3686

3686 news sources

In [114]:
len(subreddit_ns)

20755

20755 subreddits

In [115]:
len(posts_with_urls)

811504

In [117]:
# Save the news source -> subreddit counts.
ns_subreddit_path = "ns_subreddit.json"
with open(ns_subreddit_path, "w", encoding="utf-8") as ns_out:
    json.dump(ns_subreddit, ns_out)

In [118]:
# Save the subreddit -> news source counts.
subreddit_ns_path = "subreddit_ns.json"
with open(subreddit_ns_path, "w", encoding = "utf-8") as sr_out:
    json.dump(subreddit_ns, sr_out)

In [119]:
# Save the collected posts.
# NOTE(review): this file name has no ".json" extension - confirm that is intended.
posts_path = "posts_with_urls"
with open(posts_path, "w", encoding = "utf-8") as posts_out:
    json.dump(posts_with_urls, posts_out)

In [120]:
counter

31616206

# Save the number of posts read.
counter_path = "counter_april21.json"
with open(counter_path, "w", encoding = "utf-8") as counter_out:
    json.dump(counter, counter_out)

**Getting subreddit names and ids**

In [8]:
# subreddit name -> set of subreddit ids seen for that name
subreddit_id = defaultdict(set)
# subreddit id -> set of subreddit names seen for that id
id_subreddit = defaultdict(set)

In [13]:
print("start time:", datetime.datetime.now())

# Stream one monthly dump and record every (subreddit name, subreddit id)
# pair in both directions.
counter = 0
with open("D://Wellesley/F21/thesis_zst_data/RS_2021-04.zst", 'rb') as ifh:
    # The library's default read size is used here; the former read_size=2
    # pulled the compressed input two bytes at a time.
    with dctx.stream_reader(ifh) as reader:
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        for d in text_stream:
            line = json.loads(d)
            sr, sr_id = line['subreddit'], line['subreddit_id']
            subreddit_id[sr].add(sr_id)
            id_subreddit[sr_id].add(sr)
            counter += 1
            if counter%500000 == 0: print(f"at {counter}")

print("finish time:", datetime.datetime.now())

start time: 2021-11-05 01:59:23.609969
at 500000
at 1000000
at 1500000
at 2000000
at 2500000
at 3000000
at 3500000
at 4000000
at 4500000
at 5000000
at 5500000
at 6000000
at 6500000
at 7000000
at 7500000
at 8000000
at 8500000
at 9000000
at 9500000
at 10000000
at 10500000
at 11000000
at 11500000
at 12000000
at 12500000
at 13000000
at 13500000
at 14000000
at 14500000
at 15000000
at 15500000
at 16000000
at 16500000
at 17000000
at 17500000
at 18000000
at 18500000
at 19000000
at 19500000
at 20000000
at 20500000
at 21000000
at 21500000
at 22000000
at 22500000
at 23000000
at 23500000
at 24000000
at 24500000
at 25000000
at 25500000
at 26000000
at 26500000
at 27000000
at 27500000
at 28000000
at 28500000
at 29000000
at 29500000
at 30000000
at 30500000
at 31000000
at 31500000
finish time: 2021-11-05 03:35:19.988106


In [14]:
len(subreddit_id)

639811

In [15]:
len(id_subreddit)

639811

In [17]:
# Report every subreddit name that maps to anything other than exactly one id.
for name, ids in subreddit_id.items():
    n_ids = len(ids)
    if n_ids != 1:
        print(f"{name} is invalid, length {n_ids}")

In [18]:
# Report every subreddit id that maps to anything other than exactly one name.
for sr_key, names in id_subreddit.items():
    n_names = len(names)
    if n_names != 1:
        print(f"{sr_key} is invalid, length {n_names}")

Each value is currently a one-element set; let's replace it with the string it contains.

In [20]:
# Replace each set of ids with its first element.
# The isinstance guard keeps the cell safe to re-run: without it, a second
# run would apply list(...)[0] to the already-converted string and keep only
# its first character.
for s in subreddit_id:
    if isinstance(subreddit_id[s], (set, frozenset)):
        subreddit_id[s] = list(subreddit_id[s])[0]

In [21]:
# Replace each set of names with its first element.
# The isinstance guard keeps the cell safe to re-run: without it, a second
# run would apply list(...)[0] to the already-converted string and keep only
# its first character.
for i in id_subreddit:
    if isinstance(id_subreddit[i], (set, frozenset)):
        id_subreddit[i] = list(id_subreddit[i])[0]

There are the same number of `id`s and `subreddit`s. Good!

In [22]:
# Save the subreddit name -> id mapping.
subreddit_id_path = "subreddit_id.json"
with open(subreddit_id_path, "w", encoding = "utf-8") as si_out:
    json.dump(subreddit_id, si_out)

In [23]:
# Save the subreddit id -> name mapping.
id_subreddit_path = "id_subreddit.json"
with open(id_subreddit_path, "w", encoding = "utf-8") as is_out:
    json.dump(id_subreddit, is_out)