# Get all relevant submissions

Author: Junita Sirait

I am tired of parsing the whole six months of data each time I need additional information from the JSON files. Here I will master the art of `ndjson` dump and load, so that in the future I can just process these files.

Table of contents:
1. [Reading the original massive files](#sub1)

In [1]:
# !pip install ndjson

Collecting ndjson
  Downloading ndjson-0.3.1-py2.py3-none-any.whl (5.3 kB)
Installing collected packages: ndjson
Successfully installed ndjson-0.3.1


You should consider upgrading via the 'C:\Users\User200803\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [3]:
import ndjson

import json
import pandas as pd
import zstandard as zstd
import io

from collections import defaultdict, Counter
from urllib.parse import urlparse
import re
import datetime, time
import tldextract

## Reading the original massive files

In [4]:
# Load the collection of news-source hostnames; extracted hostnames are
# membership-tested against it further down (`url in gmm_news_sources`).
# The encoding is given explicitly so decoding does not depend on the
# machine's locale default, matching the utf-8 used when this notebook writes JSON.
with open("D:\\Wellesley\\F21\\thesis\\data\\gmm_intersection.json", "r", encoding="utf-8") as infile:
    gmm_news_sources = json.load(infile)

In [5]:
# One decompressor shared by every monthly dump. The window limit is raised to
# 2**31 bytes (2 GiB).
# NOTE(review): presumably the dumps were compressed with a long window that the
# library's default limit would reject -- confirm.
dctx = zstd.ZstdDecompressor(max_window_size=2 ** 31)

In [6]:
# Compiled once at definition time instead of on every call. Written as a raw
# string so the backslashes reach the regex engine untouched; the pattern is
# exactly the one the previous non-raw literal produced, without relying on
# invalid string escapes that newer Python versions warn about.
# NOTE(review): inside the character class, "+-=" is a *range* (from "+" to "="),
# so "," and "<" are also accepted as URL characters. Kept as-is so results stay
# comparable with files already produced.
_URL_PATTERN = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)')


def findURLs(phrase):
    """Return every http(s) URL found in ``phrase``, in order of appearance.

    Parameters
    ----------
    phrase : str
        Free text to scan (e.g. a post's ``url`` or ``selftext`` field).

    Returns
    -------
    list of str
        The full text of each match; an empty list when nothing matches.
    """
    # findall() yields one tuple per match (the pattern has several groups);
    # group 0 of each tuple is the complete URL.
    return [groups[0] for groups in _URL_PATTERN.findall(phrase)]

In [7]:
def get_hostname(url, uri_type='both'):
    """Return the hostname variants of ``url`` as a set.

    The URL is split with ``tldextract.extract``. When a suffix is present the
    result contains ``domain.suffix`` and, if there is a subdomain, also
    ``subdomain.domain.suffix``. A leading ``"www."`` is dropped from each
    variant and surrounding ``"/"`` characters are stripped.

    Parameters
    ----------
    url : str
        The URL (or bare host) to analyse.
    uri_type : str, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    set of str
        Zero, one or two hostnames. Empty when no suffix could be extracted.
    """
    extracted = tldextract.extract(url)
    # Attribute access instead of 3-way tuple unpacking: unpacking only works
    # while the result is an iterable of exactly three fields.
    # NOTE(review): newer tldextract releases reportedly changed that -- confirm.
    subdomain = extracted.subdomain
    domain = extracted.domain
    suffix = extracted.suffix

    hostnames = set()
    if len(suffix) == 0:
        # Without a recognised suffix neither variant is recorded.
        return hostnames

    def _normalise(host):
        """Drop a leading 'www.' and any surrounding slashes."""
        if host.startswith("www."):
            host = host[4:]
        return host.strip('/')

    # with subdomain
    if len(subdomain) > 0:
        hostnames.add(_normalise(f"{subdomain}.{domain}.{suffix}"))
    # without subdomain
    hostnames.add(_normalise(f"{domain}.{suffix}"))
    return hostnames

In [10]:
# Directory holding the compressed monthly submission dumps (trailing slash
# included because file names are appended by plain string concatenation).
zst_filepath = "E:/thesis_data/"

# Dumps are processed in list order; note that January comes last.
zst_files = [
    "RS_2021-02.zst",
    "RS_2021-03.zst",
    "RS_2021-04.zst",
    "RS_2021-05.zst",
    "RS_2021-06.zst",
    "RS_2021-01.zst",
]

In [14]:
# Per-subreddit post tally: subreddit name -> {month string -> number of posts}.
# Filled in by the processing loop below; re-run this cell before re-running
# that loop, otherwise the counts accumulate on top of the previous run.
subreddits_total_activity = defaultdict(dict)

In [19]:
# Stream every monthly dump once. For each post:
#   * bump the per-subreddit, per-month tally in `subreddits_total_activity`;
#   * if the post links to at least one hostname in `gmm_news_sources`, write the
#     whole post (once) to that month's ndjson file.
#
# NOTE(review): this cell is not safe to re-run as-is. The output files are opened
# in append mode and `subreddits_total_activity` is created in another cell, so a
# second run duplicates rows and double-counts. Delete the ndjson files and reset
# the tally first.
print("start time:", datetime.datetime.now())

# Hash-based lookup built once, instead of testing membership against the raw
# JSON container for every hostname of every post.
news_source_hosts = set(gmm_news_sources)

for zst_file in zst_files:
    counter = 0  # posts read from this dump
    added = 0    # posts written to this month's ndjson file
    month = zst_file[8:10]  # e.g. "RS_2021-02.zst" -> "02"
    with open ("E:/relevant_posts_{}.ndjson".format(month), "a", encoding="utf-8") as ndjfile:
        writer = ndjson.writer(ndjfile, ensure_ascii=False)
        print("***** Start processing for {} *****".format(zst_file))
        with open(zst_filepath+zst_file, 'rb') as ifh:
            # Library-default read size: the previous read_size=2 fed the
            # decompressor two compressed bytes per read.
            with dctx.stream_reader(ifh) as reader:
                text_stream = io.TextIOWrapper(reader, encoding='utf-8')
                for d in text_stream:
                    line = json.loads(d)

                    # Tolerate posts whose url/selftext is missing or null rather
                    # than aborting a multi-hour run on a single record.
                    links = findURLs(line.get('url') or '') + findURLs(line.get('selftext') or '')
                    linked_hosts = {host for link in links for host in get_hostname(link)}

                    # Every post counts towards its subreddit's activity,
                    # whether or not it links to a news source.
                    activity = subreddits_total_activity[line["subreddit"]]
                    activity[month] = activity.get(month, 0) + 1

                    # Write the post once, however many news sources it links to.
                    if not linked_hosts.isdisjoint(news_source_hosts):
                        writer.writerow(line)
                        added += 1

                    counter += 1
                    if counter%500000 == 0:
                        print("processed {}; added {}; \t by {}".format(counter, added, str(datetime.datetime.now())[11:19]))

        print("****************************************** Summary ******************************************")
        print(f"There are {counter} total posts in {month}, and {added} of them have urls to our news sources.")

        print("----------------------------------------------------------------------------------------")
        print("-------------------------------- Done processing for {} --------------------------------".format(zst_file))
        print("----------------------------------------------------------------------------------------")

print("finish time:", datetime.datetime.now())

start time: 2022-03-01 01:15:20.614207
***** Start processing for RS_2021-02.zst *****
processed 500000; added 15206; 	 by 01:16:28
processed 1000000; added 31757; 	 by 01:17:33
processed 1500000; added 46774; 	 by 01:18:38
processed 2000000; added 64900; 	 by 01:19:46
processed 2500000; added 82193; 	 by 01:20:55
processed 3000000; added 100162; 	 by 01:22:01
processed 3500000; added 116898; 	 by 01:23:06
processed 4000000; added 132737; 	 by 01:24:12
processed 4500000; added 151805; 	 by 01:25:18
processed 5000000; added 168203; 	 by 01:26:23
processed 5500000; added 187077; 	 by 01:27:30
processed 6000000; added 203708; 	 by 01:28:35
processed 6500000; added 218943; 	 by 01:29:39
processed 7000000; added 232506; 	 by 01:30:46
processed 7500000; added 247618; 	 by 01:31:52
processed 8000000; added 261426; 	 by 01:32:59
processed 8500000; added 277524; 	 by 01:34:05
processed 9000000; added 294940; 	 by 01:35:11
processed 9500000; added 312222; 	 by 01:36:19
processed 10000000; added 

processed 11000000; added 374472; 	 by 04:03:30
processed 11500000; added 387912; 	 by 04:04:38
processed 12000000; added 407595; 	 by 04:05:47
processed 12500000; added 424320; 	 by 04:06:55
processed 13000000; added 444953; 	 by 04:08:05
processed 13500000; added 462213; 	 by 04:09:13
processed 14000000; added 482205; 	 by 04:10:21
processed 14500000; added 500588; 	 by 04:11:29
processed 15000000; added 520377; 	 by 04:12:38
processed 15500000; added 538606; 	 by 04:13:46
processed 16000000; added 557525; 	 by 04:14:54
processed 16500000; added 575741; 	 by 04:16:01
processed 17000000; added 592386; 	 by 04:17:08
processed 17500000; added 606400; 	 by 04:18:16
processed 18000000; added 621456; 	 by 04:19:22
processed 18500000; added 635447; 	 by 04:20:30
processed 19000000; added 653261; 	 by 04:21:37
processed 19500000; added 670837; 	 by 04:22:46
processed 20000000; added 689119; 	 by 04:23:52
processed 20500000; added 707638; 	 by 04:25:00
processed 21000000; added 724600; 	 by 0

processed 19000000; added 446134; 	 by 06:53:44
processed 19500000; added 457574; 	 by 06:54:50
processed 20000000; added 472174; 	 by 06:55:58
processed 20500000; added 483208; 	 by 06:57:05
processed 21000000; added 495841; 	 by 06:58:12
processed 21500000; added 506892; 	 by 06:59:16
processed 22000000; added 516365; 	 by 07:00:21
processed 22500000; added 525767; 	 by 07:01:28
processed 23000000; added 534599; 	 by 07:02:32
processed 23500000; added 543898; 	 by 07:03:40
processed 24000000; added 554253; 	 by 07:04:46
processed 24500000; added 567282; 	 by 07:05:54
processed 25000000; added 578592; 	 by 07:07:00
processed 25500000; added 592636; 	 by 07:08:08
processed 26000000; added 605019; 	 by 07:09:16
processed 26500000; added 620196; 	 by 07:10:25
processed 27000000; added 634158; 	 by 07:11:34
processed 27500000; added 647865; 	 by 07:12:43
processed 28000000; added 662193; 	 by 07:13:51
processed 28500000; added 675309; 	 by 07:15:00
processed 29000000; added 688749; 	 by 0

In [27]:
len(subreddits_total_activity)

2361255

In [37]:
subreddits_total_activity["twilight"]

{'02': 490, '03': 537, '04': 478, '05': 455, '06': 451, '01': 589}

In [21]:
# Persist the per-subreddit monthly tallies so the dumps need not be re-read.
with open("subreddits_total_activity.json", mode="w", encoding="utf-8") as activity_file:
    json.dump(subreddits_total_activity, fp=activity_file)

```
Month            |    with news  |      total    |    % url with news    |
-------------------------------------------------------------------------|
January          |    1137576    |    32704571   |         3.48%         |
February         |    1026958    |    31147947   |         3.30%         |
March            |    1155554    |    33006103   |         3.50%         |
April            |    1090699    |    31616206   |         3.45%         |
May              |    1019503    |    36310673   |         2.81%         |
June             |     811758    |    34118481   |         2.38%         |
```

In [26]:
811758/34118481

0.023792325338282204

In [26]:
"wikipedia.org" in gmm_news_sources

True

In [18]:
counter

235400