## Notebook to create a subset of OSCAR 22 documents whose domains are listed in Curlie and end with .de

- Author: Hadi Asghari
- Version: 2023.02

- __input__: curlie-ourset.csv  &  oscar 22.01 data 
- __output__: oscar22-subset.jsonl (~100GB uncompressed)


In [1]:
# imports
import json
import pickle
import binascii
import zlib
from time import time
from collections import Counter
from os import path
import pandas as pd
import numpy as np
import tldextract

**Downloading OSCAR 22 data**

- The OSCAR dataset is a curated version of the Common Crawl.
- You can download this data from: https://oscar-project.github.io/documentation/versions/oscar-2201/
- Fields of interest include `content` & `warc_headers/warc-target-uri`


In [None]:
# Load the Curlie domain list; the cells below read its `domain` and `cat` columns.
# The path is written relative to ./data/, matching DATADIR used for the OSCAR parts
# (the previous ".data/..." spelling pointed at a hidden directory and looked like a typo).
curlie = pd.read_csv("./data/curlie-ourset.csv")

In [None]:
# GO OVER THE JSONL OSCAR DATA AND KEEP ROWS WITH URLS IN OUR FILTER SET

# NOTE, Following code requires around 64GB RAM to run
# (this is with the aid of zlib string compression)

DATADIR = "./data/oscar22/"
PARTS = 497
# Sub-domain prefixes stripped (one after the other, in this order) before matching.
STRIP_PREFIXES = ("www.", "m.", "de.")

filter_set = set(curlie.domain)
# subset:  uri -> zlib-compressed content (bytes), or a list of such when the same
#          URI was seen with several distinct contents
# uris:    uri -> number of distinct content versions stored in `subset`
# ignored: URIs whose (normalised) domain is not in the filter set
subset, uris, ignored = dict(), Counter(), set()
dates = Counter()  # crawl day (first 10 chars of warc-date) -> matching records

st = time()
for part in range(1, PARTS + 1):
    # each _part_ takes about 11s, and has about 150k records
    print(part, end=' .. ')
    with open(DATADIR + f"de_meta_part_{part}.jsonl", encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it with readlines().
        for js in f:
            m = json.loads(js)
            content = m['content']
            uri = m['warc_headers']['warc-target-uri']
            dt = m['warc_headers']['warc-date'][:10]

            # check if URI within our target: match either the prefix-stripped
            # host name or its registered domain against the Curlie set
            fdomain = tldextract.extract(uri).fqdn.lower()
            for prefix in STRIP_PREFIXES:
                if fdomain.startswith(prefix):
                    fdomain = fdomain[len(prefix):]
            rdomain = tldextract.extract(fdomain).registered_domain

            if fdomain not in filter_set and rdomain not in filter_set:
                ignored.add(uri)
                continue

            # zlib compression reduces memory to about a third and is relatively fast
            content = zlib.compress(bytes(content, 'utf-8'))

            dates[dt] += 1

            # Store the content unless this exact content is already kept for the URI.
            known = subset.get(uri)
            if known is None:
                subset[uri] = content
            elif isinstance(known, list):
                if content in known:
                    continue  # exact duplicate content, ignore this item
                known.append(content)
            elif known == content:
                continue  # exact duplicate content, ignore this item
            else:
                # second distinct version: keep all content versions in a list
                subset[uri] = [known, content]
            # Counted only when a version was actually stored, so exact duplicates
            # never inflate the per-URI count.
            uris[uri] += 1

print("=> done in ", round(time()-st), "seconds.\n")  # ~6000s
if dates:
    # crawl dates (two weeks between 26.11.2021 to 09.12.2021)
    print("Dates: ", min(dates), max(dates))
else:
    print("Dates:  none (no record matched the filter set)")

# Optional checkpoint so the filtering pass does not have to be repeated:
# with open("subset-curlide.p", "wb") as f:
#    pickle.dump(subset, f)

# Summary statistics: kept vs. ignored URIs, then the URIs counted more than once.
print("URIs in/out:", len(uris), len(ignored))
dups = [
    uri
    for uri, n_seen in uris.items()
    if n_seen >= 2
]
print("Duplicate URIs:", len(dups))  # 32804  (<1%)

In [None]:
# SAVE THE FILTERED SUBSET TO A NEW JSONL FILE
# - regarding duplicate URIs: we keep the first (<1%)
# - the jsonl's will be uncompressed (size: ~100GB)
# - we'll keep the default dictionary order (insertion order, i.e. as read/added;
#   guaranteed since py3.7) vs sorting by domain or category;

if 'curix' not in locals():
    # domain-indexed view of the Curlie table, used for the category lookup below
    curix = curlie.set_index('domain')

st = time()

OUTFILE = "oscar22-subset.jsonl"
# Mode "x" (exclusive creation) raises FileExistsError instead of overwriting an
# earlier export; unlike an `assert`, this guard is not stripped under `python -O`.
with open(OUTFILE, "xt", encoding="utf-8") as f:
    for i, (url, content) in enumerate(subset.items()):
        if i % 100000 == 0:  # takes around ~12s
            print(".", end="", flush=True)
        if isinstance(content, list):
            content = content[0]  # several versions stored: keep the first
        content = zlib.decompress(content).decode('utf-8')

        domain = tldextract.extract(url).registered_domain.lower()
        if domain not in curix.index:
            # The filtering step accepts a URL when either its registered domain or
            # its prefix-stripped host name is listed in Curlie. Fall back to the
            # latter here so such rows do not abort the export with a KeyError.
            host = tldextract.extract(url).fqdn.lower()
            for prefix in ("www.", "m.", "de."):
                if host.startswith(prefix):
                    host = host[len(prefix):]
            domain = host
        domcat = curix.loc[domain, 'cat']

        js = json.dumps({'url': url, 'domain': domain, 'domcat': domcat, 'content': content})
        f.write(js + "\n")

print("\n=> wrote jsonl in", round(time()-st), "secs.")  # 1922s