# Load JSONL data from a ZST archive

Relevant imports:

In [1]:
import os
import pandas as pd
from typing import Any
from pathlib import Path
from redarch.dev.io import ZST_JSONL

## Archive reader

Replace the following path with the directory containing the ZST archives you want to load.

In [2]:
reddit_submissions_dir = Path.home() / "user/data/external/pushshift/reddit/submissions"

It should look something like this.

In [3]:
os.listdir(reddit_submissions_dir)[:3]

['RS_2013-09.zst', 'RS_2022-01.zst', 'RS_2008-05.zst']

This is the file I'm using here.

In [4]:
# Collect every .zst archive in the submissions directory. The results are used
# in whatever order glob() yields them (they are not sorted here).
post_files: list[Path] = list(Path(reddit_submissions_dir).glob("*.zst"))
if not post_files:
    # Fail with a clear message instead of an IndexError on post_files[0].
    raise FileNotFoundError(f"No .zst archives found in {reddit_submissions_dir}")

# The first archive found is the sample file used in the rest of the notebook.
sample_file: Path = post_files[0]
print(sample_file.name)

RS_2013-09.zst


We'll pass that path to `ZST_JSONL` to create a reader instance.

In [5]:
reader = ZST_JSONL(sample_file)

## Basic usage

The reader works more or less like most other pythonic file readers. For instance, you can read in each line like so:


In [6]:
# Start from a fresh reader over the sample archive.
reader = ZST_JSONL(sample_file)

# Each readline() call hands back the next record; the first two are read in turn.
l0 = reader.readline()
# Only the first 100 characters of the record's string form are printed.
print(0, str(l0)[:100], "...")
l1 = reader.readline()
print(1, str(l1)[:100], "...")

0 {'ups': 2, 'author_flair_css_class': None, 'media': None, 'link_flair_css_class': None, 'is_self': T ...
1 {'secure_media_embed': {}, 'media_embed': {}, 'gilded': 0, 'num_comments': 0, 'id': '1ljnek', 'user_ ...


Or read multiple lines to a list:

In [7]:
# Start from a fresh reader over the sample archive.
reader = ZST_JSONL(sample_file)

# Read three records into a list in one call, then print the list's length
# to confirm how many were read.
threelines = reader.readlines(n=3)
print(len(threelines))

3


You can iterate directly over each JSON line object in the ZST file:

```python
for obj in reader:
    ...
```

For example, printing some lines (just the keys from the lines to avoid console overload):

In [8]:
# Start from a fresh reader over the sample archive.
reader = ZST_JSONL(sample_file)

# Print only the keys of the first four records (indices 0-3) to keep the
# output short; enumerate() supplies the counter instead of a manual increment.
for count, obj in enumerate(reader):
    print(obj.keys())
    if count == 3:
        break

dict_keys(['ups', 'author_flair_css_class', 'media', 'link_flair_css_class', 'is_self', 'created_utc', 'selftext', 'subreddit', 'stickied', 'subreddit_id', 'title', 'author', 'url', 'secure_media', 'link_flair_text', 'author_flair_text', 'banned_by', 'domain', 'score', 'permalink', 'over_18', 'num_comments', 'id', 'user_reports', 'gilded', 'media_embed', 'secure_media_embed', 'edited', 'downs', 'distinguished', 'report_reasons', 'thumbnail', 'mod_reports', 'selftext_html', 'retrieved_on'])
dict_keys(['secure_media_embed', 'media_embed', 'gilded', 'num_comments', 'id', 'user_reports', 'over_18', 'permalink', 'score', 'domain', 'retrieved_on', 'selftext_html', 'mod_reports', 'thumbnail', 'report_reasons', 'distinguished', 'downs', 'edited', 'created_utc', 'selftext', 'is_self', 'media', 'link_flair_css_class', 'author_flair_css_class', 'ups', 'author_flair_text', 'banned_by', 'link_flair_text', 'secure_media', 'url', 'subreddit_id', 'title', 'author', 'stickied', 'subreddit'])
dict_keys(

You can reset the cursor to the start of the file with:

In [9]:
reader.reset()

## Reading a sample

In [10]:
# utility function to avoid console overload


def fdict(d: dict, keys: list[Any]) -> dict:
    """Return a new dict holding only the requested ``keys`` of ``d``.

    Args:
        d: The dictionary to pick entries from.
        keys: The keys to keep; any key that is absent from ``d`` is skipped.

    Returns:
        A dict whose entries follow the order of ``keys``.
    """
    subset = {}
    for key in keys:
        if key in d:
            subset[key] = d[key]
    return subset

You can load in a sample of posts like this (see the docstring for more info).

In [11]:
data = reader.sample(stop=20)

100%|██████████| 20/20 [00:00<00:00, 4796.23it/s]


Here's an example.

In [12]:
fdict(data[0], ["title", "subreddit", "url"])

{'title': 'Tell me reddit, what is on your mind right now?',
 'subreddit': 'AskReddit',
 'url': 'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'}

It's important to note that by default, when you read a sample of data, the stream is reinstantiated internally, so sampling starts again from the beginning of the file. To read from where you left off instead, set `reset=False`.

In [13]:
reader = ZST_JSONL(sample_file)

In [14]:
reader.sample(stop=0, reset=False)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

In [15]:
reader.sample(stop=0, reset=False)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/A858DE45F56D9BC9/comments/1ljnek/201309020359/'

To ensure the sample is read from the start, just leave `reset` at its default.

In [16]:
reader = ZST_JSONL(sample_file)

In [17]:
reader.sample(stop=0)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

In [18]:
reader.sample(stop=0)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

We can pass a custom parser to control the data structure, which helps with filtering out unneeded data and validating object schemas. A simple example is included below.

In [19]:
def custom_parser(obj: dict[str, Any]) -> dict[str, Any]:
    """Reduce a record to its ``subreddit`` and ``score`` entries.

    Args:
        obj: The record to trim down.

    Returns:
        A new dict holding only the ``subreddit`` and ``score`` values of ``obj``.

    Raises:
        KeyError: If ``obj`` lacks either key.
    """
    wanted_fields = ("subreddit", "score")
    return {field: obj[field] for field in wanted_fields}

In [20]:
data = reader.sample(stop=10_000, parser=custom_parser)

100%|██████████| 10000/10000 [00:00<00:00, 194526.57it/s]


In [21]:
data[:3]

[{'subreddit': 'AskReddit', 'score': 2},
 {'subreddit': 'A858DE45F56D9BC9', 'score': 5},
 {'subreddit': 'videos', 'score': 0}]

In [22]:
df = pd.DataFrame(data)

In [23]:
df.head(10)

Unnamed: 0,subreddit,score
0,AskReddit,2
1,A858DE45F56D9BC9,5
2,videos,0
3,tits,6
4,WritersRoom,1
5,Invites,3
6,FreeEBOOKS,2
7,foxes,5
8,politics,1
9,Meditation,6


In [24]:
# count the number of submissions from each subreddit in this small sample
# (the counts come back ordered from most to least frequent, so head(10)
# shows the ten subreddits with the most posts)
post_count = df["subreddit"].value_counts()
post_count.head(10)

subreddit
funny              390
AskReddit          387
pics               265
leagueoflegends    244
AdviceAnimals      240
gaming             176
aww                141
POLITIC            122
WTF                113
videos             108
Name: count, dtype: int64

In [25]:
# get the average score per post for each subreddit, for subreddits where the post count >= 100 in this sample

# Subreddits with at least 100 posts in the sample, taken from the counts computed above.
only_subreddits = post_count[post_count >= 100].index.tolist()

# Mean score per subreddit, highest first. Selecting "score" explicitly keeps
# this working even if the frame gains more columns, and reset_index(name=...)
# names the result column directly instead of overwriting columns by position.
mean_post_score = (
    df.groupby("subreddit")["score"]
    .mean()
    .sort_values(ascending=False)
    .reset_index(name="avg_post_score")
)

# Keep only the well-represented subreddits; isin() is the vectorised
# membership test. Renumber the rows afterwards.
mean_post_score = mean_post_score[
    mean_post_score["subreddit"].isin(only_subreddits)
].reset_index(drop=True)

In [26]:
mean_post_score

Unnamed: 0,subreddit,avg_post_score
0,WTF,86.690265
1,funny,65.984615
2,AdviceAnimals,63.383333
3,gaming,57.176136
4,pics,51.633962
5,aww,42.595745
6,leagueoflegends,16.028689
7,videos,12.768519
8,AskReddit,5.780362
9,fantasyfootball,3.932692
