# Load JSONL data from a ZST archive

Relevant imports:

In [1]:
import os
import pandas as pd
from typing import Any
from pathlib import Path
from redarch.dev.io import ZST_JSONL

## Archive reader

Replace the following path with the directory that contains the ZST archives you want to load.

In [2]:
# Directory holding the Pushshift submission archives, located under the
# current user's home directory. Change this to point at your own copy.
reddit_submissions_dir = Path.home().joinpath(
    "user", "data", "external", "pushshift", "reddit", "submissions"
)

It should look something like this.

In [3]:
# Peek at three entries; note that os.listdir returns names in arbitrary order.
os.listdir(reddit_submissions_dir)[:3]

['RS_2013-09.zst', 'RS_2022-01.zst', 'RS_2008-05.zst']

This is the file I'm using here.

In [4]:
# Gather every .zst archive in the directory. Wrapping in Path(...) means a
# plain string path works here too.
post_files: list[Path] = list(Path(reddit_submissions_dir).glob("*.zst"))
# Use the first match as the sample. The listing is not sorted, so which file
# comes first may differ from one machine to another.
sample_file: Path = post_files[0]
print(sample_file.name)

RS_2013-09.zst


We'll pass that path to `ZST_JSONL` to create a reader instance.

In [5]:
# Open a reader over the sample archive.
reader = ZST_JSONL(sample_file)

## Basic usage

The reader works much like other Pythonic file readers. For instance, you can read one line at a time like so:


In [6]:
reader = ZST_JSONL(sample_file)

# Pull the first two records, one readline() call per record, then print each
# one next to its position in the file.
l0, l1 = reader.readline(), reader.readline()
for idx, line in enumerate((l0, l1)):
    print(idx, line)

0 {'ups': 2, 'author_flair_css_class': None, 'media': None, 'link_flair_css_class': None, 'is_self': True, 'created_utc': 1378079998, 'selftext': '', 'subreddit': 'AskReddit', 'stickied': False, 'subreddit_id': 't5_2qh1i', 'title': 'Tell me reddit, what is on your mind right now?', 'author': '98092834092', 'url': 'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'secure_media': None, 'link_flair_text': None, 'author_flair_text': None, 'banned_by': None, 'domain': 'self.AskReddit', 'score': 2, 'permalink': '/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'over_18': False, 'num_comments': 5, 'id': '1ljnel', 'user_reports': [], 'gilded': 0, 'media_embed': {}, 'secure_media_embed': {}, 'edited': False, 'downs': 0, 'distinguished': None, 'report_reasons': None, 'thumbnail': 'self', 'mod_reports': [], 'selftext_html': None, 'retrieved_on': 1411870497}
1 {'secure_media_embed': {}, 'media_embed': {}, 'gilded': 0, '

Or read multiple lines to a list:

In [7]:
reader = ZST_JSONL(sample_file)

# Read several records in one call; the result prints as a list of dicts.
threelines = reader.readlines(n=3)
print(threelines)

[{'ups': 2, 'author_flair_css_class': None, 'media': None, 'link_flair_css_class': None, 'is_self': True, 'created_utc': 1378079998, 'selftext': '', 'subreddit': 'AskReddit', 'stickied': False, 'subreddit_id': 't5_2qh1i', 'title': 'Tell me reddit, what is on your mind right now?', 'author': '98092834092', 'url': 'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'secure_media': None, 'link_flair_text': None, 'author_flair_text': None, 'banned_by': None, 'domain': 'self.AskReddit', 'score': 2, 'permalink': '/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'over_18': False, 'num_comments': 5, 'id': '1ljnel', 'user_reports': [], 'gilded': 0, 'media_embed': {}, 'secure_media_embed': {}, 'edited': False, 'downs': 0, 'distinguished': None, 'report_reasons': None, 'thumbnail': 'self', 'mod_reports': [], 'selftext_html': None, 'retrieved_on': 1411870497}, {'secure_media_embed': {}, 'media_embed': {}, 'gilded': 0, 'nu

You can also iterate directly over the JSON objects in the ZST file:

```python
for obj in reader:
    ...
```

For example, printing some lines:

In [8]:
reader = ZST_JSONL(sample_file)

# Iterating the reader yields one parsed JSON object per step. Print the
# first four (indices 0 through 3) and stop; enumerate replaces the
# hand-maintained counter.
for count, obj in enumerate(reader):
    print(obj)
    if count == 3:
        break

{'ups': 2, 'author_flair_css_class': None, 'media': None, 'link_flair_css_class': None, 'is_self': True, 'created_utc': 1378079998, 'selftext': '', 'subreddit': 'AskReddit', 'stickied': False, 'subreddit_id': 't5_2qh1i', 'title': 'Tell me reddit, what is on your mind right now?', 'author': '98092834092', 'url': 'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'secure_media': None, 'link_flair_text': None, 'author_flair_text': None, 'banned_by': None, 'domain': 'self.AskReddit', 'score': 2, 'permalink': '/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/', 'over_18': False, 'num_comments': 5, 'id': '1ljnel', 'user_reports': [], 'gilded': 0, 'media_embed': {}, 'secure_media_embed': {}, 'edited': False, 'downs': 0, 'distinguished': None, 'report_reasons': None, 'thumbnail': 'self', 'mod_reports': [], 'selftext_html': None, 'retrieved_on': 1411870497}
{'secure_media_embed': {}, 'media_embed': {}, 'gilded': 0, 'num_

You can reset the cursor to the start of the file with:

In [9]:
# Rewind the reader so the next read starts from the first record again.
reader.reset()

## Reading a sample

You can load in a sample of posts like this (see the docstring for more info).

In [10]:
# Load a small sample of records into a list (see the sample() docstring for
# the exact meaning of `stop`).
data = reader.sample(stop=20)

100%|██████████| 20/20 [00:00<00:00, 11737.24it/s]


Here's an example.

In [11]:
# Inspect the first sampled record.
data[0]

{'ups': 2,
 'author_flair_css_class': None,
 'media': None,
 'link_flair_css_class': None,
 'is_self': True,
 'created_utc': 1378079998,
 'selftext': '',
 'subreddit': 'AskReddit',
 'stickied': False,
 'subreddit_id': 't5_2qh1i',
 'title': 'Tell me reddit, what is on your mind right now?',
 'author': '98092834092',
 'url': 'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/',
 'secure_media': None,
 'link_flair_text': None,
 'author_flair_text': None,
 'banned_by': None,
 'domain': 'self.AskReddit',
 'score': 2,
 'permalink': '/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/',
 'over_18': False,
 'num_comments': 5,
 'id': '1ljnel',
 'user_reports': [],
 'gilded': 0,
 'media_embed': {},
 'secure_media_embed': {},
 'edited': False,
 'downs': 0,
 'distinguished': None,
 'report_reasons': None,
 'thumbnail': 'self',
 'mod_reports': [],
 'selftext_html': None,
 'retrieved_on': 1411870497}

It's important to note that, by default, the stream is reinstantiated internally whenever you read a sample, so every sample starts from the beginning of the file. To read from where you left off instead, set `reset=False`.

In [12]:
# Fresh reader, so the cursor starts at the first record.
reader = ZST_JSONL(sample_file)

In [13]:
# reset=False: read from the current cursor position instead of rewinding first.
reader.sample(stop=0, reset=False)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

In [14]:
# Same call again: with reset=False it continues on to the next record.
reader.sample(stop=0, reset=False)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/A858DE45F56D9BC9/comments/1ljnek/201309020359/'

To ensure the sample is read from the start, leave `reset` at its default.

In [15]:
# Start over with a fresh reader.
reader = ZST_JSONL(sample_file)

In [16]:
# Default reset behaviour: the sample is read from the start of the file.
reader.sample(stop=0)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

In [17]:
# Repeating the call returns the same first record, because the stream is reset.
reader.sample(stop=0)[0]["url"]

0it [00:00, ?it/s]


'http://www.reddit.com/r/AskReddit/comments/1ljnel/tell_me_reddit_what_is_on_your_mind_right_now/'

We can pass a custom parser to control the data structure, which helps with filtering out unneeded data and validating object schemas. A simple example is included below.

In [18]:
def custom_parser(obj: dict[str, Any]) -> dict[str, Any]:
    """Reduce a raw submission record to the fields used in this notebook.

    Args:
        obj: One decoded JSON object from the archive.

    Returns:
        A new dict holding only the ``subreddit`` and ``score`` entries,
        in that order.

    Raises:
        KeyError: If ``obj`` is missing either key.
    """
    wanted = ("subreddit", "score")
    return {field: obj[field] for field in wanted}

In [19]:
# Re-sample with the custom parser so each record keeps only the parsed fields.
data = reader.sample(stop=10_000, parser=custom_parser)

100%|██████████| 10000/10000 [00:00<00:00, 191233.44it/s]


In [20]:
# Check the first few parsed records.
data[:3]

[{'subreddit': 'AskReddit', 'score': 2},
 {'subreddit': 'A858DE45F56D9BC9', 'score': 5},
 {'subreddit': 'videos', 'score': 0}]

In [21]:
# One row per parsed record, one column per key returned by the parser.
df = pd.DataFrame(data)

In [22]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,subreddit,score
0,AskReddit,2
1,A858DE45F56D9BC9,5
2,videos,0
3,tits,6
4,WritersRoom,1
5,Invites,3
6,FreeEBOOKS,2
7,foxes,5
8,politics,1
9,Meditation,6


In [23]:
# Count the submissions per subreddit in this small sample; value_counts()
# orders the result from most to fewest posts.
post_count = df["subreddit"].value_counts()
# Show the ten most active subreddits.
post_count.head(10)

subreddit
funny              390
AskReddit          387
pics               265
leagueoflegends    244
AdviceAnimals      240
gaming             176
aww                141
POLITIC            122
WTF                113
videos             108
Name: count, dtype: int64

In [24]:
# get the average score per post for each subreddit, for subreddits where the post count >= 100 in this sample
# Average score per post for each subreddit, keeping only subreddits with at
# least MIN_POSTS submissions in this sample.
MIN_POSTS = 100
only_subreddits = post_count[post_count >= MIN_POSTS].index.tolist()

# Named aggregation computes the mean in a single groupby pass and labels the
# column directly, rather than dividing two groupbys and renaming by position.
mean_post_score = (
    df.groupby("subreddit")
    .agg(avg_post_score=("score", "mean"))
    .sort_values(by="avg_post_score", ascending=False)
    .reset_index()
)

# Vectorised membership test instead of a per-row Python lambda; the index is
# renumbered so the table reads 0..n-1 after filtering.
mean_post_score = mean_post_score[
    mean_post_score["subreddit"].isin(only_subreddits)
].reset_index(drop=True)

In [25]:
# Display the filtered per-subreddit averages, highest first.
mean_post_score

Unnamed: 0,subreddit,avg_post_score
0,WTF,86.690265
1,funny,65.984615
2,AdviceAnimals,63.383333
3,gaming,57.176136
4,pics,51.633962
5,aww,42.595745
6,leagueoflegends,16.028689
7,videos,12.768519
8,AskReddit,5.780362
9,fantasyfootball,3.932692
