# Submission Schema Creation

In [14]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from redarch.dev.io import ZST_LINES, ZST_JSONL

Pass path to submissions files

In [4]:
reddit_submissions_dir = Path.home() / "user/data/external/pushshift/reddit/submissions"

Define field analysis function

In [5]:
def analyze_field_types(
    parent_dir: Path,
    samples_per_file: int = 10_000,
) -> pd.DataFrame:
    """"""
    files = list(Path(parent_dir).glob("*.zst"))
    samples = list()
    for path in tqdm(files, desc="Loading Samples"):
        reader = ZST_JSONL(path)
        samples.extend(reader.sample(stop=samples_per_file, progress=False))

    fields: dict[str, dict[str, int | set[str]]] = dict()
    for sample in tqdm(samples, desc="Assessing Types"):
        for k, v in sample.items():
            if k not in fields:
                fields[k] = {
                    "types": {v.__class__.__name__},
                    "count.has.key": 1,
                    "count.has.value": int(v.__class__.__name__ != "NoneType"),
                }
            else:
                fields[k]["types"].add(v.__class__.__name__)  # type: ignore
                fields[k]["count.has.key"] += 1  # type: ignore
                fields[k]["count.has.value"] += int(v.__class__.__name__ != "NoneType")  # type: ignore
    n_samples = len(samples)
    stats_df = pd.DataFrame(
        [
            {
                "field.name": key,
                "field.types": val["types"],
                "count.has.key": val["count.has.key"],
                "count.has.value": val["count.has.value"],
            }
            for key, val in fields.items()
        ]
    )
    stats_df["perc.has.key"] = stats_df["count.has.key"] / n_samples
    stats_df["perc.has.value"] = stats_df["count.has.value"] / stats_df["count.has.key"]
    # clear memory
    del samples, fields
    return stats_df

Run over samples

In [6]:
ftdf = analyze_field_types(reddit_submissions_dir, 1000)

Loading Samples: 100%|██████████| 213/213 [00:02<00:00, 79.78it/s] 
Assessing Types: 100%|██████████| 212315/212315 [00:02<00:00, 72150.70it/s]


## Combinatoric Field Sets

- A: All Keys Present | `perc.has.key == 1`
    - A: All Values Present | `perc.has.value == 1`
    - S: Some Values Present | `0 < perc.has.value < 1`
    - N: No Values Present | `perc.has.value == 0`
- S: Some Keys Present | `perc.has.key < 1.0`
    - A: All Values Present (where key is present) | `perc.has.value == 1`
    - S: Some Values Present (where key is present) | `0 < perc.has.value < 1`
    - N: No Values Present (where key is present) | `perc.has.value == 0`
- N: No Keys Present -> doesn't make sense, the set of missing keys is the infinite set of all possible keys that are not represented in this dataset

Six possibilities: AA, AS, AN, SA, SS, SN

Or, another way:

- AA: Required field
- AS/SA/SS: Optional field
- AN/SN: Drop these fields, no data available just an empty field.



In [7]:
required_fields = ftdf[(ftdf["perc.has.key"] == 1) & (ftdf["perc.has.value"] == 1)]
optional_fields = ftdf[(ftdf["perc.has.key"] != 1) & (ftdf["perc.has.value"] > 0)]
dropped_fields = ftdf[ftdf["perc.has.value"] == 0]

In [8]:
required_fields

Unnamed: 0,field.name,field.types,count.has.key,count.has.value,perc.has.key,perc.has.value
4,is_self,{bool},212315,212315,1.0,1.0
5,created_utc,"{str, int}",212315,212315,1.0,1.0
6,selftext,{str},212315,212315,1.0,1.0
10,title,{str},212315,212315,1.0,1.0
18,score,{int},212315,212315,1.0,1.0
19,permalink,{str},212315,212315,1.0,1.0
20,over_18,{bool},212315,212315,1.0,1.0
21,num_comments,{int},212315,212315,1.0,1.0
22,id,{str},212315,212315,1.0,1.0
25,media_embed,{dict},212315,212315,1.0,1.0


In [10]:
dropped_fields.head(10)

Unnamed: 0,field.name,field.types,count.has.key,count.has.value,perc.has.key,perc.has.value
16,banned_by,{NoneType},58928,0,0.27755,0.0
30,report_reasons,{NoneType},18888,0,0.088962,0.0
76,removed_by,{NoneType},28028,0,0.132011,0.0
92,view_count,{NoneType},23023,0,0.108438,0.0
118,adserver_click_url,{NoneType},252,0,0.001187,0.0
119,adserver_imp_pixel,{NoneType},252,0,0.001187,0.0
123,embed_type,{NoneType},220,0,0.001036,0.0
124,embed_url,{NoneType},220,0,0.001036,0.0
130,original_link,{NoneType},229,0,0.001079,0.0
132,priority_id,{NoneType},205,0,0.000966,0.0


In [11]:
optional_fields.head(10)

Unnamed: 0,field.name,field.types,count.has.key,count.has.value,perc.has.key,perc.has.value
0,ups,{int},74944,74944,0.352985,1.0
7,subreddit,{str},212268,212268,0.999779,1.0
8,stickied,{bool},192295,192295,0.905706,1.0
9,subreddit_id,"{str, NoneType}",212286,212268,0.999863,0.999915
11,author,{str},212296,212296,0.999911,1.0
13,secure_media,"{dict, NoneType}",192295,11131,0.905706,0.057885
23,user_reports,{list},18888,18888,0.088962,1.0
24,gilded,{int},192295,192295,0.905706,1.0
26,secure_media_embed,{dict},192295,192295,0.905706,1.0
28,downs,{int},74944,74944,0.352985,1.0


Can now deterministically generate output schema

In [12]:
def format_types(types: set[str], optional: bool = False) -> str:
    if len(types) == 1:
        typehint = list(types)[0]
    else:
        typehint = " | ".join([t for t in types if t != "NoneType"])
    if optional:
        return f"{typehint} | None = None"
    else:
        return typehint


struct_template = """
class Submission(Struct):

    # Required Fields
{required_field_attributes}

    # Optional Fields
{optional_field_attributes}
""".strip()

required_field_attributes = list()
for field in required_fields.to_dict(orient="records"):
    required_field_attributes.append(
        f"    {field['field.name']}: {format_types(field['field.types'])}"
    )
required_field_attributes = "\n".join(required_field_attributes)
optional_field_attributes = list()
for field in optional_fields.to_dict(orient="records"):
    optional_field_attributes.append(
        f"    {field['field.name']}: {format_types(field['field.types'], optional=True)}"
    )
optional_field_attributes = "\n".join(optional_field_attributes)

In [13]:
print(
    struct_template.format(
        required_field_attributes=required_field_attributes,
        optional_field_attributes=optional_field_attributes,
    ),
)

class Submission(Struct):

    # Required Fields
    is_self: bool
    created_utc: str | int
    selftext: str
    title: str
    score: int
    permalink: str
    over_18: bool
    num_comments: int
    id: str
    media_embed: dict
    edited: bool | int | float
    thumbnail: str

    # Optional Fields
    ups: int | None = None
    subreddit: str | None = None
    stickied: bool | None = None
    subreddit_id: str | None = None
    author: str | None = None
    secure_media: dict | None = None
    user_reports: list | None = None
    gilded: int | None = None
    secure_media_embed: dict | None = None
    downs: int | None = None
    mod_reports: list | None = None
    selftext_html: str | None = None
    retrieved_on: int | None = None
    all_awardings: list | None = None
    allow_live_comments: bool | None = None
    archived: bool | None = None
    author_created_utc: int | None = None
    author_flair_background_color: str | None = None
    author_flair_template_id: str | No

Just copy output to get:

In [16]:
from msgspec import Struct

In [17]:
class Submission(Struct):

    # Required Fields
    is_self: bool
    created_utc: int | str
    selftext: str
    title: str
    score: int
    permalink: str
    over_18: bool
    num_comments: int
    id: str
    media_embed: dict
    edited: int | bool | float
    thumbnail: str

    # Optional Fields
    ups: int | None = None
    subreddit: str | None = None
    stickied: bool | None = None
    subreddit_id: str | None = None
    author: str | None = None
    secure_media: dict | None = None
    user_reports: list | None = None
    gilded: int | None = None
    secure_media_embed: dict | None = None
    downs: int | None = None
    mod_reports: list | None = None
    selftext_html: str | None = None
    retrieved_on: int | None = None
    all_awardings: list | None = None
    allow_live_comments: bool | None = None
    archived: bool | None = None
    author_created_utc: int | None = None
    author_flair_background_color: str | None = None
    author_flair_template_id: str | None = None
    author_flair_text_color: str | None = None
    awarders: list | None = None
    can_gild: bool | None = None
    can_mod_post: bool | None = None
    category: str | None = None
    content_categories: list | None = None
    contest_mode: bool | None = None
    discussion_type: str | None = None
    gildings: dict | None = None
    hidden: bool | None = None
    hide_score: bool | None = None
    is_created_from_ads_ui: bool | None = None
    is_crosspostable: bool | None = None
    is_meta: bool | None = None
    is_original_content: bool | None = None
    is_reddit_media_domain: bool | None = None
    is_robot_indexable: bool | None = None
    is_video: bool | None = None
    link_flair_background_color: str | None = None
    link_flair_richtext: list | None = None
    link_flair_template_id: str | None = None
    link_flair_text_color: str | None = None
    link_flair_type: str | None = None
    locked: bool | None = None
    media_only: bool | None = None
    name: str | None = None
    no_follow: bool | None = None
    num_crossposts: int | None = None
    parent_whitelist_status: str | None = None
    pinned: bool | None = None
    pwls: int | None = None
    quarantine: bool | None = None
    removed_by_category: str | None = None
    retrieved_utc: int | None = None
    send_replies: bool | None = None
    spoiler: bool | None = None
    subreddit_name_prefixed: str | None = None
    subreddit_subscribers: int | None = None
    subreddit_type: str | None = None
    suggested_sort: str | None = None
    thumbnail_height: int | None = None
    thumbnail_width: int | None = None
    top_awarded_type: str | None = None
    total_awards_received: int | None = None
    treatment_tags: list | None = None
    upvote_ratio: float | None = None
    url_overridden_by_dest: str | None = None
    whitelist_status: str | None = None
    wls: int | None = None
    post_hint: str | None = None
    preview: dict | None = None
    author_flair_richtext: list | None = None
    author_flair_type: str | None = None
    author_fullname: str | None = None
    author_patreon_flair: bool | None = None
    author_premium: bool | None = None
    crosspost_parent: str | None = None
    crosspost_parent_list: list | None = None
    media_metadata: dict | None = None
    brand_safe: bool | None = None
    rte_mode: str | None = None
    removal_reason: str | None = None
    saved: bool | None = None
    created: int | float | None = None
    clicked: bool | None = None
    likes: bool | None = None
    visited: bool | None = None