In [1]:
%matplotlib inline

from glob import glob
import json
import multiprocessing as mp
import os
import numpy as np

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# Glob pattern selecting the dataset files to load. Each matching file is
# parsed as HTML by the loader below, despite the ".txt" extension.
DATASET_PATTERN = "../../dataset/ShareGPT/*.txt"

In [2]:
# Multiprocess data loading
def _load_data_worker(pipe: mp.Pipe):
    """Parse a batch of saved pages inside a worker process.

    Receives a list of file names from ``pipe``, extracts the JSON payload of
    the ``<script id="__NEXT_DATA__">`` tag from each file, and sends the list
    of decoded payloads back through ``pipe`` in the same order as the file
    names were received.

    Args:
        pipe: Connection used both to receive the list of file names and to
            send the list of results back.

    Raises:
        ValueError: If a file contains no ``__NEXT_DATA__`` script tag.
    """
    jobs = pipe.recv()
    result = []

    for filename in jobs:
        # Open in binary mode so BeautifulSoup decodes the raw bytes as UTF-8
        # itself. With a text-mode handle the markup arrives already decoded
        # using the platform default encoding and ``from_encoding`` is ignored.
        with open(filename, "rb") as f:
            bs = BeautifulSoup(f, "html.parser", from_encoding="utf-8")

        json_content = bs.find("script", {"id": "__NEXT_DATA__"})
        if json_content is None:
            # Fail with the offending file name rather than an opaque
            # AttributeError on None.
            raise ValueError(
                "No __NEXT_DATA__ script tag found in {}".format(filename)
            )
        result.append(json.loads(json_content.decode_contents()))

    pipe.send(result)


def _split_list(a: list, n: int):
    # Split list a to n chunks
    # https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length

    k, m = divmod(len(a), n)
    return [a[i*k+min(i, m): (i+1)*k+min(i+1, m)] for i in range(n)]


def load_data(filename_list, n_jobs=max(1, (os.cpu_count() or 2) - 1)):
    """Load and parse the given files using several worker processes.

    The file names are split into contiguous chunks, one per worker; each
    worker parses its chunk with ``_load_data_worker`` and the per-chunk
    results are concatenated in chunk order, so the returned list lines up
    index-for-index with ``filename_list``.

    Args:
        filename_list: File names to load.
        n_jobs: Number of worker processes. Clamped to the range
            ``1 .. len(filename_list)`` so that a single-core machine (where
            ``os.cpu_count() - 1`` is 0) still works and no worker is started
            for an empty chunk.

    Returns:
        A list with one parsed result per input file, in input order.

    Raises:
        RuntimeError: If a worker exits before sending back its results.
    """
    filename_list = list(filename_list)
    if not filename_list:
        # Nothing to do; avoid spawning processes for empty chunks.
        return []

    n_jobs = max(1, min(n_jobs, len(filename_list)))

    workers = []
    for text_chunk in _split_list(filename_list, n_jobs):
        pipe_master, pipe_child = mp.Pipe()
        process = mp.Process(target=_load_data_worker, args=(pipe_child, ))
        process.start()
        # Drop the parent's copy of the child end; otherwise recv() below can
        # never see EOF and would block forever if the worker crashes.
        pipe_child.close()

        pipe_master.send(text_chunk)
        workers.append((process, pipe_master))

    result = []
    try:
        for _, pipe_master in workers:
            result.extend(pipe_master.recv())
    except EOFError as exc:
        # One worker died; stop the others instead of waiting for them.
        for process, _ in workers:
            process.terminate()
        raise RuntimeError(
            "A data-loading worker exited before returning its results"
        ) from exc
    finally:
        # Always release the pipes and reap the worker processes.
        for process, pipe_master in workers:
            pipe_master.close()
            process.join()

    return result


# Read all data
# NOTE: glob() makes no ordering guarantee. A later cell calls
# glob(DATASET_PATTERN) again and indexes that list by position into
# `dataset`, so it assumes both calls return the files in the same order.
dataset = load_data(glob(DATASET_PATTERN))

In [3]:
# GPT-4 statistics

# Model label of every page; None when a page has no "content" entry or the
# entry carries no "model" key.
models = [
    page["props"]["pageProps"].get("content", {}).get("model")
    for page in dataset
]

print("Num of GPT-4: ", sum(1 for label in models if label == "Model: GPT-4"))

Num of GPT-4:  6745


In [10]:
# View statistics

# View count per page; pages without a "views" entry count as 0.
views = np.array([
    page["props"]["pageProps"].get("views", 0)
    for page in dataset
])

# View count at descending rank 5000, i.e. the 5000th largest value.
np.sort(views)[::-1][4999]


20

In [5]:
# List the files whose page properties carry no "content" entry.
# Entries of `dataset` are paired with file names purely by position, which
# assumes this glob() call yields the same order as the one used for loading.
file_list = glob(DATASET_PATTERN)

for idx, x in enumerate(dataset):
    if "content" in x["props"]["pageProps"]:
        continue
    print(file_list[idx])

../../dataset/ShareGPT/8qTtJXe.txt
../../dataset/ShareGPT/DoscKXg.txt
../../dataset/ShareGPT/sLrHEWi.txt


In [14]:
# Spot-check: view count stored for the first loaded page.
dataset[0]["props"]["pageProps"]["views"]

1