In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import swifter
import pickle
import random
import gzip
import json
import zstd

class Zreader:
    '''Stream a zstd-compressed text file and iterate over its lines.

    The file is decompressed in chunks of ``chunk_size`` bytes, so the whole
    content never has to be held in memory. Can be used as a context manager
    (``with Zreader(path) as reader: ...``) to close the file automatically.
    '''

    def __init__(self, file, chunk_size=16384):
        '''Open ``file`` and set up a streaming decompressor on top of it.

        Args:
            file: path of the zstd-compressed file to read.
            chunk_size: number of decompressed bytes requested per read.
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        try:
            self.dctx = zstd.ZstdDecompressor()
            self.reader = self.dctx.stream_reader(self.fh)
        except Exception:
            # Do not leak the file handle if the decompressor cannot be built.
            self.fh.close()
            raise
        # Text of the last, still incomplete line carried over between chunks.
        self.buffer = ''

    def close(self):
        '''Close the underlying file handle.'''
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def readlines(self):
        '''Generator that yields each line of the file, without its newline.

        Bytes that are not valid UTF-8 are ignored. A final line that is not
        terminated by a newline is yielded as well.
        '''
        import codecs  # local import keeps the class usable on its own

        # An incremental decoder keeps the bytes of a multi-byte character
        # that is split across two chunks; decoding each chunk independently
        # with errors="ignore" would silently drop such characters.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

        while True:
            raw = self.reader.read(self.chunk_size)
            # End of stream is signalled by an empty *raw* read; a decoded
            # chunk may legitimately be empty (e.g. half of a character).
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]

        # Flush what is left after the last newline so the final line of a
        # file without a trailing newline is not lost.
        tail = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if tail:
            yield tail

In [None]:
# Directory that holds every input and output file of this notebook.
data_d = '/scratch/gpfs/ij9216/projects/data/misc/youNiverse/data'

# Load the metadata helper table (feather) and the channel table (gzipped TSV).
df = pd.read_feather(f"{data_d}/yt_metadata_helper.feather")
df2 = pd.read_csv(f"{data_d}/df_channels_en.tsv.gz", compression="infer", sep="\t")

# NOTE: despite the "num_to_" prefix, both dicts are keyed by the id string
# and map it to the row label of the frame it came from.
num_to_display_id = {
    display_id: row_label
    for row_label, display_id in df.display_id.to_dict().items()
}
num_to_channel_id = {
    channel_id: row_label
    for row_label, channel_id in df2.channel.to_dict().items()
}

# One comment counter per known video id; None means "no comment seen yet".
num_comms = dict.fromkeys(num_to_display_id)

In [None]:
# First pass: count the comments of every video listed in `num_comms`
reader = Zreader(f"{data_d}/youtube_comments.ndjson.zst", chunk_size=16384)

# Start from empty counters so that re-running this cell does not add the
# same comments on top of a previous run.
num_comms = dict.fromkeys(num_comms)

idx = 0

try:
    # reads each line from the reader
    for line in reader.readlines():
        idx += 1

        # Coarse progress marker: one print every 10M lines.
        if idx % 10000000 == 0:
            print(idx)

        try:
            # The third comma-separated field is the video id wrapped in one
            # character on each side (presumably quotes - TODO confirm).
            video_id = line.split(",")[2][1:-1]
        except IndexError:
            # Line has fewer than three fields.
            print("error parsing line")
            continue

        # Ignore comments on videos that are not part of the metadata table
        # (this also discards a header line, if the file has one).
        if video_id not in num_comms:
            continue

        if num_comms[video_id] is None:
            num_comms[video_id] = 1
        else:
            num_comms[video_id] += 1
finally:
    # Release the file handle even if the pass is interrupted.
    reader.fh.close()

# Videos without any comment keep None as their count.
comm_series = pd.Series(num_comms)
num_comments = pd.DataFrame(comm_series).reset_index()
num_comments.columns = ["display_id", "num_comms"]
# The f prefix must sit outside the quotes; "f{data_d}/..." is a literal path.
num_comments.to_csv(f"{data_d}/num_comments.tsv.gz", compression="infer", sep="\t", index=False)

In [None]:
# Second pass: write the comments of videos with more than MIN_COMMENTS
# comments to a gzipped TSV (gzip is imported in the first cell).
MIN_COMMENTS = 30

reader = Zreader(f"{data_d}/youtube_comments.ndjson.zst", chunk_size=16384)

author_orig = ""
author_count = 0
idx = 0
try:
    with gzip.open(f"{data_d}/youtube_comments.tsv.gz", "w") as f:

        f.write(("\t".join(["author", "video_id", "likes", "replies"]) + "\n").encode())

        # reads each line from the reader
        for line in reader.readlines():
            idx += 1

            # Skip the first line (presumably the column header - TODO confirm).
            if idx == 1:
                continue

            # Coarse progress marker: one print every 10M lines.
            if idx % 10000000 == 0:
                print(idx)

            try:
                fields = line.split(",")
                video_id = fields[2][1:-1]
                author = fields[0]

                likes = fields[5]
                replies = fields[6]
            except IndexError:
                # Line has fewer than seven fields.
                print("error parsing line")
                continue

            # Authors are replaced by a running number that increases every
            # time the author field changes from one line to the next.
            # NOTE(review): this yields one id per author only if the file is
            # grouped by author - confirm.
            if author != author_orig:
                author_orig = author
                author_count += 1

            # A count of None (video never counted in the first pass) is
            # treated as zero instead of raising TypeError on the comparison.
            if (num_comms.get(video_id) or 0) > MIN_COMMENTS:
                f.write(("\t".join([str(author_count), video_id, str(likes), str(replies)])+ "\n").encode())
finally:
    # Release the input file handle even if the pass is interrupted.
    reader.fh.close()


In [None]:
# Third pass: number of written comments per author.
# The comment table is read in chunks of 10M rows; each chunk is reduced to
# per-author counts, and the partial counts are summed afterwards because the
# same author can appear in more than one chunk.
tmp_authors = []
comment_chunks = pd.read_csv(f"{data_d}/youtube_comments.tsv.gz", compression="infer", sep="\t",
                             chunksize=10000000)
# The loop variable is deliberately not called `df`, so the metadata frame
# loaded earlier in the notebook is not overwritten.
for chunk in comment_chunks:
    print("x")  # progress marker, one per chunk
    tmp_authors.append(chunk.groupby("author").video_id.count())

tmp = pd.concat(tmp_authors).reset_index()
# The resulting series is indexed by author; it is still named "video_id"
# although it holds the number of comments.
num_comments_author = tmp.groupby("author").video_id.sum()
num_comments_author.reset_index().to_csv(f"{data_d}/num_comments_authors.tsv.gz", compression="infer",
                                         sep="\t", index=False)

---