In [1]:
import json
from transformers import LlamaTokenizerFast
import huggingface_hub
import os
from collections import defaultdict
import plotly.express as px
from dateutil.parser import parse
import pandas as pd

# Authenticate with the Hugging Face Hub using the token stored in the
# HUGGINGFACE_TOKEN environment variable. os.getenv returns None when the
# variable is unset, so the token is never hard-coded in the notebook.
# NOTE(review): new_session=False presumably reuses an already-saved login
# instead of prompting again — confirm against the huggingface_hub docs.
huggingface_hub.login(token=os.getenv("HUGGINGFACE_TOKEN"), new_session=False)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/leonseet/.cache/huggingface/token
Login successful


In [2]:
# Path (relative to the notebook's working directory) of the JSON file that
# the loading cell reads into `data`.
INPUT_FILE_PATH = "data/andrew_huberman_episodes_processed.json"
# Hugging Face model identifier. Not referenced by any cell visible here.
# NOTE(review): presumably the model whose tokenizer produced the per-chunk
# token lengths plotted below — confirm against the preprocessing step.
CHAT_MODEL = "meta-llama/Llama-2-7b-chat-hf"

In [3]:
# Load the processed episode records into `data`.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the file
# decodes the same way on every platform instead of depending on the locale's
# default encoding.
with open(INPUT_FILE_PATH, "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the first record as a quick look at what was loaded.
print(data[0])



# What is the distribution of episodes across categories? (Overlaps Included)

In [4]:
# Tally episodes per topic. An episode that lists several topics contributes
# to each of them; an episode with no topics is counted under "None".
topic_counts = defaultdict(int)
for ep in data:
    for label in (ep["topics"] or ["None"]):
        topic_counts[label] += 1

# Order the categories from least to most frequent before plotting.
topics_dict = dict(sorted(topic_counts.items(), key=lambda pair: pair[1]))

fig = px.bar(
    x=list(topics_dict.values()),
    y=list(topics_dict),
    orientation="h",
    labels={"y": "Categories", "x": "Count"},
    text_auto=True,
)
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    title="What is the distribution of episodes across categories? (Overlaps Included)",
)
fig.show()

# What is the distribution of episodes across publish years?

In [5]:
# Tally episodes per publication year, taken from each record's "created"
# value. Records with an empty "created" value are counted under "None".
year_counts = defaultdict(int)
for ep in data:
    stamp = ep["created"]
    bucket = str(parse(stamp).year) if stamp else "None"
    year_counts[bucket] += 1

# Bars are ordered by episode count (ascending), not chronologically.
year_dict = dict(sorted(year_counts.items(), key=lambda pair: pair[1]))

fig = px.bar(
    x=list(year_dict.values()),
    y=list(year_dict),
    orientation="h",
    labels={"y": "Year", "x": "Count"},
    text_auto=True,
)
fig.update_layout(title="What is the distribution of episodes across publish years?")
fig.show()

# What is the distribution of episodes with guests?

In [6]:
# Split the episodes into two buckets depending on whether the record's
# "guest" value is truthy.
guest_counts = defaultdict(int)
for ep in data:
    guest_counts["Guest" if ep["guest"] else "No Guest"] += 1

# Smaller bucket first, larger bucket last.
guest_dict = dict(sorted(guest_counts.items(), key=lambda pair: pair[1]))

fig = px.bar(
    x=list(guest_dict.values()),
    y=list(guest_dict),
    orientation="h",
    labels={"y": "Guest", "x": "Count"},
    text_auto=True,
)
fig.update_layout(title="What is the distribution of episodes with guests?")
fig.show()

# Which guest came on the show the most? (Top 10)

In [7]:
# Count appearances per guest, ignoring episodes whose "guest" value is empty.
appearances = defaultdict(int)
for ep in data:
    guest_name = ep["guest"]
    if not guest_name:
        continue
    appearances[guest_name] += 1

# Sort ascending by appearance count and keep the last ten entries, i.e. the
# ten most frequent guests, still in ascending order.
ranked = sorted(appearances.items(), key=lambda pair: pair[1])
data_dict = dict(ranked[-10:])

fig = px.bar(
    x=list(data_dict.values()),
    y=list(data_dict),
    orientation="h",
    labels={"y": "Guest", "x": "Count"},
    text_auto=True,
)
fig.update_layout(title="Which guest came on the show the most? (Top 10)")
fig.show()

# What is the distribution of chunk token length? (Llama 2 Tokenizer)

In [11]:
# Gather the stored "sentencepiece_token_length" of every transcript chunk
# across all episodes; episodes with an empty "transcripts" value add nothing.
chunks = [
    piece["sentencepiece_token_length"]
    for ep in data
    for piece in (ep["transcripts"] or ())
]

# One-column frame so the histogram can refer to the values by column name.
df = pd.DataFrame(data=chunks, columns=["x"])
fig = px.histogram(
    df,
    x="x",
    nbins=100,
    title="What is the distribution of chunk token length? (Llama 2 Tokenizer)",
    labels={"x": "Token Length"},
)
fig.show()

# What is the distribution of episode length?

In [10]:
# Collect "episode_length" from every record that has that key; records
# without the key are skipped.
episode_lengths = [ep["episode_length"] for ep in data if "episode_length" in ep]

# One-column frame so the histogram can refer to the values by column name.
df = pd.DataFrame(data=episode_lengths, columns=["x"])
fig = px.histogram(
    df,
    x="x",
    nbins=20,
    title="What is the distribution of episode length?",
    labels={"x": "Seconds"},
)
fig.show()