Data collection 
1. TFIDF For all episodes
2. Cosine similarity
3. Most spoken words

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import json

In [3]:
WEBSITE = "../../jre-vis/public/"
N_TOP_WORD_OCCURRENCES = 400

In [4]:
CACHE = "./jre-episodes.pickle"

with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

print(f"Number of loaded episodes: {len(episodes)}")

Number of loaded episodes: 2462


In [5]:
from lib.TFIDF import TFIDF
from lib.utils import clean_text

cleaned_corpus = [(e, clean_text(e.text)) for e in tqdm(episodes) if e.captions is not None]
# s2w is the stem 2 word dictionary (saved in a later cell)
corpus = [(ep, cleaned) for ep, (cleaned, s2w) in cleaned_corpus]

100%|██████████| 2462/2462 [04:42<00:00,  8.72it/s]


In [6]:
tfidf = TFIDF()
tfidf.generate(corpus)

with open("tfidf-" + CACHE[2:], "wb") as f:
    pickle.dump(tfidf, f)

100%|██████████| 70150/70150 [00:26<00:00, 2618.04it/s]
100%|██████████| 1608/1608 [03:20<00:00,  8.01it/s]


In [7]:
with open("tfidf-" + CACHE[2:], "rb") as f:
    tfidf = pickle.load(f)

In [8]:
tfidf.print_scores()

[C7t_LxpzYTg] Joe Rogan Experience #1567 - Donnell Rawlings & Dave Chappelle
soy 0.02732
biden 0.02731
drone 0.02472
vaccin 0.02416
yo 0.02319
dog 0.02119
trump 0.02069
edamam 0.01957
cook 0.01912
elk 0.01854

[Sc5oa6MqPDg] Joe Rogan Experience #1566 - Nicholas Christakis
vaccin 0.10879
viru 0.10291
pandem 0.06037
infect 0.04149
immun 0.04109
mask 0.03536
dexamethason 0.03439
coronaviru 0.02873
incident 0.02562
antibodi 0.0239

[GmHwG2p_esE] Joe Rogan Experience #1565 - Gary Laderman
religion 0.05508
religi 0.03545
psychedel 0.03242
lsd 0.02355
topic 0.02197
sacr 0.02021
drug 0.0191
sexual 0.0156
pornographi 0.01486
student 0.01453

[5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith
biden 0.05802
trump 0.04451
donald 0.0245
war 0.02359
eisenhow 0.01875
iraq 0.0172
kamala 0.01718
berni 0.01522
sander 0.01475
presid 0.01453

[C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison
skeleton 0.11084
arti 0.07028
fossil 0.06005
speci 0.0549
canin 0.05237
luci 0.0466
ethiopia 0.04224
an

## Cosine Sim

In [9]:
def cosine_similarity(a, b):
    dist = lambda tfidf: np.sqrt(np.sum(tfidf * tfidf))
    dot_prod = np.dot(a, b)
    distances = dist(a) * dist(b)
    return dot_prod / distances

In [10]:
def get_cos_sim_matrix():
    # Probably the dumbest way to do this
    tfidf_titles = [e.title for e in tfidf.scores.keys()]
    index_of_ep = lambda ep: tfidf_titles.index(ep.title)

    # (ep1, ep2), score
    cos_sim_matrix = np.zeros((len(episodes), len(episodes)), tuple)

    for a, b in tqdm([(a, b) for a in tfidf.scores for b in tfidf.scores]):
        ai = index_of_ep(a)
        bi = index_of_ep(b)
        # Only fill half of the matrix
        if bi > ai:
            continue
        cos_sim_matrix[ai][bi] = ((a, b), cosine_similarity(tfidf.scores[a], tfidf.scores[b]))
        
    return cos_sim_matrix

def convert_matrix_to_list(cos_sim_matrix):
    cos_sim_list = []

    # Format as (index, index), similarity
    for row in cos_sim_matrix:
        for item in row:
            if item == 0: continue
            (a, b), score = item
            if a == b: continue
            cos_sim_list.append((a, b, score))
            
    return cos_sim_list

def print_most_similar(cos_sim_list):
    print("Most similar podcast episodes")
    print("=============================\n")
    cos_sim_list = sorted(cos_sim_list, key=lambda x: x[2], reverse=True)
    for a, b, score in cos_sim_list[:50]:
        if a.is_main_episode and b.is_main_episode:
            print(a)
            print(b)
            print(f"\t{round(score, 4) * 100}%")
            print()
            
def to_cos_sim_table(cos_sim_list):
    return pd.DataFrame(
        [(a.video_id, b.video_id, s) for a,b,s in cos_sim_list], 
        columns=["id1", "id2", "similarity"],
    )
    

In [11]:
LOAD_COS_SIM = False

if LOAD_COS_SIM:
    with open("cos-sim-" + CACHE[2:], "rb") as f:
        cos_sim_table = pickle.loads(f)
else:
    cos_sim_matrix = get_cos_sim_matrix()
    
    print(cos_sim_matrix[:3], len(cos_sim_matrix))
    
    cos_sim_list = convert_matrix_to_list(cos_sim_matrix)
    
    print_most_similar(cos_sim_list)
    
    cos_sim_table = to_cos_sim_table(cos_sim_list)
    
    with open("cos-sim-" + CACHE[2:], "wb") as f:
        pickle.dump(cos_sim_table, f)

100%|██████████| 2585664/2585664 [05:33<00:00, 7760.52it/s] 


[[(([C7t_LxpzYTg] Joe Rogan Experience #1567 - Donnell Rawlings & Dave Chappelle, [C7t_LxpzYTg] Joe Rogan Experience #1567 - Donnell Rawlings & Dave Chappelle), 1.0000000000000004)
  0 0 ... 0 0 0]
 [(([Sc5oa6MqPDg] Joe Rogan Experience #1566 - Nicholas Christakis, [C7t_LxpzYTg] Joe Rogan Experience #1567 - Donnell Rawlings & Dave Chappelle), 0.23829795519589023)
  (([Sc5oa6MqPDg] Joe Rogan Experience #1566 - Nicholas Christakis, [Sc5oa6MqPDg] Joe Rogan Experience #1566 - Nicholas Christakis), 1.0)
  0 ... 0 0 0]
 [(([GmHwG2p_esE] Joe Rogan Experience #1565 - Gary Laderman, [C7t_LxpzYTg] Joe Rogan Experience #1567 - Donnell Rawlings & Dave Chappelle), 0.13163386734009325)
  (([GmHwG2p_esE] Joe Rogan Experience #1565 - Gary Laderman, [Sc5oa6MqPDg] Joe Rogan Experience #1566 - Nicholas Christakis), 0.11622489440962952)
  (([GmHwG2p_esE] Joe Rogan Experience #1565 - Gary Laderman, [GmHwG2p_esE] Joe Rogan Experience #1565 - Gary Laderman), 1.0)
  ... 0 0 0]] 2462
Most similar podcast episo

#### To protobufs
63mb -> 

In [12]:
# protoc --python_out=./ ./episode-sim.proto
# pbf ./episode-sim.proto --browser > ../../jre-vis/public/
import episode_sim_pb2 as ep_proto

In [None]:
ids = cos_sim_table["id1"].append(cos_sim_table["id2"]).unique()
IDs = ep_proto.IDs()
for i, id in enumerate(ids):
    row = IDs.rows.add()
    row.idNum = i
    row.id = id
    
epSims = ep_proto.EpisodeSims()
for index, row in tqdm(cos_sim_table.iterrows()):
    e = epSims.rows.add()
    e.similarity = row["similarity"]
    e.idNum1 = [i for i, id in enumerate(ids) if id == row["id1"]][0]
    e.idNum2 = [i for i, id in enumerate(ids) if id == row["id2"]][0]


200904it [33:42, 102.75it/s]

In [None]:
with open(WEBSITE + "ep_sim", "wb") as f:
    f.write(epSims.SerializeToString())
    
with open(WEBSITE + "ep_sim_id_lookup", "wb") as f:
    f.write(IDs.SerializeToString())

## Store top word occurrences of each episode

In [None]:
def get_num(e):
    # TODO
    # unsure why, but the tfidf ep #1564 and #1563 in the cfd
    # are strings and not actual episodes
    try:
        x = e[0].number if e[0].number is not None else -1
        return x
    except Exception as x:
        return -1
    
cfd_items = sorted(list(tfidf.cfd.items()), key=get_num, reverse=True)
cfd_table = pd.DataFrame(
    [(k.video_id, dict(v.most_common(N_TOP_WORD_OCCURRENCES))) for k, v in cfd_items],
    columns=["id", "top_words"],
)

cfd_table.to_csv(WEBSITE + "word_occurrences.csv")

## Reverse Stem Data

In [None]:
reverse_stem = {}
for title, (cleaned, s2w) in cleaned_corpus:
    reverse_stem.update(s2w)
    
rm_stem = lambda stem, w: w if len(stem) == len(w) else w[len(stem):]
reverse_stem = {stem: [rm_stem(stem, w) for w in words] for stem, words in reverse_stem.items()}

# Remove items with 1 element that is the exact same as the stem
reverse_stem = {stem: words for stem, words in reverse_stem.items() if len(words) != 1 or words[0] != stem}

# Remove words that are the exact same as the stem
reverse_stem = {stem: [w for w in words if w != stem] for stem, words in reverse_stem.items()}
    
with open(WEBSITE + "reverse_stem.json", "w") as f:
    f.write(json.dumps(reverse_stem))