Data collection 
1. TFIDF For all episodes
2. Cosine similarity
3. Most spoken words

In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from lib.Episode import Episode, EpisodeFactory
import json

In [33]:
WEBSITE = "../data/jre/website/"

In [10]:
CACHE = "./jre-episodes.pickle"

with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

print(f"Number of loaded episodes: {len(episodes)}")

Number of loaded episodes: 2455


In [5]:
from lib.TFIDF import TFIDF
from lib.utils import clean_text

cleaned_corpus = [(e, clean_text(e.text)) for e in tqdm(episodes) if e.captions is not None]
# s2w is the stem 2 word dictionary (saved in a later cell)
corpus = [(ep, cleaned) for ep, (cleaned, s2w) in cleaned_corpus]

100%|██████████| 2455/2455 [04:30<00:00,  9.08it/s]


In [6]:
tfidf = TFIDF()
tfidf.generate(corpus)

with open("tfidf-" + CACHE[2:], "wb") as f:
    pickle.dump(tfidf, f)

100%|██████████| 70059/70059 [00:25<00:00, 2758.40it/s]
100%|██████████| 1605/1605 [03:22<00:00,  7.94it/s]


In [7]:
tfidf.print_scores()

[5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith
biden 0.05818
trump 0.04458
donald 0.02456
war 0.02365
eisenhow 0.01875
kamala 0.0173
iraq 0.01719
berni 0.01524
sander 0.01477
presid 0.01458

[C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison
skeleton 0.11077
arti 0.07024
fossil 0.06021
speci 0.05503
canin 0.05235
luci 0.04657
ethiopia 0.04222
ancestor 0.03516
neanderth 0.03393
geolog 0.03041

[surnFz_pZE4] Joe Rogan Experience #1559 - Steven Rinella
mammoth 0.01885
biden 0.01734
d-day 0.01474
wolv 0.01438
hunt 0.01364
baculum 0.01355
fossil 0.01321
overproduc 0.01263
bear 0.0122
bone 0.01207

[KkjxSKrcbOg] JRE End Of The World #2
biden 0.08188
pennsylvania 0.06654
trump 0.06647
vote 0.04906
poll 0.03987
mail-in 0.02748
win 0.02549
florida 0.02362
elect 0.02322
ohio 0.02006

[OaTKaHKCAFg] Joe Rogan Experience #1558 - Tristan Harris
facebook 0.0467
attent 0.02281
social 0.02155
algorithm 0.01919
voodoo 0.01893
huxley 0.01873
orwel 0.0171
dilemma 0.01542
media 0.01538
ukule

## Cosine Sim

In [8]:
def cosine_similarity(a, b):
    dist = lambda tfidf: np.sqrt(np.sum(tfidf * tfidf))
    dot_prod = np.dot(a, b)
    distances = dist(a) * dist(b)
    return dot_prod / distances

In [18]:
# Probably the dumbest way to do this
tfidf_titles = [e.title for e in tfidf.scores.keys()]
index_of_ep = lambda ep: tfidf_titles.index(ep.title)

# (ep1, ep2), score
cos_sim_matrix = np.zeros((len(episodes), len(episodes)), tuple)

for a, b in tqdm([(a, b) for a in tfidf.scores for b in tfidf.scores]):
    ai = index_of_ep(a)
    bi = index_of_ep(b)
    # Only fill half of the matrix
    if bi > ai:
        continue
    cos_sim_matrix[ai][bi] = ((a, b), cosine_similarity(tfidf.scores[a], tfidf.scores[b]))

100%|██████████| 2576025/2576025 [04:55<00:00, 8703.71it/s]  


In [21]:
print(cos_sim_matrix[:3]), len(cos_sim_matrix)

[[(([5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith, [5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith), 1.0)
  0 0 ... 0 0 0]
 [(([C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison, [5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith), 0.07299247913616964)
  (([C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison, [C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison), 1.0000000000000002)
  0 ... 0 0 0]
 [(([surnFz_pZE4] Joe Rogan Experience #1559 - Steven Rinella, [5PrLGhJnO7I] Joe Rogan Experience #1562 - Dave Smith), 0.27197056928850144)
  (([surnFz_pZE4] Joe Rogan Experience #1559 - Steven Rinella, [C8M1ZRYt-2Q] Joe Rogan Experience #1561 - Kermit Pattison), 0.15207300252929887)
  (([surnFz_pZE4] Joe Rogan Experience #1559 - Steven Rinella, [surnFz_pZE4] Joe Rogan Experience #1559 - Steven Rinella), 1.0000000000000002)
  ... 0 0 0]]


(None, 2455)

In [28]:
cos_sim_list = []

# Format as (index, index), similarity
for row in cos_sim_matrix:
    for item in row:
        if item == 0: continue
        if a != b:
        (a, b), score = item
        cos_sim_list.append((a, b, score))

In [31]:
print("Most similar podcast episodes")
print("=============================\n")
cos_sim_list = sorted(cos_sim_list, key=lambda x: x[2], reverse=True)
for a, b, score in cos_sim_list[:50]:
    if a.is_main_episode and b.is_main_episode and a != b:
        print(a)
        print(b)
        print(f"\t{round(score, 4) * 100}%")
        print()

Most similar podcast episodes

[c6bjKNHVS8M] Joe Rogan Experience #94 - Joey Diaz (Part 2)
[dqMH6h2W8_Q] Joe Rogan Experience #94 - Joey Diaz (Part 3)
	88.62%

[llhb2ymtsw8] Joe Rogan Experience #1302 - Ed Calderon
[xPBejhoKlb8] Joe Rogan Experience #1408 - Ed Calderon
	80.91000000000001%

[by1OgqQQANg] Joe Rogan Experience #1002 - Peter Schiff
[3u7kDfEtKfs] Joe Rogan Experience #1145 - Peter Schiff
	76.68%

[keSoSyu9m7c] Joe Rogan Experience #994 - Dom D'Agostino
[u93oh9kC-rU] Joe Rogan Experience #1176 - Dom D'Agostino & Layne Norton
	71.14%

[Dq4Apc2Xk7Q] Joe Rogan Experience #1389 - Chris Kresser Debunks "The Gamechangers" Documentary
[s0zgNY_kqlI] Joe Rogan Experience #1393 - James Wilks & Chris Kresser - The Game Changers Debate
	67.94%

[mPqWstVnRjQ] Joe Rogan Experience #1035 - Paul Stamets
[xJ6Ym719urg] Joe Rogan Experience #1385 - Paul Stamets
	67.75%

[ZtxzMb9CpTM] Joe Rogan Experience #1151 - Sean Carroll
[TP5W2MG8Jjs] Joe Rogan Experience #1352 - Sean Carroll
	67.24%

[fE3

In [35]:
cos_sim_table = pd.DataFrame(
    [(a.video_id, b.video_id, s) for a,b,s in cos_sim_list], 
    columns=["id1", "id2", "similarity"],
)
cos_sim_table.to_csv(WEBSITE + "cosine_similarity.csv")

## CFD Episodes in bins of 10 and by themselves

In [None]:
tfidf.cfd[episodes[0].title]

## Reverse Stem Data

In [None]:
reverse_stem = {}
for title, (cleaned, s2w) in cleaned_corpus:
    reverse_stem.update(s2w)
reverse_stem = {stem: list(words) for stem, words in reverse_stem.items()}
    
with open(WEBSITE + "reverse_stem.json", "w") as f:
    f.write(json.dumps(reverse_stem))