In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, words
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

from lib.Episode import Episode, EpisodeFactory

# Pickle caches for the parsed episodes. Point CACHE at CACHE_SMALL to work
# with a reduced episode list while iterating.
CACHE_ALL = "./jre-episodes.pickle"
CACHE_SMALL = "./jre-episodes-small.pickle"
CACHE = CACHE_ALL
# Number of episodes kept when CACHE is CACHE_SMALL.
SMALL_CACHE_SIZE = 100

try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(CACHE, "rb") as f:
        episodes = pickle.load(f)
except (TypeError, FileNotFoundError, EOFError, pickle.UnpicklingError) as e:
    # A missing, truncated or unreadable cache is rebuilt from the raw data.
    print(f"Generating new pickle {CACHE}...", e)
    factory = EpisodeFactory("../data/jre")
    episodes = factory.create_episodes()
    # Truncate BEFORE dumping so the in-memory list matches what later runs
    # will load from the small cache (previously only the file was truncated).
    if CACHE == CACHE_SMALL:
        episodes = episodes[:SMALL_CACHE_SIZE]
    with open(CACHE, "wb") as f:
        pickle.dump(episodes, f)

print(f"Number of loaded episodes: {len(episodes)}")


Generating new pickle ./jre-episodes.pickle... [Errno 2] No such file or directory: './jre-episodes.pickle'
loading comments...
	opened file...
Number of loaded episodes: 2428


In [4]:
# def clean_ep_text(ep):
#     stopwords = list(nltk.corpus.stopwords.words("english"))
#     stopwords.extend(
#         [
#             "'red",
#             "'s",
#             "'re",
#             "[ ]",
#             "n't",
#             "uh",
#             "um",
#             "like",
#             "yeah",
#             "__",
#             "ve",
#             "re",
#         ]
#     )
#     ps = nltk.stem.PorterStemmer()
#     stopwords = set(stopwords)
#     words = nltk.tokenize.word_tokenize(ep.text)
#     words = [ps.stem(w.strip().lower()) for w in words if w.strip().lower() not in stopwords]
#     return words
    
# def gen_tfidf(episodes):
#     cleaned = [(e.title, clean_ep_text(e)) for e in episodes if e.captions is not None]
#     cfd = nltk.ConditionalFreqDist((title, word) for title, words in cleaned for word in words)
#     words_in_cfd = set([w for document in cfd for w in cfd[document]])
#     # Map word to an index
#     word_lookup_table = {w:i for i, w in enumerate(words_in_cfd)}
#     # Map index to a word
#     word_lookup_table_inverse = {i:w for i, w in enumerate(words_in_cfd)}

#     def calc(cfd):
#         df = doc_freq_dict(cfd)
#         tfidf = {}
#         num_docs = len(cfd)        

#         # TODO This can be sped up by converting to np arrays
#         for document in tqdm(cfd):
#             tfidf[document] = np.zeros(len(words_in_cfd))

#             for w in df:
#                 tf = cfd[document][w] / (len(cfd[document]) + 1)
#                 idf = num_docs / (df[w] + 1)
#                 word_index = word_lookup_table[w]
#                 tfidf[document][word_index] = tf * np.log(idf)

#         return tfidf

#     # Compute document frequency for each word in CFD
#     def doc_freq_dict(cfd):
#         df = {}
#         for w in tqdm(words_in_cfd):
#             for document in cfd:
#                 if w in cfd[document]:
#                     df[w] = 1 + df.get(w, 0)

#         return df    
    
#     scores = calc(cfd)
#     return scores, word_lookup_table_inverse

In [2]:
from lib.TFIDF import TFIDF
from lib.utils import clean_text

# Build the (title, cleaned text) corpus from the episodes that have captions.
# The generator keeps tqdm iterating lazily over every episode, exactly as a
# single filtered comprehension would.
captioned = (ep for ep in tqdm(episodes) if ep.captions is not None)
corpus = [(ep.title, clean_text(ep.text)) for ep in captioned]

100%|██████████| 2428/2428 [04:36<00:00,  8.78it/s]
100%|██████████| 1583/1583 [03:16<00:00,  8.07it/s]


TypeError: cannot unpack non-iterable NoneType object

In [4]:
# Fit the project's TFIDF model on the (title, cleaned text) corpus.
# NOTE(review): the exact behaviour of TFIDF.generate lives in lib.TFIDF and is
# not visible here.
tfidf = TFIDF()
tfidf.generate(corpus)

# Name the output after the episode cache's file name. basename() works for any
# CACHE path, whereas slicing off two characters assumed a leading "./".
tfidf_cache = "tfidf-" + os.path.basename(CACHE)
with open(tfidf_cache, "wb") as f:
    pickle.dump(tfidf, f)

100%|██████████| 1583/1583 [03:21<00:00,  7.85it/s]


In [5]:
# Print the scores held by the fitted `tfidf` object. The output format and the
# number of terms listed per episode are decided inside lib.TFIDF.
tfidf.print_scores()

Joe Rogan Experience #1539 - Jenny Kleeman
meat 0.05399
technolog 0.03822
robot 0.03058
human 0.02154
vegan 0.02004
anim 0.01972
be 0.0182
eat 0.01751
doll 0.01735
sex 0.01609

Joe Rogan Experience #1538 - Douglas Murray
tran 0.02655
societi 0.01957
complianc 0.01786
women 0.01668
motherhood 0.01568
gender 0.01472
dishonest 0.01466
anti-fascist 0.01412
covid 0.014
jk 0.01389

Joe Rogan Experience #1537 - Lex Fridman
neurolink 0.02166
comment 0.0204
technolog 0.0204
ant 0.01834
hendrix 0.01758
clapton 0.01694
wolfram 0.01632
jimi 0.01475
eric 0.01446
pull-up 0.01441

Joe Rogan Experience #1536 - Edward Snowden
pardon 0.04832
assang 0.03012
court 0.02986
govern 0.0279
julian 0.0247
surveil 0.02445
warrant 0.021
patriot 0.02084
polic 0.01991
espionag 0.01903

Joe Rogan Experience #1535 - Tim Kennedy
insurg 0.02214
trump 0.01752
antifa 0.01714
hawaiian 0.01671
protest 0.01634
biden 0.01609
polic 0.01585
defund 0.01516
elect 0.01425
crow 0.01243

Joe Rogan Experience #1534 - Ron White
coman

In [7]:
# def print_scores(score_list):
#     score_tuples = [(word_lookup_table[i], score) for i, score in enumerate(score_list)]
#     score_tuples = sorted(score_tuples, key=lambda x: x[1], reverse=True)
#     for word, score in score_tuples[:25]:
#         print(word, round(score, ndigits=5))    

# for title, score_list in list(scores.items())[:10]:
#     print(title)    
#     print_scores(score_list)
#     print()

Joe Rogan Experience #1539 - Jenny Kleeman
meat 0.05391
technolog 0.03817
robot 0.03054
human 0.02151
vegan 0.02001
anim 0.01969
be 0.01817
eat 0.01748
doll 0.01732
sex 0.01606
artifici 0.01341
argument 0.01291
conscious 0.01227
farm 0.01167
neurolink 0.01135
lip 0.0111
carnivor 0.0107
inevit 0.01037
cell 0.00959
futur 0.00953
uk 0.00943
lab 0.0094
tast 0.00925
calori 0.00913
fish 0.00909

Joe Rogan Experience #1538 - Douglas Murray
tran 0.02651
societi 0.01954
complianc 0.01783
women 0.01665
motherhood 0.01566
gender 0.01469
dishonest 0.01464
anti-fascist 0.0141
covid 0.01398
rowl 0.01387
jk 0.01387
portland 0.01371
plagu 0.01214
homophob 0.01056
discuss 0.0101
revolut 0.00996
ichabog 0.00964
hostag 0.0095
america 0.0094
suppress 0.00919
pretend 0.00916
subjug 0.00911
demor 0.00911
kenosha 0.00905
racism 0.009

Joe Rogan Experience #1537 - Lex Fridman
neurolink 0.02164
comment 0.02038
technolog 0.02037
ant 0.01832
hendrix 0.01755
clapton 0.01692
wolfram 0.0163
jimi 0.01473
eric 0.0144

In [152]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between two vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, with the same length as `a`.

    Returns:
        dot(a, b) / (||a|| * ||b||). When either vector has zero norm the
        result is 0.0 instead of the NaN (and RuntimeWarning) that a division
        by zero would produce.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as similar to nothing
        # so downstream sorting never has to deal with NaN.
        return 0.0
    return np.dot(a, b) / norm_product

In [157]:
# Map every scored episode title to an index into `episodes`, then fill the
# lower triangle (diagonal included) of an episodes x episodes similarity matrix.
# NOTE(review): in the cells visible here `scores` is only assigned inside
# commented-out code - confirm it is defined before this cell on a fresh kernel.
def index_of_ep(title):
    """Return `episodes.index(...)` of the first episode whose title equals `title`."""
    matches = [e for e in episodes if title == e.title]
    return episodes.index(matches[0])


name_to_index = {}
for name in scores:
    name_to_index[name] = index_of_ep(name)

n_episodes = len(episodes)
cos_sim_matrix = np.zeros((n_episodes, n_episodes))
# Companion matrix recording the (a, b) title pair behind each filled cell.
cos_sim_matrix_ep_names = np.zeros((n_episodes, n_episodes), tuple)

title_pairs = [(a, b) for a in scores for b in scores]
for a, b in tqdm(title_pairs):
    ai, bi = name_to_index[a], name_to_index[b]
    # Only fill half of the matrix: cells with bi > ai are left at zero.
    if bi <= ai:
        cos_sim_matrix[ai][bi] = cosine_similarity(scores[a], scores[b])
        cos_sim_matrix_ep_names[ai][bi] = (a, b)

100%|██████████| 2505889/2505889 [05:06<00:00, 8182.45it/s]


In [158]:
# Peek at the first three rows. Entries above the diagonal stay at zero because
# only cells with column index <= row index are filled when the matrix is built.
print(cos_sim_matrix[:3])

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.1602899  1.         0.         ... 0.         0.         0.        ]
 [0.24411226 0.17694882 1.         ... 0.         0.         0.        ]]


In [196]:
# Flatten the strict lower triangle of the similarity matrix into
# ((row, col), similarity) records so the pairs can be ranked.
size = len(cos_sim_matrix)
index_pairs = [(r, c) for r in range(size) for c in range(size)]

cos_sim_list = []
for row, col in tqdm(index_pairs):
    # Keep only col < row: this drops the diagonal and the mirrored
    # (col, row) duplicate of every pair.
    if col < row:
        cos_sim_list.append(((row, col), cos_sim_matrix[row][col]))

100%|██████████| 5895184/5895184 [00:03<00:00, 1791498.08it/s]


In [198]:
# Number of top-ranked episode pairs to print.
TOP_N = 50

print("Most similar podcast episodes")
print("=============================\n")
# Rank the ((row, col), similarity) records from most to least similar.
cos_sim_list = sorted(cos_sim_list, key=lambda x: x[1], reverse=True)
for (a, b), score in cos_sim_list[:TOP_N]:
    a_name, b_name = cos_sim_matrix_ep_names[a][b]
    print(a_name)
    print(b_name)
    # Let the format spec do the percent conversion and rounding; multiplying a
    # rounded float by 100 printed artifacts such as 87.72999999999999%.
    print(f"\t{score:.2%}")
    print()

Most similar podcast episodes

Joe Rogan Experience #94 - Joey Diaz (Part 2)
Joe Rogan Experience #94 - Joey Diaz (Part 3)
	87.72999999999999%

Joe Rogan Experience - Best of The Week - May 5, 2013
Best of The Week - May 5, 2013 - Joe Rogan Experience
	81.65%

Bert Kreischer is The Machine (from Joe Rogan Experience #95)
Best of Bert Kreischer - Joe Rogan Experience - Volume 1
	78.83%

Bill Burr's trip to Indianapolis (from Joe Rogan Experience #343)
"Just Keep Showing Up" with Joey Diaz (from Joe Rogan Experience #513)
	73.45%

Joe Rogan Experience #1002 - Peter Schiff
Joe Rogan Experience #1145 - Peter Schiff
	73.16%

Joe Rogan Experience #1302 - Ed Calderon
Joe Rogan Experience #1408 - Ed Calderon
	72.68%

Bill Burr's trip to Indianapolis (from Joe Rogan Experience #343)
Best of Joey Diaz - Joe Rogan Experience - Volume 3
	72.27%

Best of Joey Diaz - Joe Rogan Experience - Volume 3
"Just Keep Showing Up" with Joey Diaz (from Joe Rogan Experience #513)
	71.73%

Joe Rogan Experience #