Data collection
1. TF-IDF for all episodes
2. Cosine similarity
3. Most spoken words

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local `lib` package are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import json

In [3]:
# Directory for the website data files this notebook produces
# (episode_similarity.csv, word_occurrences.csv, reverse_stem.json).
WEBSITE = "../data/jre/website/"

In [4]:
# Pickle file holding the previously collected episodes.
CACHE = "./jre-episodes.pickle"

# NOTE: pickle.load can execute arbitrary code; only use on a cache file you created.
with open(CACHE, mode="rb") as cache_file:
    episodes = pickle.load(cache_file)

print(f"Number of loaded episodes: {len(episodes)}")

KeyboardInterrupt: 

In [None]:
from lib.TFIDF import TFIDF
from lib.utils import clean_text

# Clean the text of every episode that has captions.
# Each entry is (episode, (cleaned_text, s2w)), where s2w is the stem-to-word
# dictionary returned by clean_text (saved to disk in a later cell).
cleaned_corpus = [
    (episode, clean_text(episode.text))
    for episode in tqdm(episodes)
    if episode.captions is not None
]

# The TF-IDF input only needs (episode, cleaned_text); drop the stem dictionary.
corpus = [
    (episode, cleaned_text)
    for episode, (cleaned_text, _stem_to_word) in cleaned_corpus
]

In [None]:
# Compute the TF-IDF data for the cleaned corpus.
tfidf = TFIDF()
tfidf.generate(corpus)

# Cache the result next to the episode cache; CACHE[2:] strips the leading "./"
# so the file is named "tfidf-<episode cache name>".
with open("tfidf-" + CACHE[2:], mode="wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Reload the cached TF-IDF object (lets you skip the expensive generate cell).
# NOTE: pickle.load can execute arbitrary code; only use on a cache file you created.
with open("tfidf-" + CACHE[2:], mode="rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)

In [None]:
# Inspect the computed scores. print_scores is defined in lib.TFIDF
# (presumably prints the TF-IDF scores per episode -- confirm there).
tfidf.print_scores()

## Cosine Sim

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two score vectors.

    Args:
        a: 1-D numeric numpy array.
        b: 1-D numeric numpy array of the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero magnitude the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        downstream sorting by similarity keeps working.
    """
    magnitudes = np.sqrt(np.sum(a * a)) * np.sqrt(np.sum(b * b))
    # Guard the 0/0 case (an all-zero vector), which would otherwise yield NaN.
    if magnitudes == 0:
        return 0.0
    return np.dot(a, b) / magnitudes

In [None]:
# Fix one ordering of the scored episodes; an episode's position in this list
# is its row/column index in the matrix. Using enumerate avoids a linear
# title lookup per pair and keeps episodes with identical titles separate.
scored_episodes = list(tfidf.scores.keys())

# Filled cells hold ((ep1, ep2), score); cells never written keep the integer 0.
cos_sim_matrix = np.zeros((len(episodes), len(episodes)), tuple)

for ai, a in enumerate(tqdm(scored_episodes)):
    # Only fill half of the matrix (bi <= ai, diagonal included).
    for bi, b in enumerate(scored_episodes[: ai + 1]):
        cos_sim_matrix[ai][bi] = ((a, b), cosine_similarity(tfidf.scores[a], tfidf.scores[b]))

In [None]:
# Preview the first three rows, then show the number of rows as the cell output.
# (Written as two statements: `print(...), len(...)` builds a tuple whose first
# element is print's return value, so the cell would display `(None, N)`.)
print(cos_sim_matrix[:3])
len(cos_sim_matrix)

In [None]:
# Flatten the matrix into (episode, episode, similarity) tuples.
cos_sim_list = []

for matrix_row in cos_sim_matrix:
    for cell in matrix_row:
        # Cells that were never filled still hold the integer 0.
        if cell == 0:
            continue
        (first_ep, second_ep), similarity = cell
        # Skip the diagonal: an episode compared with itself.
        if first_ep == second_ep:
            continue
        cos_sim_list.append((first_ep, second_ep, similarity))

In [None]:
print("Most similar podcast episodes")
print("=============================\n")
# Highest similarity first.
cos_sim_list = sorted(cos_sim_list, key=lambda x: x[2], reverse=True)
# Look at the 50 best pairs and show those where both sides are main episodes.
for a, b, score in cos_sim_list[:50]:
    if a.is_main_episode and b.is_main_episode:
        print(a)
        print(b)
        # Format as a percentage with two decimals. Multiplying an already
        # rounded value by 100 reintroduces float noise (e.g. 56.779999999999994).
        print(f"\t{score * 100:.2f}%")
        print()

In [None]:
# One row per episode pair, identified by the episodes' video ids.
similarity_rows = [
    (first_ep.video_id, second_ep.video_id, similarity)
    for first_ep, second_ep, similarity in cos_sim_list
]
cos_sim_table = pd.DataFrame(similarity_rows, columns=["id1", "id2", "similarity"])

# to_csv also writes the DataFrame index as the first, unnamed column (pandas default).
cos_sim_table.to_csv(WEBSITE + "episode_similarity.csv")

#### To protobufs
63 MB (presumably the size of `episode_similarity.csv`) -> TODO: record the protobuf size

In [None]:
# protoc --python_out=./ ./episode-sim.proto
# protoc --js_out=../../jre-vis/src/lib/proto ./episode-sim.proto
import episode_sim_pb2 as ep_proto

In [None]:
cos_sim_table = pd.read_csv(WEBSITE + "episode_similarity.csv")

# Every distinct video id, in order of first appearance (id1 column, then id2).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ids = pd.concat([cos_sim_table["id1"], cos_sim_table["id2"]]).unique()

# Lookup message: one row per video id together with its numeric index.
IDs = ep_proto.IDs()
for id_num, video_id in enumerate(ids):
    row = IDs.rows.add()
    row.idNum = id_num
    row.id = video_id

# Map each video id to its index once, instead of rescanning `ids` twice for
# every similarity row.
id_to_num = {video_id: id_num for id_num, video_id in enumerate(ids)}

# Similarity message: one row per episode pair, referring to episodes by index.
epSims = ep_proto.EpisodeSims()
id_pairs = zip(cos_sim_table["id1"], cos_sim_table["id2"], cos_sim_table["similarity"])
for id1, id2, similarity in tqdm(id_pairs, total=len(cos_sim_table)):
    e = epSims.rows.add()
    e.similarity = similarity
    e.idNum1 = id_to_num[id1]
    e.idNum2 = id_to_num[id2]


In [None]:
# Directory that receives the serialized protobuf files.
PROTO_OUT = "../../jre-vis/public/"

# Write the similarity rows first, then the id lookup table, as raw protobuf bytes.
for file_name, message in (("ep_sim", epSims), ("ep_sim_id_lookup", IDs)):
    with open(PROTO_OUT + file_name, "wb") as proto_file:
        proto_file.write(message.SerializeToString())

## Store top word occurrences of each episode

In [None]:
def get_num(e):
    """Sort key for ``tfidf.cfd`` items: the episode number, or -1 if unavailable.

    Args:
        e: a ``(key, value)`` pair; only ``e[0]`` is inspected.

    Returns:
        ``e[0].number`` when it can be read and is not ``None``, otherwise ``-1``.
    """
    # TODO
    # unsure why, but the tfidf ep #1564 and #1563 in the cfd
    # are strings and not actual episodes
    try:
        number = e[0].number
    except Exception:
        # Deliberately best-effort: anything without a readable number sorts as -1.
        return -1
    return -1 if number is None else number
    
# Order the (episode, counts) pairs by episode number, highest first.
cfd_items = sorted(tfidf.cfd.items(), key=get_num, reverse=True)

# Keep the 400 most common entries per episode as a plain dict.
# (presumably word -> occurrence count; confirm against lib.TFIDF)
top_word_rows = [
    (episode.video_id, dict(counts.most_common(400)))
    for episode, counts in cfd_items
]
cfd_table = pd.DataFrame(top_word_rows, columns=["id", "top_words"])

cfd_table.to_csv(WEBSITE + "word_occurrences.csv")

## Reverse Stem Data

In [None]:
# Merge the stem -> words mappings of all episodes.
# A plain dict.update() would replace a stem's words with those of the latest
# episode, losing words seen only in earlier episodes. Instead, collect every
# word per stem, de-duplicated in first-seen order (dict keys act as an
# ordered set).
# NOTE(review): assumes each s2w is a dict of stem -> iterable of words -- confirm in lib.utils.clean_text.
stem_words = {}
for _episode, (_cleaned, s2w) in cleaned_corpus:
    for stem, words in s2w.items():
        stem_words.setdefault(stem, {}).update(dict.fromkeys(words))

# Store only the part of each word after the first len(stem) characters;
# a word with the same length as its stem is kept whole.
rm_stem = lambda stem, w: w if len(stem) == len(w) else w[len(stem):]
reverse_stem = {stem: [rm_stem(stem, w) for w in words] for stem, words in stem_words.items()}

# Remove items with 1 element that is the exact same as the stem
reverse_stem = {stem: words for stem, words in reverse_stem.items() if len(words) != 1 or words[0] != stem}

# Remove words that are the exact same as the stem
reverse_stem = {stem: [w for w in words if w != stem] for stem, words in reverse_stem.items()}

with open(WEBSITE + "reverse_stem.json", "w") as f:
    json.dump(reverse_stem, f)