# Get and cluster podcast description embeddings

## Prepare environment

In [None]:
# Install the third-party packages imported further down (-q keeps pip quiet).
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
!pip install -q pickle5 podcast-api html2text sentence-transformers

[K     |████████████████████████████████| 133kB 5.1MB/s 
[K     |████████████████████████████████| 389kB 8.4MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 81kB 6.3MB/s 
[K     |████████████████████████████████| 2.5MB 10.0MB/s 
[K     |████████████████████████████████| 1.2MB 47.5MB/s 
[K     |████████████████████████████████| 901kB 41.2MB/s 
[K     |████████████████████████████████| 3.3MB 41.0MB/s 
[?25h  Building wheel for podcast-api (PEP 517) ... [?25l[?25hdone
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Make the coursework folder on the mounted Drive the working directory;
# the relative file names used in later cells are resolved against it.
%cd /content/drive/MyDrive/Coursework

/content/drive/MyDrive/Coursework


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model, loaded by name; used later to encode the descriptions.
emb_model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

HBox(children=(FloatProgress(value=0.0, max=1014059776.0), HTML(value='')))




In [None]:
import re
import os
import pickle5 as pickle
import pandas as pd

from copy import copy
from tqdm.notebook import tqdm
from listennotes import podcast_api
from credentials import listennotes_key

# Listen Notes API client; the key is imported from the local `credentials`
# module rather than written into the notebook.
client = podcast_api.Client(api_key=listennotes_key)

In [None]:
import re
import html2text

# Shared HTML-to-text converter; dehtmlify below uses it as its default.
h = html2text.HTML2Text()

# Converter options. The meanings noted here follow the html2text documentation —
# TODO confirm against the installed version.
h.unicode_snob = True  # presumably: keep Unicode characters instead of ASCII stand-ins
h.use_automatic_links = True  # presumably: write links whose text equals the URL as <url>
h.ignore_links = False  # presumably: keep link targets in the output
h.emphasis_mark = ""  # empty marker, so emphasised text gets no surrounding symbol
h.strong_mark = ""  # empty marker, so strong text gets no surrounding symbol

def dehtmlify(qhtml, h=h, despacify=True):
    """Convert an HTML fragment to text and clean up the converter's output.

    Args:
        qhtml: HTML string to convert.
        h: Converter object whose ``handle(html)`` returns the text form;
            defaults to the module-level ``html2text.HTML2Text`` instance.
        despacify: When true, collapse every run of whitespace (newlines
            included) into one space and strip both ends.

    Returns:
        The cleaned text as a string.
    """
    res = h.handle(qhtml)
    # Replace any bracketed span that contains the literal target
    # "(javascript:void\(0\))" with a single space.
    res = re.sub(r"\[.*?\(javascript:void\\\(0\\\)\).*?\]", " ", res)
    # Insert a space after a colon that is followed by a non-word character.
    res = re.sub(r":(\W)", r": \1", res)
    # Image markup with empty alt text, ![](url), becomes just the url.
    res = re.sub(r"!\[\]\((.*?)\)", r"\1", res)
    # Drop one whitespace character after "http:"/"https:"; this undoes the
    # colon rule above, which turns "https://" into "https: //".
    res = re.sub(r"(https?:)\s", r"\1", res)
    # Unescape "\." back to ".". The pattern is written "\\." because "\." in a
    # normal string literal is an invalid escape sequence (same runtime value,
    # but it triggers a Deprecation/SyntaxWarning on current Python).
    res = res.replace("\\.", ".")
    if despacify:
        res = re.sub(r"\s+", " ", res).strip()
    return res

## Get descriptions

In [None]:
# Read the pickled podcast table from the working directory.
with open("jingles_df.pkl", "rb") as pkl_in:
  jingles_df = pickle.load(pkl_in)

# Keep only the rows whose j_status is one of the accepted marks.
accepted_rows = jingles_df["j_status"].isin(["+", "2", "!!", 2])
jselect_df = jingles_df.loc[accepted_rows].reset_index(drop=True)

# Re-order the selection to follow the title order of collection_final.csv.
# list.index returns the first row with a matching title and raises
# ValueError when a title from the CSV is not in the selection.
jdf = pd.read_csv("collection_final.csv")
selected_titles = list(jselect_df["title"])
row_order = [selected_titles.index(title) for title in jdf["title"]]
jdf = jselect_df.loc[row_order].reset_index(drop=True)
jdf

Unnamed: 0,j_status,title,category_ln,category_yp,ln_id
0,+,Маркетинговая рубка,Бизнес,Бизнес и работа,918b1878324748feb228a8a1b5f917ee
1,+,Богема и Маркетинг,Бизнес,Бизнес и работа,70deb321ab2a4375abd376186a7ce977
2,+,Взяла и сделала,Бизнес,Бизнес и работа,8095bace4866436daff4a2519472d8b1
3,+,Нормальные деньги,Бизнес,Бизнес и работа,cc2ae7ae652f457da662efb1adb53a0d
4,+,ПЛЮС-МИНУС ДЕНЬГИ,Бизнес,Бизнес и работа,f876739f37724fc3bd8241bf008b9c28
...,...,...,...,...,...
86,+,Вне игры,Спорт,Спорт,b7338a9e056a4ec58be5c173b2ee9e42
87,+,Наш Чемп,Спорт,Спорт,d1095ebdbbe54fb986165dbc926eeb64
88,+,Дешифратор,Спорт,Спорт,78159c5340b84f2eb3e1f2e9cb4796a1
89,+,Футбольный механизм,Спорт,Спорт,356a490328ff45bdb4b05965ce92b132


In [None]:
#!mkdir jingles

# Map each podcast id (ln_id) to the list of its distinct cleaned descriptions.
dstruct = {}

for pid in tqdm(list(jdf["ln_id"])):

  response = client.fetch_podcast_by_id(
    id = pid,
    sort = 'recent_first'
  )
  rjson = response.json()

  # Podcast-level description first, then one per episode in the response.
  texts = [dehtmlify(rjson["description"])]
  texts += [dehtmlify(e["description"]) for e in rjson["episodes"]]

  # De-duplicate while keeping first-seen order. list(set(...)) yields an
  # order that changes from run to run (string hashing is randomised), which
  # makes the saved pickle and the downstream float sums non-reproducible.
  descriptions = list(dict.fromkeys(texts))
  dstruct[pid] = descriptions

HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




In [None]:
# Save the id -> descriptions mapping to dstruct.pkl in the working directory.
with open("dstruct.pkl", "wb") as pkl_out:
  pickle.dump(dstruct, pkl_out)

## Get embedding centres

In [None]:
# One vector per podcast: the mean over axis 0 of the encoded descriptions.
# Assumes encode() returns one row per input text — TODO confirm.
embs = []
for pid in tqdm(list(jdf["ln_id"])):
  description_vectors = emb_model.encode(dstruct[pid])
  embs.append(np.mean(description_vectors, axis=0))

HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




## Get clusters

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 8 clusters on the per-podcast vectors; random_state is
# fixed at 42. fit() returns the estimator itself, so it can be chained.
clusterer = KMeans(n_clusters=8, random_state=42).fit(embs)

# Cluster id assigned to each input vector, in the order of embs.
labels = clusterer.labels_
labels

array([2, 2, 1, 2, 2, 1, 4, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 3,
       3, 1, 4, 1, 3, 1, 1, 5, 0, 7, 2, 3, 7, 3, 3, 3, 0, 7, 0, 7, 0, 6,
       7, 6, 7, 5, 0, 0, 6, 2, 3, 6, 6, 6, 5, 2, 6, 3, 6, 2, 6, 3, 3, 0,
       1, 3, 2, 3, 6, 6, 2, 3, 2, 1, 2, 5, 5, 5, 1, 4, 4, 0, 1, 5, 4, 4,
       5, 4, 5], dtype=int32)

In [None]:
# Add the cluster ids to collection_final.csv and overwrite the file in place.
# The assignment is positional: row i of the CSV receives labels[i]
# (jdf, and therefore labels, was built in this CSV's title order).
collection_path = "collection_final.csv"

df = pd.read_csv(collection_path)
df["embeddings_clusters"] = labels
df.to_csv(collection_path, index=False)