In [2]:
from tqdm.auto import tqdm
import json
from sentence_transformers import SentenceTransformer
#import pandas as pd

# Read input data from json file

In [3]:
file_name = 'podcasts_25k.json' #all data, 14700 podcasts after clearning
#file_name = 'podcasts.json' #sample data, 120 podcasts
with open(f'../data/processed/{file_name}', 'rt') as f_in:
   podcasts = json.load(f_in)

In [4]:
podcasts[13]

{'id': 'id1550677342',
 'name': 'Create Your Own Light',
 'url': 'https://podcasts.apple.com/us/podcast/create-your-own-light/id1550677342',
 'studio': 'Podcast with Travis Howze',
 'category': 'Arts',
 'episode_count': 129,
 'avg_rating': 4.8,
 'total_ratings': 161,
 'description': 'We all want to live a life we desire and deserve versus merely existing. Each week, join Travis Howze, U.S. Marine, former Police Officer and Firefighter, World Touring Stand Up Comedian, Motivational Speaker and Best Selling Author as he draws from a lifetime of experiences from unbelievably hysterical stories to unspeakable traumatic events and engages directly with his supporters and listeners, taking off-the-cuff questions and topic suggestions to produce a unique broadcast atmosphere, where you, the listener, has a say in the show. If you struggle with purpose, looking for inspiration, have a friend or loved one who could use support, or simply want to laugh and cry in your vehicle, couch, or go-to pl

# Create embeddings using Sentence Transformer

See all models leaderboard: https://sbert.net/docs/sentence_transformer/pretrained_models.html

In [7]:
model_name = 'multi-qa-MiniLM-L6-cos-v1' #best semantic search score, "dims": 384
#model_name = 'multi-qa-mpnet-base-dot-v1' #"dims": 768
#model_name = 'multi-qa-distilbert-cos-v1' #dims 512, medium size, medium speed
model = SentenceTransformer(model_name)
dims = len(model.encode("This is a simple sentence"))
len(model.encode("This is a simple sentence"))

384

In [5]:
podcasts[391]

{'id': 'id1585917387',
 'name': '特有想象\u202a力\u202c',
 'url': 'https://podcasts.apple.com/us/podcast/%E7%89%B9%E6%9C%89%E6%83%B3%E8%B1%A1%E5%8A%9B/id1585917387',
 'studio': '特有想象力',
 'category': 'Arts',
 'episode_count': 42,
 'avg_rating': 5.0,
 'total_ratings': 2,
 'description': '「特有想象力」Podcast，一档关于设计、创意、科技的闲聊播客。有趣，有干货，有能量。让创意回归人，让设计面向未来。官网：https://tyxxl.tech/微信公众号：特有想象力来各大音频平台找到我们'}

In [6]:
#podcasts#[podcasts['name']=='No Title']

In [7]:
#created the dense vector using the pre-trained model
operations = []
for doc in tqdm(podcasts):
    #print(doc["name"])
    # Transforming the title into an embedding using the model
    doc["description_vector"] = model.encode(doc["description"]).tolist()
    doc["name_vector"] = model.encode(doc["name"]).tolist()
    operations.append(doc)

  0%|          | 0/14700 [00:00<?, ?it/s]

In [8]:
podcasts[13].keys()

dict_keys(['id', 'name', 'url', 'studio', 'category', 'episode_count', 'avg_rating', 'total_ratings', 'description', 'description_vector', 'name_vector'])

# Setup ElasticSearch connection

In [9]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': '35e32a5c7ca8', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'PzHbLMFOSPq6qhCXe2DRSA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Create Mappings and Index

In [10]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "name": {"type": "keyword"} ,
            "id": {"type": "text"},
            "category": {"type": "text"},
            "studio": {"type": "text"},
            "episode_count": {"type": "integer"},
            "avg_rating": {"type": "float"},
            "total_ratings": {"type": "integer"},
            "url": {"type": "text"},
            "description": {"type": "text"},
            "description_vector": {"type": "dense_vector", "dims": dims, "index": True, "similarity": "cosine"},
            "name_vector": {"type": "dense_vector", "dims": dims, "index": True, "similarity": "cosine"},
        }
    }
}

In [11]:
index_name = "podcasts_"+model_name+"__dims_"+str(dims)
index_name = index_name.lower()
print("Index name:",index_name)
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

Index name: podcasts_multi-qa-distilbert-cos-v1__dims_768


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'podcasts_multi-qa-distilbert-cos-v1__dims_768'})

# Add documents into index

In [12]:
for doc in tqdm(podcasts):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

  0%|          | 0/14700 [00:00<?, ?it/s]

Connection timed out


# Keybased elastic search

In [13]:
def elastic_search(query, category):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["description^3", "name"],
                        "type": "best_fields"
                    }
                },
                # "filter": {
                #     "term": {
                #         "category": category
                #     }
                # }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [14]:
def recommend_docs_to_text(result_docs):
    for doc in result_docs:
        print(doc['name'])
       #print(doc['category'])
        print(doc['description'])
        print('\n')

In [15]:
result_docs = elastic_search(
    query="Podcast about Agatha Christie",
    category="Arts"
)
recommend_docs_to_text(result_docs)

All About Agatha Christie
All About Agatha is a podcast all about, well, Agatha. Agatha Christie, of course: the Queen of Crime, a real-life Dame of the British Empire, and author of sixty-six mystery novels that spanned the Twentieth Century, defining a genre. For five years, Catherine Brobeck and Kemper Donovan revisited these novels in publication order, ranking them according to pre-set criteria (plot, character, etc.). Tragically, Catherine Brobeck passed away at the end of 2021. Since then, Kemper has completed the podcast's ranking project, and now contents himself with celebrating the greatness of Christie by attempting to solve the ultimate mystery where she is concerned. Why Christie--and Christie alone--endures as powerfully as she does.


Agatha Christie Radio Plays
Agatha Christie is one of the most popular and prolific crime writers of all time, and her work has been adapted for radio and television countless times. These are just a few examples of the many radio adaptati

# Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

In [16]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
         # "filter": {
                #     "term": {
                #         "category": category
                #     }
                # }
    }

    search_query = {
        "knn": knn,
        "_source": ["description","name"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [17]:
def description_vector_knn(query):
    v_q = model.encode(query)
    return elastic_search_knn('description_vector', v_q)

In [18]:
def name_vector_knn(query):
    v_q = model.encode(query)
    return elastic_search_knn('name_vector', v_q)

In [19]:
search_term = "Podcast about Agatha Christie"
result_docs = description_vector_knn(search_term)
recommend_docs_to_text(result_docs)

All About Agatha Christie
All About Agatha is a podcast all about, well, Agatha. Agatha Christie, of course: the Queen of Crime, a real-life Dame of the British Empire, and author of sixty-six mystery novels that spanned the Twentieth Century, defining a genre. For five years, Catherine Brobeck and Kemper Donovan revisited these novels in publication order, ranking them according to pre-set criteria (plot, character, etc.). Tragically, Catherine Brobeck passed away at the end of 2021. Since then, Kemper has completed the podcast's ranking project, and now contents himself with celebrating the greatness of Christie by attempting to solve the ultimate mystery where she is concerned. Why Christie--and Christie alone--endures as powerfully as she does.


That Witch Life
A Podcast on Living as a Witch in Today’s World


Haunted Detective
A true crime podcast that investigates the Macabre side of the world. Kelsey Childs and her friend Pamela J explore Paranormal mysteries nestled within the

In [20]:
search_term = "Podcast about Agatha Christie"
result_docs = name_vector_knn(search_term)

In [21]:
recommend_docs_to_text(result_docs)

The Salem Witch Trials Podcast
The Salem Witch Trials Podcast takes a fast-paced and episodic examination of the witch hysteria in Salem, Massachusetts in 1692. Each brief yet insightful episode harnesses the knowledge of an expert to help illuminate both the chronology of events as well the deeper context surrounding the hysteria, giving listeners an understanding of this fascinating and tragic event in early American history.


Agatha Christie Radio Plays
Agatha Christie is one of the most popular and prolific crime writers of all time, and her work has been adapted for radio and television countless times. These are just a few examples of the many radio adaptations of Agatha Christie's work. Her stories are perfectly suited for radio, with their complex plots, suspenseful twists and turns, and memorable characters. Christie's radio plays have been enjoyed by audiences for generations, and they continue to be popular today. They are a great way to experience her classic mysteries in 