# SemMovie (Semantic Movie Search)


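*   Search for any movie with a short free-text description; results are matched by meaning (embedding similarity) rather than by exact keywords
*   Dataset: [Latest Netflix TV Show and Movies](https://www.kaggle.com/datasets/senapatirajesh/netflix-tv-shows-and-movies/data)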



# Install and import necessary libraries

In [5]:
!pip install opendatasets sentence-transformers langchain langchain-community chromadb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [10]:
# usual lib
import opendatasets as od
import pandas as pd
import numpy as np

#langchain & RAG
from langchain.document_loaders import CSVLoader, DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings # opensource ftw
from langchain.vectorstores import Chroma

# Downloading datasets

In [11]:
# Fetch the Kaggle dataset into a folder under the current working directory.
# opendatasets skips the download when the files are already present.
kaggle_dataset_url = 'https://www.kaggle.com/datasets/senapatirajesh/netflix-tv-shows-and-movies/data'
od.download(kaggle_dataset_url)

Skipping, found downloaded files in "./netflix-tv-shows-and-movies" (use force=True to force download)


In [13]:
# Load only the two columns the search index needs. The path is relative so it
# resolves to the folder od.download() created in the working directory
# ("./netflix-tv-shows-and-movies") instead of being tied to Colab's /content.
df = pd.read_csv(
    'netflix-tv-shows-and-movies/NetFlix.csv',
    usecols=['title', 'description'],  # skip parsing columns that are never used
)[['title', 'description']]  # re-select to pin the column order
df

Unnamed: 0,title,description
0,3%,In a future where the elite inhabit an island ...
1,1920,An architect and his wife move into a castle t...
2,3 Heroines,Three Indonesian women break records by becomi...
3,Blue Mountain State: The Rise of Thadland,New NFL star Thad buys his old teammates' belo...
4,Blue Planet II,This sequel to the award-winning nature series...
...,...,...
7782,Blown Away,Ten master artists turn up the heat in glassbl...
7783,Blue Exorcist,Determined to throw off the curse of being Sat...
7784,Blue Is the Warmest Color,"Determined to fall in love, 15-year-old Adele ..."
7785,Blue Jasmine,The high life leads to high anxiety for a fash...


In [16]:
# Sanity checks: frame dimensions and the number of missing values per column.
frame_shape = df.shape
missing_per_column = df.isna().sum()

print('shape', frame_shape)
print('\nnans\n', missing_per_column)

shape (7787, 2)

nans
 title          0
description    0
dtype: int64


# Creating the embeddings

In [17]:
# Embedding model used for both indexing and querying; inference is pinned to
# the CPU through the model kwargs.
model_options = dict(device='cpu')
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs=model_options,
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Load the data into the vector database

In [19]:
# Turn every row into a LangChain Document: the description becomes the page
# content and the remaining column (title) is carried along as metadata.
data = DataFrameLoader(
    df,
    page_content_column='description',
)
descriptions = data.load()

# Peek at a single document to confirm the layout.
descriptions[2]

Document(metadata={'title': '3 Heroines'}, page_content='Three Indonesian women break records by becoming the first of their nation to medal in archery at the Seoul Olympics in the summer of 1988.')

In [20]:
# Directory (relative to the working directory) where Chroma persists the index.
CHROMA_PATH = 'chroma_movies_db'

# Give every document a stable id (its position in `descriptions`). Without
# explicit ids each run gets freshly generated ones, so re-running this cell
# against the same persist directory would store every movie again and yield
# duplicate search hits; with fixed ids the existing entries are overwritten.
document_ids = [str(position) for position in range(len(descriptions))]

# Embed all descriptions and write them to the persistent Chroma collection.
chroma_db = Chroma.from_documents(
    descriptions,
    embeddings,
    ids=document_ids,
    persist_directory=CHROMA_PATH,
)

In [21]:
!zip -r chroma_movies_db.zip /content/chroma_movies_db

  adding: content/chroma_movies_db/ (stored 0%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/ (stored 0%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/link_lists.bin (deflated 75%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/length.bin (deflated 88%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/index_metadata.pickle (deflated 43%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/data_level0.bin (deflated 10%)
  adding: content/chroma_movies_db/9f636ddd-2e66-4603-b79c-de24233bf4f7/header.bin (deflated 55%)
  adding: content/chroma_movies_db/chroma.sqlite3 (deflated 31%)


In [29]:
# Free-text query; ask the vector store for the three closest descriptions.
query = 'movie about animal that can speak'
docs = chroma_db.similarity_search(query=query, k=3)

# Display the matched documents (title in metadata, description as content).
docs

[Document(metadata={'title': 'Monkey Up'}, page_content='A talking monkey famous for his TV commercials dreams of breaking into real movies. But first he stumbles upon a brother and sister who need his help.'),
 Document(metadata={'title': 'Stars in the Sky: A Hunting Story'}, page_content='This documentary follows a group of hunters as they grapple with the complexities, controversies, and contradictions of pursuing animals in the wild.'),
 Document(metadata={'title': 'Krish Trish and Baltiboy: Best Friends Forever'}, page_content='A cat, monkey and donkey team up to narrate folktales about friendship from Northeast India, and the Indian regions of Bihar and Rajasthan.')]