# 1. ChromaDB 설치 및 불러오기

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/525.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/525.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from c

In [3]:
import chromadb
import warnings
warnings.filterwarnings(action='ignore')

# 2. 데이터 불러오기
- 데이터 출처 : https://www.kaggle.com/datasets/ahbab911/top-250-korean-dramas-kdrama-dataset?ref=breezymind.com

In [4]:
import pandas as pd
data_path = "/content/drive/MyDrive/kdrama.csv"

df = pd.read_csv(data_path)
filter_df = df.drop(["Aired Date","Aired On","Duration","Content Rating","Production companies", "Rank"], axis=1);

# 3. VectorDB

## 3-1. Client

In [5]:
client = chromadb.PersistentClient()

## 3-2. Collection
- Collection : embedding vector, document, metadata 등을 저장하는 곳
- metadata : 필터링을 위한 추가 정보 등을 저장하는 딕셔너리
  - 딕셔너리 키를 "hnsw:space"로 설정하여, embedding space의 distance method를 변경할 수 있음 (기본값=L2)

In [6]:
collection = client.get_or_create_collection(
    name="k-drama",
    metadata={"hnsw:space": "cosine"}
)

## 3-3. DataFrame to VectorDB

In [7]:
ids = []        # index
doc_meta = []   # metadata
documents = []  # vectors converted from text data

for idx in range(len(filter_df)):
    item = filter_df.iloc[idx]

    id = item['Name'].lower().replace(' ','-')
    document = f"{item['Name']} : {item['Synopsis']} : {str(item['Cast']).strip().lower()} : {str(item['Genre']).strip().lower()}"
    meta = {
        "rating" : item['Rating']
    }

    ids.append(id)
    doc_meta.append(meta)
    documents.append(document)

In [8]:
# Save to Database (VectorDB)
collection.add(
    documents=documents,
    metadatas=doc_meta,
    ids=ids
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 70.9MiB/s]


# 4. Query

In [9]:
collection.query(
    query_texts=["medical drama about doctors"],
    n_results=5,
)

{'ids': [['doctor-john',
   'dr.-romantic-2',
   'hospital-playlist',
   'doctor-prisoner',
   "god's-quiz:-reboot"]],
 'distances': [[0.3436622619628906,
   0.45732051134109497,
   0.4621430039405823,
   0.4815513491630554,
   0.5326414392990194]],
 'metadatas': [[{'rating': 8.5},
   {'rating': 8.7},
   {'rating': 9.1},
   {'rating': 8.4},
   {'rating': 8.3}]],
 'embeddings': None,
 'documents': [['Doctor John : “Doctor John” is a medical drama about doctors specializing in pain management. In a refreshing take on the genre, “Doctor John” will portray the doctors’ search for the cause of their patients’ mysterious pain as a thrilling chase, almost like a detective hunting down the perpetrator behind an unsolved crime.  : ji sung, lee se young, lee kyu hyung, hwang hee, jung min ah, kwon hwa woon : mystery,  romance,  life,  medical',
   'Dr. Romantic 2 : A “real doctor” story set in a small, humble hospital called Doldam Hospital. It is a story about people who meet Kim Sa Bu, a geniu

In [10]:
collection.query(
    query_texts=["time-travel drama"],
    n_results=5,
)

{'ids': [['365',
   'nine:-nine-times-time-travel',
   'circle',
   'because-this-is-my-first-life',
   'one-ordinary-day']],
 'distances': [[0.5034234523773193,
   0.5676982998847961,
   0.5819253921508789,
   0.5891983509063721,
   0.6088154911994934]],
 'metadatas': [[{'rating': 8.6},
   {'rating': 8.4},
   {'rating': 8.5},
   {'rating': 8.5},
   {'rating': 8.5}]],
 'embeddings': None,
 'documents': [['365 : A story where ten people get the chance to go back in time by one year, but unexpectedly mysterious situations start to arise when their fates are changed and twisted in the process. : lee joon hyuk, nam ji hyun, kim jee soo, lee sung wook, yoon joo sang, ahn seung gyun : thriller,  mystery,  drama,  fantasy',
   'Nine: Nine Times Time Travel : Park Sun Woo works as an anchorman at a TV broadcasting station. He is in love with news reporters Joo Min Young, who is bright and honest. Park Sun Woo then obtains 9 incense items, which allows him to go back 20 years in time. Sun Woo t

출처
- https://docs.trychroma.com/getting-started
- https://breezymind.com/vector-db-openai-response-optimize/


# 5. 과제
- 아래 데이터를 이용하여 공포 영화 5개 출력하기
- data path 변경하기!!!
- 데이터 출처 : https://www.kaggle.com/datasets/narayan63/netflix-popular-movies-dataset

In [7]:
# data path 변경하기!!
hw_data_path = "/content/drive/MyDrive/netflix.csv"

hw_df = pd.read_csv(hw_data_path)

In [8]:
hw_filter_df = hw_df.drop(["year", "certificate", "duration", "votes"], axis=1).drop_duplicates('title');

def changeString(x):
  return x.replace('\'','').replace('[','').replace(']','').replace(', ,',',')

hw_filter_df['stars'] = hw_filter_df['stars'].apply(changeString)
hw_filter_df = hw_filter_df.iloc[:300]

In [9]:
hw_filter_df

Unnamed: 0,title,genre,rating,description,stars
0,Cobra Kai,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"Ralph Macchio, William Zabka, Courtney Henggel..."
1,The Crown,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"Claire Foy, Olivia Colman, Imelda Staunton, Ma..."
2,Better Call Saul,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"Bob Odenkirk, Rhea Seehorn, Jonathan Banks, Pa..."
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"Emily Deschanel, Sam Jaeger, Gerardo Celasco, ..."
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"Zach Aguilar, Kenichiro Ohashi, Emi Lo, Aoi Yûki"
...,...,...,...,...,...
296,Incantation,"Horror, Mystery",6.2,"Six years ago, Li Ronan was cursed after break...","Kevin Ko, | , Stars:, Hsuan-yen Tsai, Ying..."
297,Queen of the South,"Action, Crime, Drama",8.0,Teresa flees Mexico after her drug-runner boyf...,"Alice Braga, Hemky Madera, Peter Gadiot, Veron..."
298,Godless,"Action, Drama, Western",8.3,"In the 1880s American West, murderous outlaw g...","""Jack OConnell, "", Michelle Dockery, Scoot McN..."
299,Fullmetal Alchemist: Brotherhood,"Animation, Action, Adventure",9.1,Two brothers search for a Philosopher's Stone ...,"Kent Williams, Iemasa Kayumi, Matthew Leonhart..."


여기서부터 코드 입력

In [11]:
import chromadb
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd

client = chromadb.PersistentClient()

hw_collection = client.get_or_create_collection(
    name="netflix",
    metadata={"hnsw:space": "cosine"}
)

hw_ids = []        # index
hw_doc_meta = []   # metadata
hw_documents = []  # vectors converted from text data

for idx in range(len(hw_filter_df)):
    item = hw_filter_df.iloc[idx]

    id = item['title'].lower().replace(' ','-')
    document = f"{item['title']} : {item['description']} : {str(item['stars']).strip().lower()} : {str(item['genre']).strip().lower()}"
    meta = {
        "rating" : item['rating']
    }

    hw_ids.append(id)
    hw_doc_meta.append(meta)
    hw_documents.append(document)

hw_collection.add(
    documents=hw_documents,
    metadatas=hw_doc_meta,
    ids=hw_ids
)

hw_collection.query(
    query_texts=["horror"],
    n_results=5
)




{'ids': [['american-horror-story', 'it', 'from', 'devil-in-ohio', 'evil']],
 'distances': [[0.6297610998153687,
   0.6586829423904419,
   0.6754425168037415,
   0.676643967628479,
   0.6823486089706421]],
 'metadatas': [[{'rating': 8.0},
   {'rating': 7.3},
   {'rating': 7.6},
   {'rating': 5.9},
   {'rating': 7.7}]],
 'embeddings': None,
 'documents': [['American Horror Story : An anthology series centering on different characters and locations, including a house with a murderous past, an insane asylum, a witch coven, a freak show circus, a haunted hotel, a possessed farmhouse, a cult, the apocalypse, a slasher summer camp, and a bleak beach town and desert valley. : lady gaga, kathy bates, angela bassett, sarah paulson : drama, horror, sci-fi',
   'It : In the summer of 1989, a group of bullied kids band together to destroy a shape-shifting monster, which disguises itself as a clown and preys on the children of Derry, their small Maine town. : andy muschietti, | ,     stars:, bill sk

In [15]:
## your code
horror_movies = hw_filter_df[hw_filter_df['genre'].str.contains('Horror')]
horror_movies.head(5)

Unnamed: 0,title,genre,rating,description,stars
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"Emily Deschanel, Sam Jaeger, Gerardo Celasco, ..."
5,The Sandman,"Drama, Fantasy, Horror",7.8,Upon escaping after decades of imprisonment by...,"Tom Sturridge, Boyd Holbrook, Patton Oswalt, V..."
10,Stranger Things,"Drama, Fantasy, Horror",8.7,"When a young boy disappears, his mother, a pol...","Millie Bobby Brown, Finn Wolfhard, Winona Ryde..."
12,The Walking Dead,"Drama, Horror, Thriller",8.1,Sheriff Deputy Rick Grimes wakes up from a com...,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
17,1899,"Drama, History, Horror",9.6,Multinational immigrants traveling from the ol...,"Ben Ashenden, Aneurin Barnard, Emily Beecham, ..."


## 과제 답안 예시

In [2]:
hw_collection = client.get_or_create_collection(
    name="netflix",
    metadata={"hnsw:space": "cosine"}
)

hw_ids = []        # index
hw_doc_meta = []   # metadata
hw_documents = []  # vectors converted from text data

for idx in range(len(hw_filter_df)):
    item = hw_filter_df.iloc[idx]

    id = item['title'].lower().replace(' ','-')
    document = f"{item['title']} : {item['description']} : {str(item['stars']).strip().lower()} : {str(item['genre']).strip().lower()}"
    meta = {
        "rating" : item['rating']
    }

    hw_ids.append(id)
    hw_doc_meta.append(meta)
    hw_documents.append(document)

hw_collection.add(
    documents=hw_documents,
    metadatas=hw_doc_meta,
    ids=hw_ids
)

hw_collection.query(
    query_texts=["horror film"],
    n_results=5,
)

ModuleNotFoundError: No module named 'chromadb'