# Lesson 3 - Recommender Systems

### Import the required packages

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange

import os
import pandas as pd
import time

In [3]:
from DLAIUtils import Utils

In [4]:
utils = Utils()

PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()

### Load the Dataset

**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):

In [5]:
!mkdir -p ./data && wget -q --show-progress -O ./data/all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

!unzip ./data/all-the-news-3.zip -d ./data/

Archive:  ./data/all-the-news-3.zip
  inflating: ./data/all-the-news-3.csv  


Read the header of the csv file

In [6]:
with open("./data/all-the-news-3.csv", mode="r") as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [7]:
df = pd.read_csv(filepath_or_buffer="./data/all-the-news-3.csv", nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


### Setup Pinecone

In [8]:
openai_client = OpenAI(api_key=OPENAI_API_KEY)
pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = utils.create_dlai_index_name(index_name="dl-ai")

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    metric="cosine"
)

index = pinecone.Index(name=INDEX_NAME)

### 1. Create Embeddings of the News Titles

In [9]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    return openai_client.embeddings.create(input=articles, model=model)

In [11]:
CHUNK_SIZE = 400
TOTAL_ROWS = 10000

progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv(filepath_or_buffer="./data/all-the-news-3.csv", chunksize=CHUNK_SIZE, nrows=TOTAL_ROWS)

chunk_num = 0
for chunk in chunks:
    titles = chunk["title"].tolist()
    embeddings = get_embeddings(articles=titles)
    prepped = [{"id": str(chunk_num*CHUNK_SIZE+i), "values": embeddings.data[i].embedding, "metadata": {"title": titles[i]}} for i in range(len(titles))]
    chunk_num += 1

    if len(prepped) > 200:
        index.upsert(vectors=prepped)
        prepped = []
    
    progress_bar.update(len(chunk))


100%|██████████| 10000/10000 [03:05<00:00, 45.65it/s]

100%|██████████| 10000/10000 [15:31<00:00, 45.65it/s]

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

### Build the Recommender System

In [13]:
def get_recommendations(pinecone_index, search_term, top_k=10):
    embed = get_embeddings(articles=[search_term]).data[0].embedding
    res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)

    return res

In [16]:
reco = get_recommendations(pinecone_index=index, search_term="obama")

for r in reco.matches:
    print(f'{r.score}: {r.metadata["title"]}')

0.849979818: Barack Obama just stepped off the sidelines to defend Obamacare
0.849476159: Obama: if you were fine with big government until it served black people, rethink your biases
0.84846884: “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.848202527: President Obama has a new plan to fight the opioid epidemic
0.844891369: Watch President Obama dance the tango in Argentina
0.844519079: Obama meets with national security team on Syria, Islamic State
0.844460905: President Obama: Michelle & I Are Gonna Be Renters
0.844327092: Vox Sentences: Obama got a warmer welcome in Hiroshima than the Japanese prime minister
0.842268705: Barack Obama in talks to create shows for Netflix: New York Times
0.841716468: Clinton, Obama pledge unity behind Trump presidency


### 2. Create Embeddings of All News Content

In [17]:
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    metric="cosine"
)

articles_index = pinecone.Index(name=INDEX_NAME)

In [18]:
def embed(embeddings, title, prepped, embed_num):
    for embedding in embeddings.data:
        prepped.append({"id": str(embed_num), "values": embedding.embedding, "metadata": {"title": title}})
        embed_num += 1

        if len(prepped) > 100:
            articles_index.upsert(vectors=prepped)
            prepped.clear()
    
    return embed_num

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>news_data_rows_num = 100</code>):</b> In this lab, we've initially set <code>news_data_rows_num</code> to 100 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 200, 400, 700, and 1000. You'll likely notice better and more relevant results.</p>

In [19]:
news_data_rows_num = 100

embed_num = 0 # Keep track of embedding number for "id"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20) # How to chunk each article

df = pd.read_csv(filepath_or_buffer="./data/all-the-news-3.csv", nrows=news_data_rows_num)
articles_list = df["article"].tolist()
titles_list = df["title"].tolist()

prepped = []
for i in range(0, len(articles_list)):
    print(".",end="")
    art = articles_list[i]
    title = titles_list[i]

    if art is not None and isinstance(art, str):
        texts = text_splitter.split_text(art)
        embeddings = get_embeddings(articles=texts)
        embed_num = embed(embeddings=embeddings, title=title, prepped=prepped, embed_num=embed_num)


....................................................................................................

In [20]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1010}},
 'total_vector_count': 1010}

### Build the Recommender System

In [21]:
reco = get_recommendations(pinecone_index=articles_index, search_term="obama", top_k=100)
seen = {}

for r in reco.matches:
    title = r.metadata["title"]
    if title not in seen:
        print(f"{r.score}: {title}")
        seen[title] = "."

0.821058929: Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.818831205: U.S. lawmakers ask for disclosure of number of Americans under surveillance
0.812379181: NYPD Honcho Insulted by 'Hamilton' Star Lin-Manuel Miranda Celebrating Obama's Controversial Prisoner Release
0.806931: Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.806235075: Trump keeping options open as Republican feud rages
0.801288128: Michael Bloomberg Is Seriously Considering a Presidential Run
0.800196826: The most revealing Republican ad of the election is an attack ad against Tim Kaine
0.798193276: Exclusive: Trump considering fracking mogul Harold Hamm as energy secretary - sources
0.797721624: Trump tells anti-abortion marchers he will support them
0.797481894: The government official in charge of ethics just harshly condemned Trump’s plan
0.79356277: Exclusive: China shuns U.S. request for talks on airline website dispute over Taiwan
0.792402804: “Elizabeth 