In [56]:
import os
from dotenv import load_dotenv
import pinecone
from bs4 import BeautifulSoup
import requests
from transformers import AutoTokenizer, AutoModel
import torch

# from sentence_transformers import SentenceTransformer

#load env variables
load_dotenv()
#needed env variables
PINECONE_ENVIRONMENT=os.getenv('PINECONE_ENVIRONMENT') 
PINECONE_KEY=os.getenv('PINECONE_KEY') 

pinecone.init(api_key=PINECONE_KEY, environment=PINECONE_ENVIRONMENT)


# Identify the device and move model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

Device: cuda


In [70]:
pinecone.list_indexes()

['artciles']

In [71]:
# Create a new vector index
index_name = 'articles'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=768,
        metric='cosine'
    )

In [76]:
# Set the url of the article to fetch
url = "https://www.cnbc.com/2023/07/27/ford-motor-f-earnings-q2-2023.html"

# Fetch the article data
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")

# Using the 'soup' object, we can extract all text inside paragraph 'p' tags.
article_text = soup.get_text()

print(article_text)


Ford Motor (F) earnings Q2 2023Skip NavigationwatchliveMarketsPre-MarketsU.S. MarketsCurrenciesCryptocurrencyFutures & CommoditiesBondsFunds & ETFsBusinessEconomyFinanceHealth & ScienceMediaReal EstateEnergyClimateTransportationIndustrialsRetailWealthLifeSmall BusinessInvestingPersonal FinanceFintechFinancial AdvisorsOptions ActionETF StreetBuffett ArchiveEarningsTrader TalkTechCybersecurityEnterpriseInternetMediaMobileSocial MediaCNBC Disruptor 50Tech GuidePoliticsWhite HousePolicyDefenseCongressEquity and OpportunityCNBC TVLive TVLive AudioBusiness Day ShowsEntertainment ShowsFull EpisodesLatest VideoTop VideoCEO InterviewsCNBC DocumentariesCNBC PodcastsCNBC WorldDigital OriginalsLive TV ScheduleWatchlistInvesting ClubTrust PortfolioAnalysisTrade AlertsMeeting VideosHomestretchJim's ColumnsEducationPROPro NewsPro LiveMarket ForecastSubscribeSign InMenuMake ItselectALL SELECTCredit Cards Loans Banking Mortgages Insurance Credit Monitoring Personal Finance Small Business Taxes Help for

In [77]:

# Tokenize the data and convert to vector
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model = model.to(device)


inputs = tokenizer(article_text, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
# Ensure all tensors are on the same device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}


outputs = model(**inputs)

# Convert the model output to a vector, remember to bring it back to CPU before converting to numpy
vector = outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy().reshape(1, -1)

# Convert 2D numpy array to list of 1D numpy arrays
# vectors = [vector for vector in vectors]

# Push the vector into Pinecone
index = pinecone.Index(index_name=index_name)
index.upsert(vectors= [
    ( 
        url, 
        [vector.tolist()],
        {'genre': 'business'}
    )
]
)

# index.upsert(items={url: vector})


# Remember to de-initialize when done
# pinecone.deinit()


ServiceException: (503)
Reason: Service Unavailable
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain', 'content-length': '19', 'date': 'Fri, 28 Jul 2023 03:47:13 GMT', 'server': 'envoy', 'connection': 'close'})
HTTP response body: no healthy upstream


In [73]:
query_text = "Ford Earnings"

# Tokenize the input, ensuring that the input length does not exceed the model's maximum
# This truncates the input if necessary
inputs = tokenizer(query_text, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Pass the inputs through the model
outputs = model(**inputs)

# Convert the model output to a vector, remember to bring it back to CPU before converting to numpy
query_vector = outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy()

# Convert 2D numpy array to list of 1D numpy arrays
# query_vector = [vector for vector in query_vector]

# now query
xc = index.query(query_vector.tolist(), top_k=5, include_metadata=True)
xc

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'date': 'Fri, 28 Jul 2023 03:46:19 GMT', 'server': 'envoy', 'connection': 'close', 'content-length': '0'})
