In [7]:
from langchain.document_loaders import DirectoryLoader
import nltk

directory = 'data'

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

def load_docs(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents

documents = load_docs(directory)
len(documents)

[nltk_data] Downloading package punkt_tab to /Users/river/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/river/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/river/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


2

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size = 500, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

docs = split_docs(documents)
print(len(docs))


8


In [9]:
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
2024-12-06 15:08:52.098065: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec


load_dotenv()
pinecone_key = os.getenv("PINECONE_API_KEY")

index_name = "ai-assistant"

pc = Pinecone(pinecone_key)

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Replace with your embedding model's dimension
        metric="cosine",  # Metric can be 'cosine', 'dotproduct', or 'euclidean'
        spec=ServerlessSpec(cloud="aws", region="us-east-1")  # Replace region with your preferred one
    )



In [22]:
from langchain.vectorstores import Pinecone
index = Pinecone.from_documents(documents, embeddings, index_name=index_name)

In [23]:
def get_similar_docs(query, k=2, score=False):
    if score:
        similar_docs = index.similarity_search_with_score(query, k=k)
    else:
        similar_docs = index.similarity_search(query, k=k)
    
    return similar_docs

In [24]:
query = "Who took over Twitter"
similar_docs = get_similar_docs(query)
print(similar_docs)

[Document(metadata={'source': 'data/twitter_data.pdf'}, page_content='Twitter was founded in 2006 and was listed on the stock exchange in 2013. Since the founding of Twitter, 2022 has been an event to remember Twitter. As Elon Musk took over Twitter, it will be delisted from the New York Exchange. As 2022 was so eventful for Twitter, analyze the complete timeline of Twitter in the Stock Market from 2013 to 2022.\n\nTwitter is one of the popular social media applications where people share what they feel in a limited number of words. Twitter is popular but not in the stock market.\n\nThe dataset contains data about:\n\nDate\n\nThe opening Price of the day\n\nThe highest price of the day\n\nThe lowest price of the day\n\nThe closing price of the day\n\nThe adjusted closing price of the day\n\nThe total number of shares traded in the day (volume)'), Document(metadata={'source': 'data/elon.txt'}, page_content="Elon Musk is a renowned entrepreneur and business magnate known for his involvem