In [1]:
import getpass
import os
from itertools import islice
from pprint import pprint

import faiss
import pandas as pd
from langchain.tools.retriever import create_retriever_tool
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Load the OpenAI API key.
# The key must never be written in the notebook itself: it would leak through
# version control and every shared copy. It is read from the environment, and
# the user is prompted (without echo) only when the variable is missing, so an
# unattended "Restart & Run All" still works when the key is already exported.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [3]:
# Columns of interest: the descriptive (metadata) columns and the column
# holding the article text.

cols_metadata = ["Date", "Article_title", "Url", "Publisher", "Author"]
cols_content = ["Article"]

# Read only the first row, restricted to those columns, to look at the schema
# without loading the whole dataset.

df = pd.read_csv(
    "data.csv",
    usecols=[*cols_metadata, *cols_content],
    nrows=1,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           1 non-null      object 
 1   Article_title  1 non-null      object 
 2   Url            1 non-null      object 
 3   Publisher      0 non-null      float64
 4   Author         0 non-null      float64
 5   Article        1 non-null      object 
dtypes: float64(2), object(4)
memory usage: 180.0+ bytes


In [4]:
# Configure the CSV loader: the article text becomes the document content and
# the descriptive columns are attached as metadata.

loader = CSVLoader(
    file_path="./data.csv",
    encoding="utf-8",
    metadata_columns=cols_metadata,
    content_columns=cols_content,
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "skipinitialspace": True,
    },
)

# Only the first few documents are used in this notebook. Stream the rows with
# lazy_load() and stop after MAX_DOCS, instead of building a Document for every
# row with load() and throwing almost all of them away with a slice.
MAX_DOCS = 2
docs = list(islice(loader.lazy_load(), MAX_DOCS))

In [5]:
# Pretty-print the first loaded document (content and metadata) as a dict.
first_doc = docs[0]
pprint(first_doc.dict())

{'id': None,
 'metadata': {'Article_title': 'My 6 Largest Portfolio Holdings Heading Into '
                               '2024 -- and the Important Investing Lesson I '
                               'Learned From Each One',
              'Author': '',
              'Date': '2023-12-16 22:00:00 UTC',
              'Publisher': '',
              'Url': 'https://www.nasdaq.com/articles/my-6-largest-portfolio-holdings-heading-into-2024-and-the-important-investing-lesson-i',
              'row': 0,
              'source': './data.csv'},
 'page_content': 'Article: After an absolute disaster of a year in 2022, the '
                 'stock market appears to have turned the corner. Each of the '
                 'major market indexes has gained more than 20% from their '
                 'respective trough. Perhaps more importantly, the S&P 500 and '
                 'the Nasdaq Composite are within striking distance of new '
                 'highs, which will check the final box marking t

In [6]:
# Split the documents into smaller, overlapping chunks. The splitter is built
# with from_tiktoken_encoder, so both sizes below are counted in tokens.

CHUNK_SIZE = 100
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# NOTE: this rebinds `docs` to the chunks, so running this cell a second time
# without reloading the CSV would split the chunks again.
docs = text_splitter.split_documents(docs)

In [7]:
# Embed the chunks with OpenAI embeddings (default settings) and index them in
# a FAISS vector store.
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(documents=docs, embedding=embeddings)

# Expose the vector store through the retriever interface, default settings.
retriever = vector_store.as_retriever()

In [8]:
# Wrap the retriever as a named tool; the description is the text that tells a
# caller what the tool is for.
TOOL_NAME = "retriever_tool"
TOOL_DESCRIPTION = "A tool to retrieve information related to Apple stocks articles."

retriever_tool = create_retriever_tool(
    retriever=retriever,
    name=TOOL_NAME,
    description=TOOL_DESCRIPTION,
)

# Tool list; presumably handed to an agent in a later cell (not visible here).
tools = [retriever_tool]