In [1]:
import os
from dotenv import load_dotenv

# load_dotenv() comes from python-dotenv; it is expected to populate the
# process environment (by default from a local .env file) before the lookup
# below - see the python-dotenv docs for the exact search rules.
load_dotenv()

# API key handed to the OpenAI-backed objects created later in the notebook.
# The value is None when the OPEN_AI_KEY variable is not set.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

In [2]:
import requests
from langchain.document_loaders import PyPDFLoader

# arXiv PDFs to download and load.
urls = [
    'https://arxiv.org/pdf/2306.06031v1.pdf',
    'https://arxiv.org/pdf/2306.12156v1.pdf',
    'https://arxiv.org/pdf/2306.14289v1.pdf',
    'https://arxiv.org/pdf/2305.10973v1.pdf',
    'https://arxiv.org/pdf/2306.13643v1.pdf'
]

# Seconds to wait on the server before giving up; without a timeout
# requests.get() can block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Flat list collecting the Document objects returned for every PDF.
ml_papers = []

for i, url in enumerate(urls):
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of saving an error page as a ".pdf".
    response.raise_for_status()

    filename = f'paper{i+1}.pdf'
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'Descargado {filename}')

    # Parse only after the `with` block has closed the file: buffered writes
    # are only guaranteed to be on disk once the handle is flushed/closed, so
    # loading while it is still open can read a truncated PDF.
    loader = PyPDFLoader(filename)
    data = loader.load()  # list of langchain Document objects for this PDF
    ml_papers.extend(data)  # keep every paper's documents in one list


Descargado paper1.pdf
Descargado paper2.pdf
Descargado paper3.pdf
Descargado paper4.pdf
Descargado paper5.pdf


In [3]:
# Sanity check: container type, number of loaded entries, and one sample entry.
(type(ml_papers), len(ml_papers), ml_papers[4])

(list,
 57,
 Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-06-12T00:32:18+00:00', 'author': '', 'keywords': '', 'moddate': '2023-06-12T00:32:18+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'templateversion': 'IJCAI.2023.0', 'title': '', 'trapped': '/False', 'source': 'paper1.pdf', 'total_pages': 7, 'page': 4, 'page_label': '5'}, page_content='ingest data in real-time. This data could be streaming from\nour data source APIs. Below are the steps to design a real-\ntime NLP pipeline for data ingestion.\nData cleaning: Real-time data can be noisy and inconsis-\ntent. Therefore, real-time data cleaning involves removing\nirrelevant data, handling missing values, text normalization\n(like lowercasing), and error corrections.\nTokenization: In real-time applications, tokenization has\nto be performed on the fly. This involves breaking down the\nstre

In [4]:
# Dividir el texto en fragmentos (chunks) para luego convertirlo a números con embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration:
#   - length_function=len -> chunk length is measured in characters
#   - chunk_size          -> size limit, in characters, for each chunk
#   - chunk_overlap       -> characters shared between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1500,
    chunk_overlap=200,
)

# Break the loaded documents into chunks; `documents` is what gets embedded later.
documents = text_splitter.split_documents(ml_papers)


In [5]:
# Sanity check: how many chunks the splitter produced, plus one sample chunk.
(len(documents), documents[10])

(211,
 Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-06-12T00:32:18+00:00', 'author': '', 'keywords': '', 'moddate': '2023-06-12T00:32:18+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'templateversion': 'IJCAI.2023.0', 'title': '', 'trapped': '/False', 'source': 'paper1.pdf', 'total_pages': 7, 'page': 2, 'page_label': '3'}, page_content='highly volatile, changing rapidly in response to news events\nor market movements.\nTrends, often observable through websites like Seeking\nAlpha, Google Trends, and other finance-oriented blogs and\nforums, offer critical insights into market movements and in-\nvestment strategies. They feature:\n• Analyst perspectives: These platforms provide access to\nmarket predictions and investment advice from seasoned\nfinancial analysts and experts.\n• Market sentiment: The discourse on these platforms can\nreflect th

# Embeddings e ingesta a la base de datos

In [6]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client used to turn each chunk into a vector.
# NOTE(review): the saved cell output echoes this constructor line, which looks
# like the tail of a truncated warning - presumably a deprecation notice for
# this class; confirm by re-running the cell.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=OPEN_AI_KEY,
)

# Build the Chroma vector store from the chunked documents.
vectorstore = Chroma.from_documents(embedding=embeddings, documents=documents)

# Retriever view over the store; k=3 presumably caps how many chunks are
# returned per query - confirm against the LangChain retriever docs.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

  embeddings =  OpenAIEmbeddings(


# Modelos de chat y cadenas para consulta de información

In [7]:
# Definir el modelo de lenguaje que resolverá la pregunta

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the final answer; temperature 0.0 is passed to keep
# sampling randomness at its minimum setting.
# NOTE(review): the saved cell output echoes this constructor line, which looks
# like the tail of a truncated warning - presumably a deprecation notice for
# this class; confirm by re-running the cell.
chat = ChatOpenAI(
    temperature=0.0,
    model_name='gpt-3.5-turbo',
    openai_api_key=OPEN_AI_KEY,
)

# Question-answering chain wiring the retriever to the chat model.
# chain_type "stuff" presumably places all retrieved chunks into one prompt -
# confirm against the RetrievalQA docs.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=chat,
)

  chat = ChatOpenAI(


In [10]:
# Ask the chain a question and show only the answer text, which is stored under
# the 'result' key of the value returned by invoke().
query = "Que es fingpt?"
answer = qa_chain.invoke(query)
print(answer['result'])

FinGPT es un modelo de lenguaje de código abierto diseñado para el procesamiento de lenguaje natural en el ámbito financiero. Se enfoca en aprovechar el potencial de los modelos de lenguaje grandes para aplicaciones financieras, como servicios de asesoramiento robótico, trading cuantitativo y desarrollo de bajo código. FinGPT adopta un enfoque centrado en los datos y cuenta con un marco de trabajo de extremo a extremo con cuatro capas para garantizar la calidad de los datos y abordar la sensibilidad temporal de los datos financieros. Además, FinGPT forma parte de la comunidad de inteligencia artificial AI4Finance, con el objetivo de fomentar la innovación, democratizar los modelos de lenguaje financiero y desbloquear nuevas oportunidades en finanzas abiertas.
