In [1]:
import os
import time

from dotenv import load_dotenv

from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [2]:
# Load variables from a local .env file into the process environment so the
# API keys below can be read with os.getenv. The `True` shown as cell output
# is load_dotenv's return value.
load_dotenv()

True

In [3]:
# Pull the service credentials out of the environment; each is None when the
# corresponding variable is not set.
_env = os.environ
OPENAI_API_KEY = _env.get("OPENAI_API_KEY")
PINECONE_API_KEY = _env.get("PINECONE_API_KEY")

In [4]:
# Directory (relative to this notebook) that holds the source PDFs.
directory_path = "../dataset/pdfs/"
# Loader pointed at that directory; the documents are read in the next cell.
loader = PyPDFDirectoryLoader(directory_path)

In [5]:
# Read the PDFs into a list of Document objects.
# NOTE(review): appears to be one Document per PDF page, judging by the
# 'page' key in the metadata shown further down — confirm.
data = loader.load()

In [6]:
# Peek at the first loaded Document to sanity-check the extracted text.
data[0]

Document(page_content='Universal Language Model Fine-tuning for Text Classiﬁcation\nJeremy Howard∗\nfast.ai\nUniversity of San Francisco\nj@fast.aiSebastian Ruder∗\nInsight Centre, NUI Galway\nAylien Ltd., Dublin\nsebastian@ruder.io\nAbstract\nInductive transfer learning has greatly im-\npacted computer vision, but existing ap-\nproaches in NLP still require task-speciﬁc\nmodiﬁcations and training from scratch.\nWe propose Universal Language Model\nFine-tuning (ULMFiT), an effective trans-\nfer learning method that can be applied to\nany task in NLP, and introduce techniques\nthat are key for ﬁne-tuning a language\nmodel. Our method signiﬁcantly outper-\nforms the state-of-the-art on six text clas-\nsiﬁcation tasks, reducing the error by 18-\n24% on the majority of datasets. Further-\nmore, with only 100labeled examples, it\nmatches the performance of training from\nscratch on 100×more data. We open-\nsource our pretrained models and code1.\n1 Introduction\nInductive transfer learning 

In [7]:
# Splitter configured for chunks of at most 500 units with 20 units shared
# between neighbouring chunks.
# NOTE(review): units are presumably characters (the splitter's default
# length function) — confirm against the splitter docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

In [8]:
# Break the loaded Documents into smaller Documents that are embedded and
# indexed later in the notebook.
text_chunks = text_splitter.split_documents(data)

In [9]:
# Show only the first few chunks: dumping the whole list bloats the notebook
# output and buries the narrative. The total count is shown in the next cell.
text_chunks[:3]

[Document(page_content='Universal Language Model Fine-tuning for Text Classiﬁcation\nJeremy Howard∗\nfast.ai\nUniversity of San Francisco\nj@fast.aiSebastian Ruder∗\nInsight Centre, NUI Galway\nAylien Ltd., Dublin\nsebastian@ruder.io\nAbstract\nInductive transfer learning has greatly im-\npacted computer vision, but existing ap-\nproaches in NLP still require task-speciﬁc\nmodiﬁcations and training from scratch.\nWe propose Universal Language Model\nFine-tuning (ULMFiT), an effective trans-\nfer learning method that can be applied to', metadata={'source': '..\\dataset\\pdfs\\ULMFiT.pdf', 'page': 0}),
 Document(page_content='any task in NLP, and introduce techniques\nthat are key for ﬁne-tuning a language\nmodel. Our method signiﬁcantly outper-\nforms the state-of-the-art on six text clas-\nsiﬁcation tasks, reducing the error by 18-\n24% on the majority of datasets. Further-\nmore, with only 100labeled examples, it\nmatches the performance of training from\nscratch on 100×more data. We 

In [10]:
# Number of chunks produced by the splitter.
len(text_chunks)

101

In [11]:
# print (rather than repr) the first chunk so embedded newlines render as
# real line breaks.
print(text_chunks[0].page_content)

Universal Language Model Fine-tuning for Text Classiﬁcation
Jeremy Howard∗
fast.ai
University of San Francisco
j@fast.aiSebastian Ruder∗
Insight Centre, NUI Galway
Aylien Ltd., Dublin
sebastian@ruder.io
Abstract
Inductive transfer learning has greatly im-
pacted computer vision, but existing ap-
proaches in NLP still require task-speciﬁc
modiﬁcations and training from scratch.
We propose Universal Language Model
Fine-tuning (ULMFiT), an effective trans-
fer learning method that can be applied to


In [12]:
# Same check for the last chunk, to see where the split documents end.
print(text_chunks[-1].page_content)

siﬁcation. In Advances in neural information pro-
cessing systems . pages 649–657.
Peng Zhou, Zhenyu Qi, Suncong Zheng, Jiaming Xu,
Hongyun Bao, and Bo Xu. 2016. Text classiﬁcation
improved by integrating bidirectional lstm with two-
dimensional max pooling. In Proceedings of COL-
ING 2016 .


In [13]:
# Embedding client with library defaults.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# populated by load_dotenv() — confirm.
embeddings = OpenAIEmbeddings()

In [14]:
# Embed a throwaway string and report the vector length; this is the
# dimension the Pinecone index has to be created with.
probe_vector = embeddings.embed_query("Hey there!")
len(probe_vector)

1536

In [15]:
# Pinecone client authenticated with the key read from the environment above.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [16]:
# Name of the Pinecone index that the chunks are stored in and queried from.
index_name = "document-qna"

In [17]:
# Create the Pinecone index on first run; later runs reuse the existing one.

# Length of the vectors produced by `embeddings` (the embed_query probe
# earlier in this notebook returned 1536). The index dimension has to match
# the embedding length, so this must change if the embedding model changes.
EMBEDDING_DIMENSION = 1536

# Deployment target for the new index.
# NOTE(review): the cloud/region defaults are an assumption — confirm them for
# your Pinecone project, or override via PINECONE_CLOUD / PINECONE_REGION.
spec = ServerlessSpec(
    cloud=os.getenv("PINECONE_CLOUD", "aws"),
    region=os.getenv("PINECONE_REGION", "us-east-1"),
)

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=spec,
    )
    # Poll once per second until Pinecone reports the new index as ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [18]:
# Vector-store wrapper bound to the index named above, using `embeddings`
# as its embedding function.
# NOTE(review): presumably reads PINECONE_API_KEY from the environment, since
# no client or key is passed here — confirm.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [19]:
# Embed every chunk and upsert it into the index. from_documents (called on
# the class, since it is a constructor-style classmethod) stores each chunk's
# metadata (source file, page) alongside its text, which from_texts on the
# bare page_content strings would discard.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again — confirm, and clear the index first if that matters.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [20]:
# Question used for both the raw similarity search and the QA chain below.
query = "What does ULMFiT solve?"

In [21]:
# Retrieve the chunks the vector store ranks as most similar to the query
# (library-default number of results).
docs = docsearch.similarity_search(query)

In [22]:
# LLM client with library defaults; it is handed to the RetrievalQA chain below.
llm = OpenAI()

In [23]:
# Build the question-answering chain: the vector store is exposed as a
# retriever, and the "stuff" chain type is used to combine what it returns
# for the LLM.
chunk_retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=chunk_retriever)

In [24]:
# Run the chain on the query and print only the generated answer text.
qa_response = qa.invoke(query)
print(qa_response["result"])

 ULMFiT (Universal Language Model Fine-tuning) is a technique used in natural language processing to improve the performance of language models on downstream tasks.

ULMFiT solves the problem of adapting language models to specific tasks or domains, by fine-tuning a pre-trained language model on a specific dataset. This helps to improve the performance of the language model on a particular task, without the need for extensive training on the new dataset. It also addresses the issue of transfer learning, where a language model trained on one task may not perform well on another task without additional training.
