In [5]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

In [12]:
loader = TextLoader(file_path="./data/harrypotter1_short.txt")

In [13]:
data = loader.load()

In [14]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your sample document')
print (f'Here is a sample: {data[0].page_content[:200]}')

You have 1 document(s) in your data
There are 66762 characters in your sample document
Here is a sample: Harry Potter and the Sorcerer's Stone


CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They


# Create chunks from text

In [18]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
texts = text_splitter.split_documents(data)

In [19]:
print (f'Now you have {len(texts)} documents')

Now you have 198 documents


# Generate summaries using custom T5 based model

In [23]:
model_id = 'jruranski/flan-t5-small-samsum'

In [24]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM


In [25]:

device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True) 

In [26]:
pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=700
)

In [27]:
local_llm = HuggingFacePipeline(pipeline=pipe)

In [28]:
print(local_llm('summarize: ' + data[0].page_content[:400] + '...'))

Harry Potter and the Sorcerer's Stone were proud to say that they were perfectly normal.


In [29]:
sum_texts = texts[:50]

In [30]:
summaries = [local_llm("summarize: " + chunk.page_content) for chunk in sum_texts]


In [32]:
print(texts[4])
print("==================\nSummary generated: ")
print(summaries[4])

page_content='When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story\nstarts, there was nothing about the cloudy sky outside to suggest that\nstrange and mysterious things would soon be happening all over the\ncountry. Mr. Dursley hummed as he picked out his most boring tie for\nwork, and Mrs. Dursley gossiped away happily as she wrestled a screaming\nDudley into his high chair.\n\nNone of them noticed a large, tawny owl flutter past the window.' metadata={'source': './data/harrypotter1_short.txt'}
Summary generated: 
Mr. and Mrs. Dursley woke up on the dull, gray Tuesday.


# Create embeddings in vector store (Chroma) from chunks

In [33]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import chromadb

In [8]:
!pip install langchain sentence_transformers



In [34]:
from langchain.embeddings import HuggingFaceEmbeddings

In [35]:
embeddings = HuggingFaceEmbeddings()

In [36]:
text = 'This is a test document'
query_result = embeddings.embed_query(text)
query_result[:3]

[-0.03844855725765228, -0.055053550750017166, -0.015172900632023811]

In [37]:
# Create documents from the summaries

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Save the combinded summaries to a file
with open('./data/summaries.txt', 'w') as f:
    for summary in summaries:
        f.write(summary + '\n')

In [38]:
loader = TextLoader(file_path="./data/summaries.txt")
summary_data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=150)
texts = text_splitter.split_documents(data)

In [39]:
# vectorstore = Chroma.from_documents(texts, embeddings)
vectorstore = Chroma.from_documents(texts, embeddings)

In [42]:
query = 'Who is Dudley?'
docs = vectorstore.similarity_search(query, k=5)

In [43]:
for doc in docs:
    print ( doc.page_content[:400])

Gordon were all big and stupid, but as Dudley was the biggest and
stupidest of the lot, he was the leader. The rest of them were all quite
happy to join in Dudley's favorite sport: Harry Hunting.
Harry was frying eggs by the time Dudley arrived in the kitchen with his
mother. Dudley looked a lot like Uncle Vernon. He had a large pink face,
not much neck, small, watery blue eyes, and thick blond hair that lay
was almost hidden beneath all Dudley's birthday presents. It looked as
though Dudley had gotten the new computer he wanted, not to mention the
second television and the racing bike. Exactly why Dudley wanted a
of Dudley's, and Dudley was about four times bigger than he was. Harry
had a thin face, knobbly knees, black hair, and bright green eyes. He
wore round glasses held together with a lot of Scotch tape because of
not much neck, small, watery blue eyes, and thick blond hair that lay
smoothly on his thick, fat head. Aunt Petunia often said that Dudley
looked like a baby angel -- 

# Create a chat interface with OpenAI GPT-3.5

In [2]:
os.environ["OPENAI_API_KEY"] = "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"

In [3]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'YourAPIKeyIfNotSet')
print(OPENAI_API_KEY)

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


In [4]:
llm = OpenAI(temperature=0.2, openai_api_key=OPENAI_API_KEY)

NameError: name 'OpenAI' is not defined