In [1]:
%pip install -qU langchain pypandoc deeplake unstructured InstructorEmbedding torch sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [11]:
%pip show langchain

Name: langchain
Version: 0.0.170
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /home/limcheekin/ws/py/learn-langchain/venv/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [12]:
# If pandoc not found
#!sudo apt-get install pandoc

In [3]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredEPubLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [4]:
loader = UnstructuredEPubLoader("./books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub")

In [5]:
data = loader.load()

In [6]:
len(data)

1

In [17]:
print(data[0].page_content)
print(data[0].metadata)

Competing Against Time

FREE PRESS
Rockefeller Center
1230 Avenue of the Americas
New York, NY 10020
www.SimonandSchuster.com

Copyright © 1990 by The Free Press

All rights reserved, including the right of reproduction in whole of in part in any form.

FREE PRESS and colophon are trademarks of Macmillan Library Research USA, Inc. under license by Simon & Schuster, the publisher of this work.

Designed by

Manufactured in the United States of America

10 9 8 7 6 5 4 3 2 1

Library of Congress Cataloging-In-Publication Data

Stalk, George.

Competing against time : how time-based competition is reshaping global markets / George Stalk, Jr. (and) Thomas M. Hout.

p.   cm.

Includes bibliographical references.

ISBN 0-7432-5341-8

ISBN 13 978-0-7432-5341-3

eISBN 13 978-1-4391-0541-2

1. Time management.   2. Delivery of goods.   3. Competition, International.   4. Comparative advantage (International trade) I. Hout, Thomas M.   II. Title

HD69.T54S73  1990

658.5’6—dc20   89-23735

CIP

F

In [18]:
loader = UnstructuredEPubLoader("./books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub", mode="elements")

In [19]:
data = loader.load()

In [20]:
data[0]

Document(page_content='Competing Against Time', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'Title'})

In [21]:
len(data)

2723

### Chunk your data up into smaller documents

In [7]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [8]:
print (f'Now you have {len(texts)} documents')

Now you have 354 documents


### Create embeddings of your documents to get ready for semantic search

In [9]:
from langchain.vectorstores import DeepLake
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [10]:
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [11]:
db = DeepLake.from_documents(texts, dataset_path="./deeplake/SPLIT Competing Against Time_How Time-Based Competition is Reshaping Global Markets", embedding=embeddings, overwrite=True)


./deeplake/SPLIT Competing Against Time_How Time-Based Competition is Reshaping Global Markets loaded successfully.


Evaluating ingest: 100%|██████████| 1/1 [34:13<00:00


Dataset(path='./deeplake/SPLIT Competing Against Time_How Time-Based Competition is Reshaping Global Markets', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (354, 768)  float32   None   
    ids      text     (354, 1)     str     None   
 metadata    json     (354, 1)     str     None   
   text      text     (354, 1)     str     None   




In [27]:
query = "Give me overview of the book"
docs = db.similarity_search(query)

In [28]:
# Here's an example of the first document that was returned
print(docs)

[Document(page_content='ISBN 13 978-0-7432-5341-3', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}), Document(page_content='ISBN 0-7432-5341-8', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}), Document(page_content='eISBN 13 978-1-4391-0541-2', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}), Document(page_content='Includes bibliographical references.', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'Title'})]


### Query those docs to get your answer back

In [18]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [19]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [35]:
db = DeepLake(dataset_path="./deeplake/Competing Against Time_How Time-Based Competition is Reshaping Global Markets", embedding_function=embeddings, read_only=True)

./deeplake/Competing Against Time_How Time-Based Competition is Reshaping Global Markets loaded successfully.





Deep Lake Dataset in ./deeplake/Competing Against Time_How Time-Based Competition is Reshaping Global Markets already exists, loading from the storage
Dataset(path='./deeplake/Competing Against Time_How Time-Based Competition is Reshaping Global Markets', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (2723, 768)  float32   None   
    ids      text     (2723, 1)     str     None   
 metadata    json     (2723, 1)     str     None   
   text      text     (2723, 1)     str     None   




In [37]:
query = "What key messages the book try to convey?"
docs = db.as_retriever().get_relevant_documents(query)

In [38]:
docs

[Document(page_content='ISBN 0-7432-5341-8', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}),
 Document(page_content='ISBN 13 978-0-7432-5341-3', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}),
 Document(page_content='eISBN 13 978-1-4391-0541-2', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'UncategorizedText'}),
 Document(page_content='LESSONS FROM MAD RIVER', metadata={'source': './books/Competing Against Time_How Time-Based Competition is Reshaping Global Markets.epub', 'page_number': 1, 'category': 'Title'})]

In [22]:
chain.run(input_documents=docs, question=query)

' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'