In [2]:
import pymongo
from pymongo import MongoClient
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import openai 
from dotenv import load_dotenv
import os
import shutil
import argparse
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pymongo import MongoClient
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

chroma_client = chromadb.Client()
CHROMA_PATH = "chroma"
load_dotenv()
DATA_PATH = "downloaded_pdfs"

In [4]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")





In [6]:
def generate_data_store():
    documents = load_documents()
    chunks = split_documents(documents)
    save_to_chroma(chunks)


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)

def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


In [8]:
from PyPDF2 import PdfReader
def is_pdf_readable(file_path):
    """Check if a PDF file is readable."""
    try:
        with open(file_path, "rb") as f:
            pdf = PdfReader(f)
            if not pdf.pages:  # If there are no pages, the file is likely corrupted
                return False
            return True
    except Exception:
        return False  # Any error means the file is unreadable

def delete_unreadable_pdfs():
    """Find and delete unreadable PDF files in DATA_PATH."""
    for file in os.listdir(DATA_PATH):
        if file.endswith(".pdf"):
            file_path = os.path.join(DATA_PATH, file)
            if os.path.getsize(file_path) == 0 or not is_pdf_readable(file_path):  # Check for empty/corrupt files
                print(f"Deleting unreadable PDF: {file}")
                os.remove(file_path)

delete_unreadable_pdfs()
print("Cleanup complete.")

Deleting unreadable PDF: Verge-Anonymity-Centric-CryptoCurrency.pdf
Deleting unreadable PDF: Viberate.io_Whitepaper.pdf
Deleting unreadable PDF: vite_en.pdf
Deleting unreadable PDF: white_paper-2dc8c02267a8fb86bd67a108199441bf.pdf
Deleting unreadable PDF: White_Paper.pdf
Deleting unreadable PDF: WinkLink%20white%20paper.pdf
Deleting unreadable PDF: zerocaf.pdf
Cleanup complete.


In [9]:
generate_data_store()

Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wron

Saved 15274 chunks to chroma.


  db.persist()


In [10]:
from transformers import pipeline

def generate_response(prompt,model_name,tokenizer):
    generator = pipeline("question-answering", model=model_name, tokenizer=tokenizer)
    response = generator(prompt)
    return response['answer'] if 'answer' in response else "No answer found"

In [11]:
import subprocess
#from chromadb.utils import Chroma
from langchain.prompts.chat import ChatPromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

def run_ollama(model_name, prompt):
    """
    Run the Ollama model via its CLI and capture the output.
    """
    try:
        # Run the model with input via stdin
        result = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt,  # Provide the prompt as input
            capture_output=True,
            text=True,
            check=True,
        )
        return result.stdout.strip()  # Return the model's output
    except subprocess.CalledProcessError as e:
        print(f"Error running Ollama: {e}")
        print(f"Command output: {e.output}")
        return None



def main(text):
    model_name = 'llama3.2:latest'  # Use your model from Ollama
    query_text = text
    
    # Embed the query text using HuggingFaceEmbeddings
    embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
    results = db.similarity_search_with_relevance_scores(query_text, k=3)

    if len(results) == 0 or results[0][1] < 0.7:
        print(f"Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Generate a response using the Ollama model
    response_text = run_ollama(model_name, prompt)

    if not response_text:
        print("Failed to generate a response using Ollama.")
        return

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)


In [13]:
main("What Is Cryptocurrency?")

Response: Cryptocurrency is a digital means of financial exchange that can be exchanged online for goods and services, and it works using a technology called blockchain. It was originally intended to overcome limitations of existing currencies and financial transactions, but its value is governed by algorithms and technology rather than a central authority, making its value volatile and attractive as a speculative investment.
Sources: ['downloaded_pdfs\\12.28.21_crypto_seminar.pdf', 'downloaded_pdfs\\12.28.21_crypto_seminar.pdf', 'downloaded_pdfs\\CBP-8780.pdf']


In [14]:
main("what is the  total value of cryptocurrencies?")

Unable to find matching results.


In [15]:
main("What is Trading")

Unable to find matching results.


In [16]:
main("What are the benefits of cryptocurrency?")

Unable to find matching results.


In [17]:
main("What is Diligence?")

Unable to find matching results.


In [23]:
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import PromptTemplate
model = OllamaLLM(model="llama3.2")
template = """You are a Cryptocurrency consultant chatbot.

Answer the customer's questions only using the source data provided. Please answer to their specific question. If you are unsure, say "I don't know, please call our customer support". Use engaging, courteous, and professional language similar to a customer representative.
Keep your answers concise.

{context}

"""
prompt = PromptTemplate(template=template, input_variables=["context"])
formatted_prompt = prompt.format(
    context="A customer is on the cryptocurrency website and wants to chat with the website chatbot. They will ask you a question. Please answer to their specific question"
)
chain_type_kwargs = {"prompt": prompt}  # Pass our custom prompt template to the chain.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
from langchain.chains import RetrievalQA
chain = RetrievalQA.from_chain_type(
    llm=model, 
    chain_type="stuff",  
    retriever=db.as_retriever(search_kwargs={"k": 1}),  
    chain_type_kwargs=chain_type_kwargs,
)
query = "What is Cryptocurrency?"
response = chain.run(query)
print(response)

That's close! While I appreciate the enthusiasm, cryptocurrency isn't necessarily "unstoppable." Instead, it's a digital or virtual currency that uses cryptography for security and is decentralized, meaning it's not controlled by any government or financial institution. It operates independently on a network of computers around the world.

Would you like to know more about how cryptocurrencies work?


In [25]:
query = "how does cryptocurrencies work?"
response = chain.run(query)
print(response)

I'd be happy to help you understand what cryptocurrency is.

Cryptocurrency is a form of payment that can be exchanged online for goods and services, similar to arcade tokens or casino chips. You'll need to exchange real currency for it to access the good or service.
