In [1]:
!pip install langchain
!pip install torch
!pip install sentence_transformers
!pip install faiss-cpu
!pip install huggingface-hub
!pip install pypdf
!pip -q install accelerate
!pip install llama-cpp-python
!pip -q install git+https://github.com/huggingface/transformers


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader

In [3]:
# Load the PDF files found under PDF_DIR into `data`.
PDF_DIR = "/content/"
loader = PyPDFDirectoryLoader(PDF_DIR)
data = loader.load()

# Fail fast when nothing was loaded so the problem surfaces here instead of in
# the later cells that split and index `data`.
if not data:
    raise ValueError(f"No PDF documents were loaded from {PDF_DIR!r}")

In [4]:
# Dump the loaded documents for a quick visual check.
# NOTE(review): in the recorded output the extracted text has no spaces between
# words (e.g. "Dataminingisaprocessof..."), which suggests the PDF text
# extraction is dropping whitespace - confirm before relying on retrieval quality.
print(data)

[Document(page_content="Dataminingisaprocessofdiscoveringhiddenpatterns,relationships,andvaluableinsightsfrom\nlargedatasets.Itinvolvesvarioustechniquesandmethodstoextractmeaningfulinformationfrom\ndata.Here'sacomprehensiveoverviewofdatamining:\nDefinition:Dataminingistheprocessofanalyzingandextractinginformationfromlargedatasetsto\nuncoverpatterns,correlations,trends,andinsightsthatarenotreadilyapparent.\nDataSources:\nStructuredData:Thistypeofdataisorganizedintorowsandcolumns,likedatainrelational\ndatabases.Itincludesnumericaldata,categoricaldata,anddates.\nSemi-StructuredData:Datathatdoesn'tconformtoarigidstructurebuthassomelevelof\norganization,suchasXMLorJSONfiles.\nUnstructuredData:Thisincludestext,images,audio,andvideo.Extractingmeaningfulinformation\nfromunstructureddatacanbechallengingbutisacrucialpartofdatamining.\nDataMiningGoals:\nPredictiveModelinginvolvesbuildingmodelstopredictfutureoutcomes.Forexample,predicting\ncustomerchurninatelecomcompany.\nDescriptiveModelingfocuse

In [5]:
# Step 05: Split the extracted documents into overlapping text chunks
CHUNK_SIZE = 900     # chunk_size handed to the splitter
CHUNK_OVERLAP = 50   # chunk_overlap handed to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
text_chunks = text_splitter.split_documents(data)


In [6]:
# Number of chunks produced by the splitter.
len(text_chunks)

12

In [7]:
# Inspect the third chunk (index 2) to sanity-check the split and its metadata.
text_chunks[2]

Document(page_content='DataReduction:Reducingthevolumebutproducingthesameorsimilaranalyticalresults,for\nexample,byusingsamplingoraggregationtechniques.\nAlgorithms:\nDecisionTrees:Thesearetree-likemodelsthatmakedecisionsbysplittingthedataintosubsets\nbasedoninputfeatures.\nK-MeansClustering:Usedtogroupdatapointsintoclustersbasedonsimilarity.\nNeuralNetworks:Aclassofmachinelearningalgorithmsinspiredbythestructureofthehumanbrain,\noftenusedindeeplearningfortaskslikeimageandspeechrecognition.\nSupportVectorMachines(SVM):Asupervisedlearningalgorithmusedforclassificationand\nregressiontasks.\nAprioriAlgorithm:Aclassicalgorithmforassociationrulemining,oftenusedinmarketbasketanalysis.\nDataMiningProcess:\nDataSelection:Carefulselectionofrelevantdatasources,whichcaninvolveintegratingdatafrom\nmultipledatabasesandexternalsources.\nDataCleaningandPreprocessing:Asmentionedearlier,thisstepiscrucialtoensuredataquality.', metadata={'source': '/content/data_mining.pdf', 'page': 0})

In [8]:
# Step 06: Set up the embedding model used to vectorize the text chunks
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


In [9]:
# Step 08: Embed every text chunk and collect the vectors in a FAISS store
vector_store = FAISS.from_documents(
    text_chunks,
    embedding=embeddings,
)

In [10]:
!wget "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

--2023-10-22 16:47:09--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 18.172.134.24, 18.172.134.124, 18.172.134.88, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.24|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/46/12/46124cd8d4788fd8e0879883abfc473f247664b987955cc98a08658f7df6b826/14466f9d658bf4a79f96c3f3f22759707c291cac4e62fea625e80c7d32169991?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.1.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.1.Q4_K_M.gguf%22%3B&Expires=1698248873&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5ODI0ODg3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy80Ni8xMi80NjEyNGNkOGQ0Nzg4ZmQ4ZTA4Nzk4ODNhYmZjNDczZjI0NzY2NGI5ODc5NTVjYzk4YTA4NjU4ZjdkZjZiODI2LzE0NDY2Zjl

In [11]:
# Load the downloaded GGUF model through the LlamaCpp wrapper
MODEL_PATH = "/content/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=4096,
    temperature=0.75,
    top_p=1,
    streaming=True,
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [12]:
# Build the retrieval QA chain: the vector store is exposed as a retriever
# configured with k=2, and the chain is created with chain_type="stuff".
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [13]:
# Sample question used for a one-off run of the QA chain.
query = "What is data mining?"

In [14]:
# Run the chain on the sample question; the answer is shown as the cell output.
qa.run(query)

' Data mining is the process of discovering patterns and knowledge from large amounts of data using various techniques such as predictive modeling, descriptive modeling, pattern discovery, anomaly detection, data preprocessing, and open-source data mining tools. These techniques are used in various fields like healthcare, finance, etc., to achieve specific goals like predicting disease outbreaks, personalizing treatment plans, credit scoring, fraud detection, algorithmic trading, etc. Data mining helps organizations make informed decisions based on their data.'

In [18]:
# NOTE(review): `sys` is not used in this cell; the import is kept in case
# other cells rely on it.
import sys

# Interactive question loop over the retrieval QA chain.
# Type "exit" (any case, surrounding whitespace ignored) to stop; blank or
# whitespace-only input is skipped instead of being sent to the model.
while True:
  try:
    user_input = input("Input Prompt: ").strip()
  except (EOFError, KeyboardInterrupt):
    # End-of-input or an interrupt at the prompt is treated like "exit"
    # rather than surfacing a traceback.
    print('Exiting')
    break
  if user_input.lower() == 'exit':
    print('Exiting')
    break
  if not user_input:
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

Input Prompt: what is data preprocessing?


Llama.generate: prefix-match hit


Answer:  Data preprocessing is a step in the data mining process that involves cleaning and preparing raw data for analysis. It includes tasks such as data normalization, data integration, data reduction, missing value imputation, outlier detection and removal, and variable selection. Data preprocessing helps to improve the quality of the data and make it more suitable for analysis, which can lead to better insights and more accurate results.
Input Prompt: exit
Exiting
