# RecDP LLM - RAG

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install /work/e2eAIOK/RecDP
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. Prepare your data

In [17]:
import os

# RECDP_MODELS_CACHE is used below, so it has to be imported in this cell: on a fresh
# kernel (Restart & Run All) no earlier cell has brought it into scope.
from pyrecdp.core.cache_utils import RECDP_MODELS_CACHE

# Working directory for the artifacts produced by this notebook.
os.makedirs("/content/test", exist_ok=True)

# 1. prepare sentence-transformers model
# The model is looked up under the "huggingface" folder of the RecDP model cache.
# NOTE(review): assumes the model files are already present at this path — confirm.
model_root_path = os.path.join(RECDP_MODELS_CACHE, "huggingface")
sentence_model_name = f"{model_root_path}/sentence-transformers/all-mpnet-base-v2"

# 2. prepare output path of faiss database and index name
faiss_output_path = "/content/test/faiss"
faiss_index_name = 'test_index'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 3. RAG Workflow

### Step 1. Load, Split and Store

In [12]:
# The wildcard import stays first so that the explicit imports below take precedence
# on any name clash. Url_Loader, DocumentSplit and DocumentIngestion are presumably
# supplied by it, since nothing else in this cell imports them.
from pyrecdp.primitives.operations import *
from pyrecdp.LLM import TextPipeline
from pyrecdp.core.cache_utils import RECDP_MODELS_CACHE
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
import os
import pandas as pd
from pandas import option_context

# Path read back at the end of this cell to preview the processed text.
# NOTE(review): no operation in the pipeline below visibly writes to this path —
# confirm what produces it before relying on the preview.
parquet_file = "/content/test/parquet"

pipeline = TextPipeline()

# Stage 1: load the press-release page, keeping only the <div class="main-content"> element.
load_step = Url_Loader(
    ["https://www.intc.com/news-events/press-releases/detail/1655/intel-reports-third-quarter-2023-financial-results"],
    target_tag='div',
    target_attrs={'class': 'main-content'},
)

# Stage 2: split the loaded document with the recursive character splitter (default arguments).
split_step = DocumentSplit(text_splitter='RecursiveCharacterTextSplitter', text_splitter_args={})
# Alternative splitter driven by the sentence-transformers model:
# split_step = DocumentSplit(text_splitter='SentenceTransformersTokenTextSplitter', text_splitter_args={'model_name': sentence_model_name})

# Stage 3: embed the chunks and store them in a FAISS index.
# faiss_output_path, faiss_index_name and sentence_model_name come from the
# "Prepare your data" cell, which must have been run first.
ingest_step = DocumentIngestion(
    vector_store='FAISS',
    vector_store_args={
        "output_dir": faiss_output_path,
        "index": faiss_index_name,
    },
    embeddings='HuggingFaceEmbeddings',
    embeddings_args={
        'model_name': sentence_model_name,
    },
)

ops = [load_step, split_step, ingest_step]
pipeline.add_operations(ops)
pipeline.execute()

# Preview the "text" column, widening the column display so chunks are not cut short.
with option_context('display.max_colwidth', 400):
    ds = pd.read_parquet(parquet_file)
    display(ds[["text"]])


init ray
init ray with total mem of 324413575987, total core of 48
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has a

2023-11-22 15:37:00,503	INFO worker.py:1642 -- Started a local Ray instance.


execute with ray started ...


2023-11-22 15:37:04,826	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[FlatMap(<lambda>)->MapBatches(DocEmbedding)]
2023-11-22 15:37:04,827	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-22 15:37:04,828	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-11-22 15:37:07,157	INFO actor_pool_map_operator.py:106 -- FlatMap(<lambda>)->MapBatches(DocEmbedding): Waiting for 1 pool actors to start...


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]hes(DocEmbedding)) pid=307178)[0m 
Batches: 100%|██████████| 1/1 [00:13<00:00, 13.08s/it]mbedding)) pid=307178)[0m 
2023-11-22 15:37:27,327	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[FlatMap(<lambda>)->MapBatches(DocEmbedding)]
2023-11-22 15:37:27,328	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-22 15:37:27,329	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-11-22 15:37:29,204	INFO actor_pool_map_operator.py:106 -- FlatMap(<lambda>)->MapBatches(DocEmbedding): Waiting for 1 pool actors to start...


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]hes(DocEmbedding)) pid=307938)[0m 


execute with ray took 47.292022357694805 sec


Unnamed: 0,text
0,"Intel Reports Third-Quarter 2023 Financial Results ================================================== [Download as PDF](https://d1io3yog0oux5.cloudfront.net/_68614b7f1d0de99bd7f726f92c573cc8/intel/db/887/8973/earnings_release/Q3+23+EarningsRelease.pdf ""PDF: Intel Reports Third-Quarter 2023 Financial Results"") \r Oct 26, 2023 • 4:01 PM EDT ### Related Documents [Audio](https://..."
1,Q3 2023 Financial Highlights | | | | | | --- | --- | --- | --- | | | **GAAP** | | **Non-GAAP** | | | **Q3 2023** | **Q3 2022** | **vs. Q3 2022** | | **Q3 2023** | **Q3 2022** | **vs. Q3 2022** | | Revenue ($B) | $14.2 | $15.3 | down 8% | | | | | | Gross Margin | 42.5% | 42.6% | down 0.1 ppt | | 45.8% | 45.9% | down 0.1 ppt | | R&D and MG&A ($B) | $5.2 | $6.0 | down 1...
2,"▪ Intel announced that a major customer committed to Intel 18A and Intel 3 with a meaningful pre-payment, allowing the company to accelerate its plans to build two new leading-edge chip factories at its Ocotillo campus in Chandler, Arizona. In addition, IFS and Tower Semiconductor announced an agreement where Intel will provide foundry services and 300 mm manufacturing capacity to help T..."
3,Business Outlook Intel's guidance for the fourth quarter of 2023 includes both GAAP and non-GAAP estimates. Reconciliations between GAAP and non-GAAP financial measures are included below.\* | | | | | | | --- | --- | --- | --- | --- | | **Q4 2023** | | **GAAP\*** | | **Non-GAAP\*** | | Revenue | | $14.6-15.6 billion | | $14.6-15.6 billion^ | | Gross Margin | | 43.3% | | ...
4,"• future production capacity and product supply; • supply expectations, including regarding constraints, limitations, pricing, and industry shortages; • plans and goals related to Intel’s foundry business, including with respect to anticipated customers, future manufacturing capacity and service, technology and IP offerings; • expected timing and impact of acquis..."
5,"Unless specifically indicated otherwise, the forward-looking statements in this release do not reflect the potential impact of any divestitures, mergers, acquisitions, or other business combinations that have not been completed as of the date of this filing. In addition, the forward-looking statements in this release are based on management's expectations as of the date of this release, unless..."
6,"| | | | | --- | --- | --- | | | | **Three Months Ended** | | **(In Millions)** | | **Sep 30, 2023** | | **Jul 2, 2022** | | **Earnings per share of common stock information:** | | | | | | **Weighted average shares of common stock outstanding—basic** | | **4,202** | | **4,118** | | Dilutive effect of employee equity incentive plans | | 27 | | ..."
7,"Intel Corporation Consolidated Condensed Statements of Cash Flows | | | | | --- | --- | --- | | | | **Nine Months Ended** | | **(In Millions; Unaudited)** | | **Sep 30, 2023** | | **Oct 1, 2022** | | | | | | | | **Cash and cash equivalents, beginning of period** | | **$ 11,144** | | **$ 4,827** | | Cash flows provided by (used for) operating ac..."
8,"Intel Corporation Supplemental Operating Segment Results | | | | | --- | --- | --- | | | | **Three Months Ended** | | **(In Millions)** | | **Sep 30, 2023** | | **Oct 1, 2022** | | **Net revenue:** | | | | | | **Client Computing** | | | | | | Desktop | | $ 2,753 | | $ 3,222 | | Notebook | | 4,503 | | 4,408 | | O..."
9,"▪ acquisition-related costs, including amortization and any impairment of acquisition-related intangibles and goodwill. Intel Corporation Explanation of Non-GAAP Measures In addition to disclosing financial results in accordance with US GAAP, this document contains references to the non-GAAP financial measures below. We believe these non-GAAP financial measures provide inves..."


Batches: 100%|██████████| 1/1 [00:12<00:00, 12.95s/it]mbedding)) pid=307938)[0m 


### Step 2. Retrieve

In [16]:

questions = [
    "Has Intel made any organizational structure adjustments?",
    "What is DCAI's revenue in the third quarter and what is the year-on-year situation?",
    "What is Intel's revenue in the third quarter and what is the year-on-year situation?",
]

# Reopen the FAISS index written in Step 1, using the same embedding model name
# that was passed to the ingestion step.
embeddings = HuggingFaceEmbeddings(model_name=sentence_model_name)
new_db = FAISS.load_local(faiss_output_path, embeddings=embeddings, index_name=faiss_index_name)


def retrieve_passages(query):
    """Return the page content of every document `new_db.similarity_search` yields for `query`."""
    return [match.page_content for match in new_db.similarity_search(query)]


# Map each question to the list of retrieved passages.
result = {question: retrieve_passages(question) for question in questions}

# Print one block per question: the question, then each passage indented by four spaces,
# followed by two blank lines.
for question, answers in result.items():
    report = [f"question: {question}", "answers:"]
    report.extend(f"    {answer}" for answer in answers)
    report.append("\n")
    print("\n".join(report))
    

question: Has Intel made any organizational structure adjustments?
answers:
    Unless specifically indicated otherwise, the forward-looking statements in this release do not reflect the potential impact of any divestitures, mergers, acquisitions, or other business combinations that have not been completed as of the date of this filing. In addition, the forward-looking statements in this release are based on management's expectations as of the date of this release, unless an earlier date is specified, including expectations based on third-party information and projections that management believes to be reputable. We do not undertake, and expressly disclaim any duty, to update such statements, whether as a result of new information, new developments, or otherwise, except to the extent that disclosure may be required by law.          About Intel   Intel (Nasdaq: INTC) is an industry leader, creating world-changing technology that enables global progress and enriches lives. Inspired by Mo