# From llama-index to DPK - Importing Gmail emails example

### This notebook shows how llama-index documents can be processed by DPK.

To execute this notebook, ensure the credentials.json file is located in the notebook directory. 
For instructions on creating this file, please visit: LlamaHub Google Reader Guide (https://llamahub.ai/l/readers/llama-index-readers-google?from=)

In [None]:
%pip install -qq -r requirements.txt

In [None]:
%pip install -qq -r dpk-requirements.txt

In [None]:
%pip install -qq llama-index

In [None]:
%pip install -qq llama-index-readers-google
%pip install -qq llama-index google-auth google-auth-oauthlib google-auth-httplib2

In [None]:
import shutil
import os
# All output locations are resolved relative to the directory the notebook runs in.
cwd = os.getcwd()

output_base_path = f"{cwd}/output"

# The trailing slash is intentional: other cells build paths by plain string
# concatenation (e.g. output_folder + 'out.parquet').
output_folder =  f"{output_base_path}/llama_index/"

# Remove the results of any previous run before recreating the folders.
shutil.rmtree(output_base_path, ignore_errors=True)
print (f"✅ Cleared {output_folder} directory")
# makedirs creates output_base_path and output_folder in one call. exist_ok=True
# matters because rmtree above ignores errors: if part of the tree survived,
# a plain os.mkdir would raise FileExistsError here.
os.makedirs(output_folder, exist_ok=True)

## Ingest emails from Gmail

In [None]:
from llama_index.readers.google import GmailReader

# Gmail reader used by the ingestion cell below.
# NOTE(review): an empty query presumably applies no search filter, and
# service=None presumably lets the reader build its own Gmail API client from
# the local credentials file — confirm against the GmailReader documentation.
reader = GmailReader(
    query="",
    results_per_page=1000,
    service=None,
)


In [None]:
# Load data: ask the Gmail reader for the emails and keep the returned
# llama-index documents; the next cell reads their metadata, doc_id and text.
# NOTE(review): presumably starts the Google authorization flow on first use — confirm.
documents = reader.load_data()

## Convert llama index documents to parquet file

In [None]:
# convert llama-index documents to parquet files
import pandas as pd
# Column-oriented buffer: each key becomes one column of the resulting table,
# and every document contributes exactly one value to each column.
data = {
    "contents": [],
    "llama_index_metadata": [],
    "llama_index_doc_id": [],
    "snippet": [],
}
for document in documents:
    data["llama_index_metadata"].append(document.metadata)
    data["llama_index_doc_id"].append(document.doc_id)
    data["contents"].append(document.text)
    # "snippet" is read straight from the document metadata, so a document
    # without that key raises KeyError here.
    data["snippet"].append(document.metadata["snippet"])
df = pd.DataFrame.from_dict(data)

## Save the parquet file

In [None]:
# Write the table as a single parquet file directly inside output_folder
# (output_folder already ends with "/", so plain concatenation yields a valid path).
df.to_parquet(output_folder+'out.parquet')

## Print the output

In [None]:
df

## Run DPK Doc ID

In [None]:
from dpk_doc_id.transform_python import DocID
# Run the DPK Doc ID transform on the parquet data in output_folder and write
# the result to the "docid" sub-folder.
# NOTE(review): presumably adds a hash id column ("document_id") computed from
# "contents" and an integer id column ("int_id_column") numbered from 5 — confirm.
DocID(
    input_folder=output_folder,
    output_folder=output_folder + "docid",
    doc_id_doc_column="contents",
    doc_id_hash_column="document_id",
    doc_id_int_column="int_id_column",
    doc_id_start_id=5,
).transform()


## Run DPK fdedup

In [None]:
from dpk_fdedup.transform_python import Fdedup
# Run DPK fuzzy dedup on the Doc ID output and write to the "fdedup" sub-folder.
# Documents are identified by the integer id column produced by the Doc ID step.
# The settings satisfy num_bands * num_minhashes_per_band == num_permutations
# (14 * 8 == 112). NOTE(review): presumably a requirement of the transform — confirm.
Fdedup(
    input_folder=output_folder + "docid",
    output_folder=output_folder + "fdedup",
    contents_column="contents",
    document_id_column="int_id_column",
    num_permutations=112,
    num_bands=14,
    num_minhashes_per_band=8,
    operation_mode="filter_duplicates",
).transform()

In [None]:
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning in the later embedding steps — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!wget -O 'my_utils.py'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'

In [None]:
from my_utils import read_parquet_files_as_df

# Load the fdedup result from its 'cleaned' sub-folder using the helper from my_utils.
# NOTE(review): the helper presumably concatenates every parquet file in the folder — confirm.
output_df = read_parquet_files_as_df(output_folder+'fdedup/cleaned')

# Last expression of the cell: rendered as the cell output.
output_df

## Apply DPK Doc quality

In [None]:
from dpk_doc_quality.transform_python import DocQuality
# Run the DPK Doc Quality transform on the deduplicated documents and write the
# result to the "docq" sub-folder. Later cells read the "docq_contain_bad_word"
# column from this output.
DocQuality(
    input_folder=output_folder + "fdedup/cleaned",
    output_folder=output_folder + 'docq',
    docq_text_lang="en",
    docq_doc_content_column="contents",
).transform()


## Print the output

In [None]:
from my_utils import read_parquet_files_as_df

# Lift pandas' display limits so no column or row is elided in the output below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

output_df = read_parquet_files_as_df(output_folder+'docq')

# A notebook renders only the LAST expression of a cell, so a bare expression
# here would be evaluated and thrown away. display() renders it explicitly.
display(output_df[['contents','docq_contain_bad_word']])

# Full table, rendered as the cell's regular output.
output_df

## Apply DPK Filtering

In [None]:
# remove rows with bad words
from dpk_filter.transform_python import Filter
# Run the DPK Filter transform on the Doc Quality output and write the result
# to the "filter" sub-folder. The single criterion keeps only the rows whose
# docq_contain_bad_word flag is false.
Filter(
    input_folder=output_folder + 'docq',
    output_folder=output_folder + 'filter',
    filter_criteria_list=["NOT docq_contain_bad_word"],
).transform()

## Print the output

In [None]:
from my_utils import read_parquet_files_as_df

# Lift pandas' display limits so no column or row is elided in the output below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

output_df = read_parquet_files_as_df(output_folder+'filter')

# A notebook renders only the LAST expression of a cell, so a bare expression
# here would be evaluated and thrown away. display() renders it explicitly.
display(output_df[['contents','docq_contain_bad_word']])

# Full table, rendered as the cell's regular output.
output_df

## Convert parquet rows back to llama-index Documents

In [None]:
from llama_index.core import Document as LIDocument
from llama_index.core import Document, SummaryIndex
from llama_index.core.node_parser import SimpleNodeParser

# Reload the filtered table and turn every row back into a llama-index Document,
# restoring the id, text and metadata that were stored when the emails were exported.
output_df = read_parquet_files_as_df(output_folder+'filter')

docs = [
    LIDocument(
        doc_id=record["llama_index_doc_id"],
        text=record["contents"],
        metadata=record["llama_index_metadata"],
    )
    for record in output_df.to_records()
]

## Create the llama-index nodes

In [None]:
# Build a node parser from its default settings and derive nodes from the
# documents rebuilt in the previous cell.
# NOTE(review): chunking behaviour depends on the SimpleNodeParser defaults — confirm if it matters.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(docs)
# Report how many nodes were produced.
num=len(nodes)
print(f"number of nodes: {num}")

## Convert llama-index nodes to rows in parquet table

In [None]:
# convert llama-index nodes to rows of a pandas DataFrame
import pandas as pd

# Column-oriented buffer: one row per node, holding the node text and the
# metadata attached to that same node.
data = {
    "contents": [],
    "llama_index_metadata": [],
}
for node in nodes:
    # Take the metadata from the node itself. Reading it from any variable
    # defined in an earlier cell would give every row the same metadata,
    # whichever email each node actually came from.
    data["llama_index_metadata"].append(node.metadata)
    data["contents"].append(node.text)
df = pd.DataFrame.from_dict(data)

## Print the output

In [None]:
df

## Save the output to parquet file

In [None]:
# Folder that receives the node table. output_folder already ends with "/", so
# the sub-folder is appended without another separator.
nodes_folder = output_folder + 'nodes/'

# Start from an empty folder so files from a previous run are not left behind.
shutil.rmtree(nodes_folder, ignore_errors=True)
print (f"✅ Cleared {nodes_folder} directory")
# exist_ok=True: rmtree above ignores errors, so the folder may still exist.
os.makedirs(nodes_folder, exist_ok=True)
df.to_parquet(nodes_folder + 'out.parquet')

### Apply DPK text encoding

In [None]:
%pip install -qq -U ipywidgets

In [None]:
from dpk_text_encoder.transform_python import TextEncoder
from data_processing.utils import GB

# Name of the embedding model; the vector-search cells further down load the
# same model so that queries and stored rows are embedded consistently.
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

# Run the DPK text encoder on the node table and write the result to the
# "encoding" sub-folder.
x = TextEncoder(
    input_folder=output_folder + 'nodes',
    output_folder=output_folder + 'encoding',
    text_encoder_model_name=EMBEDDING_MODEL,
).transform()

In [None]:
from my_utils import read_parquet_files_as_df

# Load the text-encoder output from the 'encoding' sub-folder.
output_df = read_parquet_files_as_df(output_folder+'encoding')

# .shape is a (rows, columns) tuple.
print ("Output data dimensions (rows x columns)= ", output_df.shape)

# Last expression of the cell: rendered as the cell output.
output_df

## Load Processed Data into Vector Database
### ref: https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb

### Load Parquet Data

In [None]:
import pandas as pd
import glob

# Announce the source folder, then load the encoded rows that will be inserted
# into the vector database.
print ('Loading data from : ', output_folder+'encoding')

data_df = read_parquet_files_as_df(output_folder+'encoding')

# shape[0] is the number of rows.
print (f"\nTotal number of rows = {data_df.shape[0]}")

In [None]:
## Shape the data

# The vector size is read off the first row's embedding; it is used as the
# collection dimension when the Milvus collection is created.
EMBEDDING_LENGTH = len(data_df.iloc[0]['embeddings'])
print ('embedding length: ', EMBEDDING_LENGTH)

# Keep only the three columns needed for insertion, renaming "contents" to
# "text" and "llama_index_metadata" to "metadata".
data = {
    "embeddings": data_df['embeddings'].tolist(),
    "text": data_df["contents"].tolist(),
    "metadata": data_df["llama_index_metadata"].tolist(),
}
df_new = pd.DataFrame(data)

In [None]:
# Build the list of records to insert, one dict per row.
# The 'embeddings' column is renamed to 'vector' to match the default schema.
data_list = [
    {"vector": vector, "text": text, "metadata": metadata}
    for vector, text, metadata in zip(
        df_new['embeddings'], df_new['text'], df_new["metadata"]
    )
]
data_list

### Connect to Vector Database

In [None]:
%pip install -qq -U pymilvus
%pip install -qq pymilvus[model]

In [None]:
# Connection target and collection name used by all Milvus cells below.
DB_URI = './rag_1_dpk.db'  # For embedded instance
# DB_URI = 'http://localhost:19530'  # For Docker instance
COLLECTION_NAME = 'test'

In [None]:
from pymilvus import MilvusClient

# Open the client against DB_URI (a local file path for the embedded instance,
# or a server URL — see the configuration cell).
milvus_client = MilvusClient(DB_URI)

print ("✅ Connected to Milvus instance:", DB_URI)

### Create A Collection

In [None]:
# Drop any collection left over from a previous run so the cell can be re-run
# and always starts from an empty collection.
if milvus_client.has_collection(collection_name=COLLECTION_NAME):
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)
    print ('✅ Cleared collection :', COLLECTION_NAME)


# Create the collection with the vector dimension measured from the encoded data.
# NOTE(review): auto_id=True presumably makes Milvus generate the primary keys,
# which is why the inserted records carry no id field — confirm.
milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=EMBEDDING_LENGTH,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
    auto_id=True
)
print ("✅ Created collection :", COLLECTION_NAME)

In [None]:
# Insert all prepared records (vector, text, metadata) into the collection.
res = milvus_client.insert(collection_name=COLLECTION_NAME, data=data_list)

# The insert result exposes the number of inserted rows under 'insert_count'.
print('inserted # rows', res['insert_count'])

# Last expression of the cell: the collection statistics are rendered as output.
milvus_client.get_collection_stats(COLLECTION_NAME)

## Do vector_search.
### ref: https://github.com/IBM/data-prep-kit/blob/765d7afdb39a7c44987453b6c80b9888f3ac0574/examples/notebooks/rag/rag_1C_vector_search.ipynb

## Setup Embeddings

In [None]:
## Option 1 - use sentence transformers directly

# NOTE: the HF_ENDPOINT assignment below is ACTIVE, not commented out, so this
# cell always sets the Hugging Face endpoint to the hf-mirror.com mirror.
# If https://huggingface.co/ is reachable from your network and you prefer the
# official endpoint, comment out or delete the os.environ line.
# NOTE(review): the variable is presumably only honoured if set before the
# Hugging Face libraries are first imported in this kernel — confirm.
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from sentence_transformers import SentenceTransformer

# Load the same model that produced the stored embeddings.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

def get_embeddings (str):
    """Embed text with the module-level sentence-transformers model.

    Args:
        str: the text to embed; it is passed unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns when called with
        ``normalize_embeddings=True``.
    """
    return embedding_model.encode(str, normalize_embeddings=True)

In [None]:
## Option 2 - Milvus model
from pymilvus import model

# Alternative (disabled): Milvus' default embedding function.
# embedding_fn = model.DefaultEmbeddingFunction()

## initialize the SentenceTransformerEmbeddingFunction
# Wraps the same EMBEDDING_MODEL as Option 1; it is called with a list of texts
# (see the test cell that follows).
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name = EMBEDDING_MODEL,
    device='cpu' # this will work on all devices (KIS)
)

In [None]:
# Test Embeddings
# Embed the same sample text through both options and print the vector length
# and the first five components of each, so the two can be compared by eye.
text = 'Paris 2024 Olympics'
embeddings = get_embeddings(text)
print ('sentence transformer : embeddings len =', len(embeddings))
print ('sentence transformer : embeddings[:5] = ', embeddings[:5])

# The Milvus wrapper takes a list of texts and returns one vector per text,
# hence the [0] when reading the result for the single input.
embeddings = embedding_fn([text])
print ('milvus model wrapper : embeddings len =', len(embeddings[0]))
print ('milvus model wrapper  : embeddings[:5] = ', embeddings[0][:5])

## Do A Vector Search

In [None]:
import random


## helper function to perform vector search
def do_vector_search(query, limit=5):
    """Embed a query string and search the Milvus collection for the nearest rows.

    Args:
        query: text to search for; embedded with ``get_embeddings``.
        limit: maximum number of hits to return for the query. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The result of ``milvus_client.search``: one list of hits per query
        vector. A single query vector is sent, so the hits are in ``results[0]``.
    """
    # Option 1 - using sentence transformers. The client expects a list of
    # query vectors, hence the wrapping list.
    query_vectors = [get_embeddings(query)]
    # query_vectors = embedding_fn([query])  # using Milvus model

    return milvus_client.search(
        collection_name=COLLECTION_NAME,  # target collection
        data=query_vectors,  # query vectors
        limit=limit,  # number of returned entities
        output_fields=["text"],  # specifies fields to be returned
    )
## ----

def print_search_results(results):
    """Print the hits of the first query in a search result.

    Args:
        results: search result indexed by query; only ``results[0]`` is read.

    Prints the number of hits, then each hit under a numbered separator line
    followed by a blank line.
    """
    hits = results[0]
    print('num results : ', len(hits))

    for position, hit in enumerate(hits, start=1):
        print(f'------ result {position} --------')
        print(hit)
        print()

In [None]:
# Example search: embed the query, fetch the closest stored rows and print them.
query = "Security alert"

results = do_vector_search (query)
print_search_results(results)