In [1]:
import os
import sys 
import logging 

''' 
Load files from parent directory (Python Script)
'''
# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.insert(0, parent_dir)
# from elastic_helpers import ESBulkIndexer, ESQueryMaker
# from embedding_model import EmbeddingModel
# from llamaindex_processor import LlamaIndexProcessor
# from nltk_processor import NLTKProcessor
# from chunker import Chunker
# from elastic_config import BASIC_CONFIG
# from llm import LLMProcessor
# from elastic_helpers import ESBulkIndexer
# from elastic_config import BASIC_CONFIG
# sys.path.pop(0)

''' 
Jupyter Notebook
'''
# Get the current working directory
current_dir = os.getcwd()

# Get the parent directory
parent_dir = os.path.dirname(current_dir)

# Add the parent directory to sys.path
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now you can import your modules
from elastic_helpers import ESBulkIndexer, ESQueryMaker
from embedding_model import EmbeddingModel
from llamaindex_processor import LlamaIndexProcessor
from nltk_processor import NLTKProcessor
from chunker import Chunker
from elastic_config import BASIC_CONFIG
from llm import LLMProcessor
from elastic_helpers import ESBulkIndexer
from elastic_config import BASIC_CONFIG

# Remove the parent directory from sys.path if you want to clean up
sys.path.pop(0)

# Goal

Achieve a summarized document profile
Achieve entity and relationship extraction

1. Experiment with textrank for key words, phrases, sentences
2. Experiment with Topic Modeling (LSA, LDA)
3. Experiment with Dependancy parsing and role handling 
4. Implement Semantic Chunking and Combine with Luhn's Algorithm for ranking sentences


# Textrank for extracting key phrases and sentences

In [34]:
import networkx as nx
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from itertools import combinations
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

def get_word_pos(word):
    # Tokenize the word (this step is necessary even for a single word)
    tokens = word_tokenize(word)
    
    # Perform part-of-speech tagging
    tagged = pos_tag(tokens)
    
    # Return the tag (we're assuming there's only one word)
    return tagged[0][1]

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return 'a'
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN

def textrank_phrases(text, top_n=5, phrase_length=2, mode='phrase'):
    lemmatizer = WordNetLemmatizer()

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Tokenize sentences into words and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    words = [[lemmatizer.lemmatize(word, pos=get_wordnet_pos(get_word_pos(word))) for word in sentence if word.isalnum() and word not in stop_words] for sentence in words]

    # Part-of-speech tagging
    tagged = [pos_tag(sentence) for sentence in words]

    # Extract nouns and adjectives
    if mode == 'phrase':
        keywords = [[word for word, pos in sentence if pos.startswith('NN') or pos.startswith('JJ')] for sentence in tagged]
    else:  # mode == 'sentence'
        keywords = [' '.join(sentence) for sentence in words]

    # Generate phrases
    if mode == 'phrase':
        phrases = []
        for sentence in keywords:
            phrases.extend([' '.join(phrase) for phrase in zip(*[sentence[i:] for i in range(phrase_length)])])
    else:
        phrases = keywords

    # Build the graph
    graph = nx.Graph()
    if mode == 'phrase':
        # For phrases, use co-occurrence within sentences
        for sentence_phrases in [phrases[i:i+len(keywords[j])-phrase_length+1] for j, i in enumerate([sum(len(k)-phrase_length+1 for k in keywords[:j]) for j in range(len(keywords))])]:
            for pair in combinations(sentence_phrases, 2):
                if graph.has_edge(*pair):
                    graph[pair[0]][pair[1]]['weight'] += 1
                else:
                    graph.add_edge(*pair, weight=1)
    else:
        # For sentences, use similarity between sentences
        for pair in combinations(range(len(phrases)), 2):
            similarity = len(set(phrases[pair[0]].split()) & set(phrases[pair[1]].split())) / \
                         (len(set(phrases[pair[0]].split()) | set(phrases[pair[1]].split())) + 1e-6)
            if similarity > 0:
                graph.add_edge(pair[0], pair[1], weight=similarity)

    # Apply PageRank
    scores = nx.pagerank(graph)

    # Sort by score and return top n
    sorted_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    if mode == 'phrase':
        return [item for item, score in sorted_items[:top_n]]
    else:
        return [sentences[idx] for idx, score in sorted_items[:top_n]]

# Example usage
text=''' 
Expand our customer base by acquiring new customers. Through Elastic Cloud, we provide the fastest and easiest way to get started with a
free trial. However, there is no free subscription tier in Elastic Cloud. Self-managed users can easily download our software directly from our
website and access many features free of charge, which also facilitates adoption. Our sales and marketing team conducts campaigns to drive
further awareness and adoption within the user community. As a result, many of our sales prospects are already familiar with our technology prior
to entering into a commercial relationship with us. Additionally, we leverage our network of partners to drive awareness and expand our sales and
marketing reach to target new customers. We will continue to engage our community and our partners to drive awareness and to invest in our
sales and marketing team to grow our customer base.
• Expand within our existing customer base through new use cases and larger deployments. We view initial success with our products as a
path to drive expansion to new use cases and projects and larger deployments within organizations. We often enter an organization through a
single developer or a small team for an initial project or use case with an objective to quickly solve a technical challenge or business problem.
Because of the rapid success with our products, knowledge of Elastic often spreads within an organization to new teams of developers, architects,
IT operations personnel, security personnel, and senior executives. We will continue to invest in helping users and customers be successful with
our products.
• Extend our product leadership through continued investment in our technology. We will continue to invest in our products and services to
extend into new use cases, industries, geographies, and customers. We regularly deliver new and enhanced capabilities to our customers through
regular releases, to which everyone has access based on our subscription model. Our technology investments within the Elastic Stack include
foundational capabilities as well as solution enhancements for our target use cases.
• Expand our strategic and regional partnerships. Our partners assist us in driving awareness of Elastic and our products, using the Elastic Stack
to solve customer pain points, and extending our reach in geographic areas and verticals where we do not have a formal sales presence. We have a
diverse range of partners and we will continue to pursue partnerships to further the development of the Elastic Stack and our customer reach.
• Selectively pursue strategic acquisitions. Since inception, we have selectively pursued strategic acquisitions to drive product and market
expansion. The focus of our most recent acquisitions has been to enhance the technology underlying our Security and Observability offerings. We
intend to continue to pursue acquisitions selectively.
'''

print("Top 10 phrases:")
top_phrases = textrank_phrases(text, top_n=10, phrase_length=3, mode='phrase')
for i, phrase in enumerate(top_phrases, 1):
    print(f"{i}. {phrase}")

print("\nTop 5 sentences:")
top_sentences = textrank_phrases(text, top_n=5, mode='sentence')
for i, sentence in enumerate(top_sentences, 1):
    print(f"{i}. {sentence}")

Top 10 phrases:
1. new use case
2. partner drive awareness
3. sale marketing team
4. customer base new
5. view initial success
6. initial success product
7. success product path
8. product path drive
9. path drive expansion
10. drive expansion new

Top 5 sentences:
1. • Expand within our existing customer base through new use cases and larger deployments.
2. We will continue to invest in our products and services to
extend into new use cases, industries, geographies, and customers.
3. We will continue to engage our community and our partners to drive awareness and to invest in our
sales and marketing team to grow our customer base.
4. Additionally, we leverage our network of partners to drive awareness and expand our sales and
marketing reach to target new customers.
5. Our partners assist us in driving awareness of Elastic and our products, using the Elastic Stack
to solve customer pain points, and extending our reach in geographic areas and verticals where we do not have a formal sal

[nltk_data] Downloading package punkt to /Users/han/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/han/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/han/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/han/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Topic Modeling

In [22]:
def process_ner_output(ner_output):
    entities = []
    current_entity = ""
    
    for item in ner_output:
        if item['entity'].startswith('B-'):
            if current_entity:
                entities.append(current_entity.strip())
            current_entity = item['word']
        elif item['entity'].startswith('I-'):
            if item['word'].startswith('##'):
                current_entity += item['word'][2:]  # Remove '##' prefix
            else:
                current_entity += ' ' + item['word']
    
    if current_entity:
        entities.append(current_entity.strip())
    
    # Remove duplicates and sort
    unique_entities = sorted(set(entities))
    
    return unique_entities

In [32]:
test=''' 
Expand our customer base by acquiring new customers. Through Elastic Cloud, we provide the fastest and easiest way to get started with a
free trial. However, there is no free subscription tier in Elastic Cloud. Self-managed users can easily download our software directly from our
website and access many features free of charge, which also facilitates adoption. Our sales and marketing team conducts campaigns to drive
further awareness and adoption within the user community. As a result, many of our sales prospects are already familiar with our technology prior
to entering into a commercial relationship with us. Additionally, we leverage our network of partners to drive awareness and expand our sales and
marketing reach to target new customers. We will continue to engage our community and our partners to drive awareness and to invest in our
sales and marketing team to grow our customer base.
• Expand within our existing customer base through new use cases and larger deployments. We view initial success with our products as a
path to drive expansion to new use cases and projects and larger deployments within organizations. We often enter an organization through a
single developer or a small team for an initial project or use case with an objective to quickly solve a technical challenge or business problem.
Because of the rapid success with our products, knowledge of Elastic often spreads within an organization to new teams of developers, architects,
IT operations personnel, security personnel, and senior executives. We will continue to invest in helping users and customers be successful with
our products.
• Extend our product leadership through continued investment in our technology. We will continue to invest in our products and services to
extend into new use cases, industries, geographies, and customers. We regularly deliver new and enhanced capabilities to our customers through
regular releases, to which everyone has access based on our subscription model. Our technology investments within the Elastic Stack include
foundational capabilities as well as solution enhancements for our target use cases.
• Expand our strategic and regional partnerships. Our partners assist us in driving awareness of Elastic and our products, using the Elastic Stack
to solve customer pain points, and extending our reach in geographic areas and verticals where we do not have a formal sales presence. We have a
diverse range of partners and we will continue to pursue partnerships to further the development of the Elastic Stack and our customer reach.
• Selectively pursue strategic acquisitions. Since inception, we have selectively pursued strategic acquisitions to drive product and market
expansion. The focus of our most recent acquisitions has been to enhance the technology underlying our Security and Observability offerings. We
intend to continue to pursue acquisitions selectively.
'''
pipe(test)

[{'entity': 'B-ORG',
  'score': 0.8070242,
  'index': 13,
  'word': 'El',
  'start': 63,
  'end': 65},
 {'entity': 'B-ORG',
  'score': 0.79614115,
  'index': 14,
  'word': '##astic',
  'start': 65,
  'end': 70},
 {'entity': 'I-ORG',
  'score': 0.67202395,
  'index': 15,
  'word': 'Cloud',
  'start': 71,
  'end': 76},
 {'entity': 'B-ORG',
  'score': 0.8062292,
  'index': 43,
  'word': 'El',
  'start': 198,
  'end': 200},
 {'entity': 'B-ORG',
  'score': 0.8110485,
  'index': 44,
  'word': '##astic',
  'start': 200,
  'end': 205},
 {'entity': 'I-ORG',
  'score': 0.4768625,
  'index': 45,
  'word': 'Cloud',
  'start': 206,
  'end': 211}]

In [24]:
process_ner_output(pipe(test))

['##astic Cloud', 'El']

# Step One: Initialize services, Initial Setup

In [4]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

# Initialize Embedding Model
HUGGINGFACE_EMBEDDING_MODEL = os.environ.get('HUGGINGFACE_EMBEDDING_MODEL')
embedder=EmbeddingModel(model_name=HUGGINGFACE_EMBEDDING_MODEL)

# Initialize Document Processor
docprocessor=DocumentProcessor()

# Initialize Elasticsearch
ELASTIC_CLOUD_ID = os.environ.get('ELASTIC_CLOUD_ID')
ELASTIC_USERNAME = os.environ.get('ELASTIC_USERNAME')
ELASTIC_PASSWORD = os.environ.get('ELASTIC_PASSWORD')
ELASTIC_CLOUD_AUTH = (ELASTIC_USERNAME, ELASTIC_PASSWORD)
es_bulk_indexer = ESBulkIndexer(cloud_id=ELASTIC_CLOUD_ID, credentials=ELASTIC_CLOUD_AUTH)
es_query_maker = ESQueryMaker(cloud_id=ELASTIC_CLOUD_ID, credentials=ELASTIC_CLOUD_AUTH)

# Define Index Name
index_name="test"

# Initialize LLM
llm = LLMProcessor()

Using MPS


INFO:elastic_helpers:Connection created for cloud_id: Training:YXNpYS1zb3V0aGVhc3QxLmdjcC5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOTVmMWQ4MTZiNWEzNGM3NTljNjEwMWIyODU4ZGRmZGEkZmE0NzUyN2UwN2NkNGUwYThiOGFkMGViZDc2ZDE4NGY=
INFO:elastic_helpers:Connection created for cloud_id: Training:YXNpYS1zb3V0aGVhc3QxLmdjcC5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOTVmMWQ4MTZiNWEzNGM3NTljNjEwMWIyODU4ZGRmZGEkZmE0NzUyN2UwN2NkNGUwYThiOGFkMGViZDc2ZDE4NGY=
INFO:llm:LLMProcessor initialized with model: gpt-4o


## Look into the use of semantic chunking
https://docs.llamaindex.ai/en/stable/examples/node_parsers/semantic_chunking/

In [5]:

documents=docprocessor.load_documents('./documents/')
documents=[dict(doc_obj) for doc_obj in documents]

In [8]:
documents[52]

{'id_': 'b5907fc5-5b95-4527-9a39-8cd9e8816a74',
 'embedding': None,
 'metadata': {'page_label': '53',
  'file_name': 'elastic_annual_report_2023.pdf',
  'file_path': '/Users/han/Desktop/Projects/basics/graphRag/documents/elastic_annual_report_2023.pdf',
  'file_type': 'application/pdf',
  'file_size': 1492237,
  'creation_date': '2024-07-18',
  'last_modified_date': '2024-07-18'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': 'Table of Contents\nPART II\nItem 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities.\nMarket Information for Ordinary Shares\nOur ordinary shares began trading on the NYSE under the symbol “ESTC” on October 5, 2018. Prior to that date,

# DEPRECATED - DOCRAG STUFF BELOW

# Step Two: Process some documents

In [3]:

documents=docprocessor.load_documents('./documents/')
documents=[dict(doc_obj) for doc_obj in documents]
chunked_documents=docprocessor.chunk_documents(documents)



In [4]:
chunked_documents[0]

{'id_': 'aa2ff1ea-7f3b-4ec4-be74-e88261bfaf88',
 'chunk': 'Optimizing Search Engines using Clickthrough Data Thorsten Joachims Cornell University Department of Computer Science Ithaca, NY 14853 USA tj@cs.cornell.edu ABSTRACT This paper presents an approach to automatically optimiz- ing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous ap- proaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them diﬃcult and ex- pensive to apply. The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be record

In [7]:
chunked_documents[1]

{'id_': '536090e8-e993-4869-8a0c-c7bfc754df27',
 'chunk': 'Google in terms of retrieval quality after only a couple of hundred training examples. 1. INTRODUCTION Which WWW page(s) does a user actually want to re- trieve when he types some keywords into a search engine? There are typically thousands of pages that contain these words, but the user is interested in a much smaller subset. One could simply ask the user for feedback. If we knew the set of pages actually relevant to the user’s query, we could use this as training data for optimizing (and even personal- izing) the retrieval function. Unfortunately, experience shows that users are only rarely willing to give explicit feedback. However, this paper argues that suﬃcient information is already hidden in the logﬁles of WWW search engines. Since major search engines re- Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distri

# Step Three: Embed your documents

In [20]:
embedded_docs=embedder.embed_documents(chunked_documents)

Embedding documents: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s]


In [21]:
embedded_docs[0]

{'id_': '5edcf2e4-36c2-4e57-8231-ed33053400a0',
 'chunk': 'Optimizing Search Engines using Clickthrough Data Thorsten Joachims Cornell University Department of Computer Science Ithaca, NY 14853 USA tj@cs.cornell.edu ABSTRACT This paper presents an approach to automatically optimiz- ing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous ap- proaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them diﬃcult and ex- pensive to apply. The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be record

In [22]:
embedded_docs[1]

{'id_': '536090e8-e993-4869-8a0c-c7bfc754df27',
 'chunk': 'Google in terms of retrieval quality after only a couple of hundred training examples. 1. INTRODUCTION Which WWW page(s) does a user actually want to re- trieve when he types some keywords into a search engine? There are typically thousands of pages that contain these words, but the user is interested in a much smaller subset. One could simply ask the user for feedback. If we knew the set of pages actually relevant to the user’s query, we could use this as training data for optimizing (and even personal- izing) the retrieval function. Unfortunately, experience shows that users are only rarely willing to give explicit feedback. However, this paper argues that suﬃcient information is already hidden in the logﬁles of WWW search engines. Since major search engines re- Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distri

# Step Four: Index your documents in Elastic Search

In [8]:

index_exists = es_bulk_indexer.check_index_existence(index_name=index_name)
if not index_exists:
    logger.info(f"Creating new index: {index_name}")
    es_bulk_indexer.create_es_index(es_configuration=BASIC_CONFIG, index_name=index_name)

success_count = es_bulk_indexer.bulk_upload_documents(
    index_name=index_name, 
    documents=embedded_docs, 
    id_col='id_',
    batch_size=32
)

# Step Five: Make a Query


In [4]:
def get_context(index_name, query_text, fields, num_candidates=100, num_results=20, text_field="chunk", embedding_field="embedding"):
    embedding=embedder.get_embeddings([query_text])

    results, search_body = es_query_maker.hybrid_vector_search(
        index_name=index_name,
        query_text=query_text,
        query_vector=embedding[0][0][0],
        text_field=text_field,
        vector_field=embedding_field,
        num_candidates=num_candidates,
        num_results=num_results
    )
    context_docs=['\n\n'.join([field+":\n\n"+j['_source'][field] for field in fields]) for j in results['hits']['hits']]
    context_docs.reverse()
    return context_docs, search_body


In [9]:
query_text="Why would we use an SVM for learning ranking functions?"
'''
Generate an effective elastic search query from the query text
'''
elastic_search_query=await llm.generate_query(query_text)
fields=['chunk']
'''
Parse the generated query into an Elastic search query, and use it to search your elastic index
'''
context, search_body=get_context(index_name, elastic_search_query, fields)
'''
Pass the context and original query to Elastic and get back a generated answer.
'''
answer=await llm.basic_qa(query=query_text, context=context)

In [6]:
print(answer)

According to the provided context, there are several reasons for using a Support Vector Machine (SVM) for learning ranking functions, particularly in information retrieval scenarios. Here are the key points:

### Handling Ranking Problems
1. **Compatible with Partial Preference Data**:
   - The context states, "The Ranking SVM can be trained with partial preference data," which is useful for scenarios like recommender systems or clickthrough data from search engines, where absolute relevance judgments are not available.

### Optimization Framework
2. **Empirical Risk Minimization**:
   - SVMs can be adapted to minimize empirical risk on ranking problems. For instance, the provided text mentions, "the problem of learning a ranking function over a finite domain in terms of empirical risk minimization," and the Ranking SVM algorithm is presented to handle this.
   
### Convex Optimization
3. **Convex Programming**:
   - The optimization problem associated with ranking SVM is convex and ha

## GPT-4o generates this string

In [7]:
print(elastic_search_query)

SVM OR support vector machine OR learning ranking functions OR rank learning OR machine learning ranking OR ranking algorithms OR SVM ranking OR support vector machine ranking OR ranking SVM OR ranking models OR learning to rank


## Our parser in elastic_helpers.py turns the string into a nice ElasticSearch Query

In [13]:
search_body

{'knn': {'field': 'embedding',
  'query_vector': [0.20159578323364258,
   0.13381077349185944,
   -0.07545126974582672,
   0.08422087132930756,
   0.21016280353069305,
   0.5059270858764648,
   -0.21361607313156128,
   0.43185871839523315,
   0.29655686020851135,
   -0.0798199325799942,
   0.1231250986456871,
   -0.17215044796466827,
   0.07292104512453079,
   0.277828186750412,
   0.3762294352054596,
   0.19149670004844666,
   -0.5201199650764465,
   -0.13477034866809845,
   0.34850385785102844,
   0.35124170780181885,
   0.500167727470398,
   -0.27604854106903076,
   0.45695948600769043,
   -0.14235427975654602,
   0.20682553946971893,
   0.9613552093505859,
   -0.24632678925991058,
   -0.40995097160339355,
   0.005000472068786621,
   -1.2488564252853394,
   0.24307891726493835,
   0.12241682410240173,
   -0.05846923589706421,
   0.05866442620754242,
   0.044377997517585754,
   0.20587921142578125,
   -0.1291496455669403,
   0.2728227376937866,
   -0.31446218490600586,
   0.518176794