In [1]:
import numpy as np
import math
from bs4 import BeautifulSoup
import gensim
from urllib.request import urlopen
import re

### Load Data

In [21]:
def load_text_from_url(url):
    """Download an HTML page and return the text of its paragraphs.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with the text of every ``<p>`` element found in the page.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched, and
        ``UnicodeDecodeError`` when the body does not match its charset.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(url) as page:
        content = page.read()
        # Honour the charset declared by the server; fall back to UTF-8,
        # which is what this function always assumed before.
        charset = page.headers.get_content_charset() or 'utf-8'
    html = content.decode(charset)

    # Parse with the parser bundled with the standard library.
    soup = BeautifulSoup(html, 'html.parser')

    # Extract paragraphs only.
    return [paragraph.text for paragraph in soup.find_all('p')]

### Cleaning

In [9]:
def is_sanskrit(text, pattern=None):
    """Tell whether ``text`` contains at least one Devanagari character.

    Args:
        text: The string to inspect.
        pattern: Optional regular expression (string or compiled pattern)
            to search for instead of the default Devanagari range. This
            argument used to be accepted but ignored; it is now honoured.

    Returns:
        True when the pattern matches anywhere in ``text``, else False.
    """
    if pattern is None:
        # Unicode Devanagari block (U+0900-U+097F), the script the scraped
        # Sanskrit verses are written in.
        pattern = "[\u0900-\u097F]+"

    # re.search accepts both strings and compiled patterns and caches
    # compiled strings internally, so nothing is re-compiled per call.
    return re.search(pattern, text) is not None

def remove_sanskrit_elements(input_list):
    """Return a new list without the items that ``is_sanskrit`` flags.

    The input is not modified and the order of the kept items is preserved.
    """
    kept = []
    for item in input_list:
        if is_sanskrit(item):
            continue
        kept.append(item)
    return kept

def has_pattern(text, pattern):
    """Return True when the regular expression ``pattern`` matches
    anywhere inside ``text``, False otherwise.
    """
    match = re.search(pattern, text)
    return match is not None

def remove_strings_with_pattern(input_list, pattern):
    """Return a new list without the strings in which ``pattern`` is found.

    The input is not modified and the order of the kept items is preserved.
    """
    kept = []
    for item in input_list:
        if has_pattern(item, pattern):
            continue
        kept.append(item)
    return kept


In [24]:
def remove_unwanted(text):
    """Drop the paragraphs that should not be indexed.

    ``text`` is a list of paragraph strings. Paragraphs flagged by
    ``is_sanskrit`` are removed first, then any remaining paragraph that
    contains a literal ``|`` character.
    """
    without_sanskrit = remove_sanskrit_elements(text)
    return remove_strings_with_pattern(without_sanskrit, r"\|")

In [25]:
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
    """Normalise a sentence to lower-case ASCII letters separated by
    single spaces.

    Args:
        sentence: The raw string to clean.
        stopwords: When True, stopwords are additionally removed with
            gensim's ``remove_stopwords``.

    Returns:
        The cleaned sentence as a single string (not a list), with no
        leading, trailing or repeated spaces.
    """
    # Lower-case first so the [^a-z] class keeps every letter; everything
    # else (digits, punctuation, newlines) becomes a space.
    letters_only = re.sub(r'[^a-z]', ' ', sentence.lower())

    # Collapse whitespace after the substitution. Stripping before it, as
    # the code used to do, left padding wherever symbols were replaced.
    formatted = ' '.join(letters_only.split())

    if stopwords:
        formatted = remove_stopwords(formatted)
    return formatted

In [17]:
def tokenize_text(text):
    """Clean ``text`` with ``clean_sentence`` (stopword removal enabled)
    and split the result on whitespace.

    Intended for queries rather than for the dataset. Returns a list of
    word strings.
    """
    return clean_sentence(text, True).split()

### main()

In [36]:
# Page whose paragraphs form the searchable corpus.
url = 'https://www.carakasamhitaonline.com/index.php?title=Deerghanjiviteeya_Adhyaya'
# Raw paragraph strings scraped from the page.
texts = load_text_from_url(url)
# Paragraphs kept after dropping Devanagari ones and ones containing '|'.
# These are the strings shown to the user as search results.
filtered = remove_unwanted(texts)
# Cleaned, stopword-free version of each kept paragraph; formatted[i]
# corresponds to filtered[i], so both lists must stay the same length.
formatted = [clean_sentence(sentence, True) for sentence in filtered]

## FAISS

In [39]:
import faiss
from sentence_transformers import SentenceTransformer

In [40]:
# Load the sentence-embedding model by name. The progress bars captured
# below show its files being downloaded on this run.
model = SentenceTransformer('all-MiniLM-L12-v2')

.gitattributes: 100%|██████████████████████| 1.18k/1.18k [00:00<00:00, 1.89MB/s]
1_Pooling/config.json: 100%|████████████████████| 190/190 [00:00<00:00, 339kB/s]
README.md: 100%|███████████████████████████| 10.6k/10.6k [00:00<00:00, 5.78MB/s]
config.json: 100%|██████████████████████████████| 573/573 [00:00<00:00, 986kB/s]
config_sentence_transformers.json: 100%|████████| 116/116 [00:00<00:00, 211kB/s]
data_config.json: 100%|█████████████████████| 39.3k/39.3k [00:00<00:00, 193kB/s]
pytorch_model.bin: 100%|██████████████████████| 134M/134M [02:21<00:00, 943kB/s]
sentence_bert_config.json: 100%|█████████████| 53.0/53.0 [00:00<00:00, 94.2kB/s]
special_tokens_map.json: 100%|██████████████████| 112/112 [00:00<00:00, 212kB/s]
tokenizer.json: 100%|█████████████████████████| 466k/466k [00:00<00:00, 678kB/s]
tokenizer_config.json: 100%|████████████████████| 352/352 [00:00<00:00, 736kB/s]
train_script.py: 100%|█████████████████████| 13.2k/13.2k [00:00<00:00, 13.7MB/s]
vocab.txt: 100%|████████████

Create Embeddings

In [42]:
# Encode every cleaned paragraph in one batched call; row i is the
# embedding of formatted[i]. The earlier version called .append on a NumPy
# array, which has no such method and raised AttributeError.
# float32 is the dtype FAISS expects for vectors.
embeddings = np.asarray(model.encode(formatted), dtype=np.float32)

In [45]:
# Length of a single embedding vector, i.e. the dimension the index needs.
embeddings[0].shape[0]

384

In [54]:
# Create a flat L2 FAISS index sized to the length of one embedding vector.
index = faiss.IndexFlatL2(embeddings[0].shape[0])
# Add all paragraph embeddings as one array; the positions returned by a
# later search are used to index `filtered`, so insertion order matters.
index.add(np.array(embeddings))

Query the data

In [116]:
# Free-text question to look up in the indexed paragraphs.
query = "how to treat worms"
# NOTE(review): the corpus was embedded after clean_sentence(..., True) but
# the query is embedded raw. Uncomment to apply the same cleaning:
# query = clean_sentence(query, True)
# Wrap the single vector in a list so the search receives a 2-D array
# with one row.
query_encoding = np.array([model.encode(query)])
# Ask for the 6 nearest entries. I[0] holds their positions in the index;
# D presumably holds the matching distances - confirm against FAISS docs.
D, I = index.search(query_encoding, 6)

Most similar paragraph:

In [118]:
# Map the returned index positions back to the original, uncleaned
# paragraphs. FAISS fills missing neighbours with -1 when the index holds
# fewer vectors than were requested; skip those so a negative position does
# not silently select filtered[-1].
most_similar = [filtered[i] for i in I[0] if i >= 0]
# Show the closest paragraph.
most_similar[0]

'The urine of cow is slightly sweet, alleviates discordance of dosha. It cures worms, skin diseases (kushtha) and relieves itching. Its proper intake cures disorders of abdomen. [101]\n'