# Information Retrieval - Testing on a subset

In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install gensim



In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

## One-page scraping

In [None]:
# Single chapter page (title=Deerghanjiviteeya_Adhyaya) used as the scraping target.
url = 'https://www.carakasamhitaonline.com/index.php?title=Deerghanjiviteeya_Adhyaya'

In [None]:
# Open the page over HTTP.
# NOTE(review): the response object is never explicitly closed in the visible cells.
page = urlopen(url)

In [None]:
# Read the whole response body (decoded to text in the next cell).
content = page.read()

In [None]:
# Decode the raw response as UTF-8 text.
html = content.decode('utf-8')
# Offsets of the opening and closing title tags (str.find gives -1 when absent).
html.find('<title>'), html.find('</title')

(95, 127)

In [None]:
# 95 is the '<title>' offset shown above and 7 is len('<title>'). The end index
# 119 is hard-coded and stops short of the closing-tag offset (127), so only the
# first word of the title is displayed.
html[95+7:119]

'Deerghanjiviteeya'

In [None]:
# Number of literal '<p>' opening tags in the raw HTML.
html.count('<p>')

1003

### Beautiful Soup

In [None]:
# Parse the page with BeautifulSoup using the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# print(soup.get_text())
# Page title as exposed by the parsed document.
soup.title.string

'Deerghanjiviteeya Adhyaya'

In [None]:
# Text content of every <p> element on the page, in document order.
texts = [paragraph.text for paragraph in soup.find_all('p')]

# **Cleaning**

#### Removing Sanskrit

In [None]:
def is_sanskrit(text):
    """Return True when ``text`` contains a character in U+0900-U+097F.

    That code-point range is the Unicode Devanagari block, which is used here
    as the marker for Sanskrit passages.
    """
    devanagari_match = re.search("[\u0900-\u097F]+", text, re.UNICODE)
    return devanagari_match is not None

In [None]:

def remove_sanskrit_elements(input_list):
    """Return a new list with the elements for which ``is_sanskrit`` is False.

    The relative order of the kept elements is preserved and ``input_list``
    itself is left untouched.
    """
    kept = []
    for candidate in input_list:
        if is_sanskrit(candidate):
            continue
        kept.append(candidate)
    return kept

In [None]:
# Drop every paragraph that is_sanskrit flags (i.e. contains Devanagari characters).
filtered = remove_sanskrit_elements(texts)

In [None]:
def has_pattern(text, pattern):
    """Report whether the regular expression ``pattern`` matches anywhere in ``text``."""
    found = re.search(pattern, text)
    return found is not None

def remove_strings_with_pattern(input_list, pattern):
    """Return a new list without the strings in which ``pattern`` is found.

    Matching is delegated to ``has_pattern``; the order of the remaining
    strings is preserved and ``input_list`` is not modified.
    """
    remaining = []
    for candidate in input_list:
        if has_pattern(candidate, pattern):
            continue
        remaining.append(candidate)
    return remaining

In [None]:
# Drop every paragraph that contains a literal pipe character ('|').
filtered2 = remove_strings_with_pattern(filtered, r"\|")

#### Removing spaces, stopwords

In [None]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords


In [None]:
def clean_sentence(sentence, stopwords=False):
  """Normalise a sentence to lowercase a-z words separated by single spaces.

  Args:
    sentence: Raw text to clean.
    stopwords: When true, the normalised text is additionally passed through
      ``remove_stopwords``.

  Returns:
    The lowercased text in which every character outside ``a-z`` has been
    treated as a separator. Before any stopword removal the text has no
    leading, trailing or repeated spaces.
  """
  sentence = re.sub(r'[^a-z]', ' ', sentence.lower())
  # Collapse whitespace *after* the substitution: replacing punctuation and
  # digits with spaces is what produces the stray and doubled spaces, so a
  # strip() done beforehand cannot remove them.
  sentence = ' '.join(sentence.split())
  if stopwords:
    sentence = remove_stopwords(sentence)
  return sentence
# Spot-check the cleaner (with stopword removal) on a single paragraph.
clean_sentence(filtered2[74], stopwords=True)

'urine buffalo provides relief piles edema disorders leading general enlargement abdomen udara urine elephant salty provides relief worms dermatoses kushtha useful treatment retention feces urine poisoning kapha disorders piles'

In [None]:
# Clean every remaining paragraph, with stopword removal enabled.
filtered3 = [clean_sentence(paragraph, True) for paragraph in filtered2]

In [None]:
# Number of paragraphs remaining after each filtering/cleaning stage.
len(texts),len(filtered), len(filtered2), len(filtered3)

(1004, 717, 149, 149)

In [None]:
def tokenize_text(text):
  """Clean ``text`` with stopword removal and split it into a list of word tokens."""
  return clean_sentence(text, True).split()

## Inverted Index

In [None]:
def inverted_index(words):
  """
  Map each distinct term in ``words`` to the list of positions at which it occurs.

  Positions are the 0-based indices into ``words``, in increasing order.
  """
  positions_by_term = {}
  for position, term in enumerate(words):
    if term not in positions_by_term:
      positions_by_term[term] = []
    positions_by_term[term].append(position)
  return positions_by_term

def inverted_index_add(inverted, doc_id, doc_index):
  """Merge one document's positional index into the corpus-wide index.

  For every word in ``doc_index`` the entry ``inverted[word][doc_id]`` is set
  to that word's position list. ``inverted`` is updated in place and returned.
  """
  for word in doc_index.keys():
    if word not in inverted:
      inverted[word] = {}
    inverted[word][doc_id] = doc_index[word]
  return inverted

In [None]:
# para_with_index: paragraph position in filtered3 -> list of its tokens.
para_with_index = {}
# inverted_doc_indexes: word -> {doc_id: [positions of the word in that paragraph]}.
inverted_doc_indexes = {}

doc_id = 0
for index, para in enumerate(filtered3):
  words = para.split()

  # Per-paragraph positional index, merged into the corpus-wide one.
  doc_index = inverted_index(words)
  inverted_index_add(inverted_doc_indexes, doc_id, doc_index)

  para_with_index[index] = words
  doc_id += 1

In [None]:
# Postings for one term: {doc_id: [positions of the term within that paragraph]}.
inverted_doc_indexes['piles']

{68: [24], 74: [4, 28], 75: [8], 85: [10]}

In [None]:
# Token at position 2 of paragraph 41.
para_with_index[41][2]

'cold'

In [None]:
# Uncleaned text of paragraph 85 (the list before clean_sentence was applied).
filtered2[85]

'The barks of Putika and Tilvaka are used for purgation therapy. Indications of bark of Krishnagandha are erysipelas, inflammation, piles, ringworm, abscess, nodules, dermatosis and gangrene (alaji). The wise physician should also acquire the knowledge of the above mentioned six plants (three with latex and three with bark) of their pacification actions. [117-118]\n'

### Bag of Words

In [None]:
from gensim import corpora

In [None]:
# Bag of words: split each cleaned paragraph into tokens, build the gensim
# dictionary from them, then convert every paragraph with doc2bow.
sentences = filtered3
sentence_words = [document.split() for document in sentences]
dictionary = corpora.Dictionary(sentence_words)
corpus = [dictionary.doc2bow(tokens) for tokens in sentence_words]
# for sent, embedding in zip(sentences, corpus):
#   print(sent)
#   print(embedding)


### TF-IDF

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidfvectoriser = TfidfVectorizer()
# tfidfvectoriser.fit(filtered3)
# tfidf_vectors = tfidfvectoriser.transform(filtered3)
# print(tfidf_vectors)

In [None]:
from collections import Counter

In [None]:
# Document frequency: the number of paragraphs each word occurs in. The inverted
# index maps word -> {doc_id: positions}, so the size of that inner dict is the
# count; no intermediate list is needed.
DF = {word: len(postings) for word, postings in inverted_doc_indexes.items()}

total_vocab_size = len(DF)
# Show the mapping itself; a bare print() only emits an empty line.
print(DF)

{'sutra': 6, 'sthana': 6, 'chapter': 13, 'longevity': 8, 'abstract': 1, 'title': 1, 'charak': 7, 'samhita': 5, 's': 9, 'deerghanjiviteeya': 2, 'adhyaya': 2, 'literally': 1, 'means': 4, 'symbolic': 2, 'implies': 3, 'ayurveda': 34, 'merely': 1, 'science': 3, 'life': 21, 'deliberates': 2, 'purpose': 5, 'long': 7, 'health': 14, 'achieving': 2, 'primary': 2, 'instincts': 2, 'human': 10, 'beings': 6, 'purushartha': 1, 'chatustaya': 1, 'virtue': 3, 'dharma': 2, 'wealth': 4, 'artha': 2, 'gratification': 3, 'kama': 2, 'emancipation': 6, 'moksha': 3, 'basic': 4, 'tenets': 5, 'scope': 3, 'explained': 3, 'introduces': 1, 'fundamental': 5, 'principles': 5, 'including': 5, 'definition': 5, 'objective': 2, 'concept': 5, 'sharira': 4, 'bodily': 2, 'dosha': 14, 'qualities': 9, 'manasa': 1, 'mental': 4, 'treatment': 10, 'theory': 3, 'similarity': 6, 'samanya': 12, 'dissimilarity': 6, 'vishesha': 11, 'pharmaco': 1, 'therapeutic': 10, 'aspects': 6, 'classification': 5, 'drugs': 33, 'list': 2, 'herbs': 8, 

In [None]:
# Token list of the first paragraph.
para_with_index[0]

In [None]:
# TF-IDF weight for every (doc_id, token) pair that occurs in the corpus.
tf_idf = {}
N = len(para_with_index)  # number of paragraphs (documents)
for doc_id, tokens in para_with_index.items():
  counter = Counter(tokens)

  # Walk the distinct tokens in sorted order as plain Python strings, so the
  # dictionary keys are ordinary str objects rather than numpy string scalars.
  for token in sorted(counter):
    # Sublinear term frequency: 1 + ln(raw count in this paragraph).
    tf = 1 + np.log(counter[token])
    # Tokens missing from DF count as appearing in zero documents.
    df = DF.get(token, 0)
    # Smoothed inverse document frequency; the +1 terms avoid division by zero.
    idf = np.log((N+1)/(df+1))

    tf_idf[doc_id, token] = tf*idf


### Ranked Search

In [None]:
def ranked_search(k, tf_idf, query):
  """Return the ids of the ``k`` documents that best match ``query``.

  Args:
    k: Maximum number of document ids to return.
    tf_idf: Mapping of ``(doc_id, token)`` to the token's weight in that document.
    query: Free-text query; it is tokenized with ``tokenize_text``.

  Returns:
    Document ids ordered by descending sum of the weights of the query tokens
    they contain. Documents containing none of the query tokens are omitted,
    so fewer than ``k`` ids may be returned.
  """
  # A set gives constant-time membership tests; the check runs once per
  # (doc_id, token) entry of tf_idf.
  query_tokens = set(tokenize_text(query))
  scores = {}
  for (doc_id, token), weight in tf_idf.items():
    if token in query_tokens:
      scores[doc_id] = scores.get(doc_id, 0) + weight
  # sorted() is stable, so equally scored documents keep their first-seen order.
  ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
  return [doc_id for doc_id, _ in ranked[:k]]

In [None]:
# Ask for up to 4 paragraph ids ranked against a sample query.
results = ranked_search(4, tf_idf, 'suffering from worms')

suffering worms


In [None]:
# Show the uncleaned paragraph text for each returned id.
[filtered2[i] for i in results]

['Urines have been prescribed as digestive stimulants, antidotes to poison and as vermicides. They are excellent for the treatment of persons suffering from pandu (anemia). [97.5]\n',
 'The urine of cow is slightly sweet, alleviates discordance of dosha. It cures worms, skin diseases (kushtha) and relieves itching. Its proper intake cures disorders of abdomen. [101]\n',
 'The urine of buffalo provides relief in piles, edema and disorders leading to general enlargement of abdomen (udara).  The urine of she-elephant is salty and provides relief in worms and dermatoses (kushtha).  It is also useful in the treatment of retention of feces and urine, poisoning, kapha disorders and piles. [102.5]\n']

## Euclidean Distance

In [334]:
# def Euclidian(question_vector, sentence_vector):
#   vec1 = question_vector.copy()
#   vec2 = sentence_vector.copy()
#   if len(vec1)<len(vec2): vec1,vec2 = vec2, vec1
#   vec2 = np.resize(vec2, (vec1.shape[0], vec1.shape[1]))
#   return np.linalg.norm(vec1-vec2)