# Combine Data and embed 

Sources:  
1. TinyTales
2. Wikipedia, and
3. Tilak's Kaggle dataset

In [120]:
import os
import statistics
import json
from pathlib import Path
import pandas as pd
import tiktoken
from dotenv import load_dotenv
load_dotenv('../.env') 

True

In [46]:
## For embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

## Loaders
from langchain.document_loaders import DataFrameLoader

## Combine Data

In [47]:
# Base folder for all text sources; each source lives in its own sub-folder.
directory = Path("../text")
directory_tilak = directory / "KaggleTilak" / "books"
directory_tinytales = directory / "TinyTales"
directory_wikipedia = directory / "Wikipedia"

### Read files into dataframes

In [48]:
# Read the semicolon-delimited Kaggle export and show which columns it has.
kaggle_data = pd.read_csv(
    directory_tilak.joinpath('complete_text_lines.csv'),
    sep=";",
)
kaggle_data.columns

Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'source'],
      dtype='object')

In [49]:
# Read the semicolon-delimited TinyTales stories and show which columns it has.
tinytales_data = pd.read_csv(
    directory_tinytales.joinpath('mahabharata_tiny_tales_stories.csv'),
    sep=";",
)
tinytales_data.columns

Index(['text', 'section_number', 'title', 'chapter_number', 'chapter_name',
       'source'],
      dtype='object')

In [50]:
## Read the Wikipedia parva summaries and drop the unnecessary chapter-range
## columns in the same expression, so no in-place mutation is needed.
wikipedia_data = pd.read_csv(
    directory_wikipedia.joinpath('wikipedia_parva_summary.csv'),
    sep=";",
).drop(columns=['start_chapter', 'end_chapter'])
wikipedia_data.columns

Index(['book', 'source', 'title', 'book_number', 'description', 'text'], dtype='object')

### Combine the dataframes into one big dataframe

In [51]:
# Stack the three sources row-wise. Columns missing from a source are filled
# with NaN. ignore_index=True gives the result a fresh 0..n-1 index; without
# it every source restarts at 0, leaving duplicate labels that make
# label-based lookups (e.g. df_combined.loc[0]) return several rows.
df_combined = pd.concat(
    [kaggle_data, tinytales_data, wikipedia_data],
    ignore_index=True,
)
print(
    "Kaggle data dims",  kaggle_data.shape, "\n",
    "TinyTales data dims", tinytales_data.shape, "\n",
    "Wikipedia data dims", wikipedia_data.shape, "\n",
    "Final data dims", df_combined.shape)

print("Final data columns \n", df_combined.columns)

Kaggle data dims (2376, 8) 
 TinyTales data dims (200, 6) 
 Wikipedia data dims (19, 6) 
 Final data dims (2595, 11)
Final data columns 
 Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'source', 'chapter_number', 'book',
       'description'],
      dtype='object')


### Calculate tokens for each text row

In [52]:

# Single source of truth for the tokenizer: change `encoder_name` here and the
# token counts below follow. The encoder object is built once and reused.
encoder_name = "cl100k_base"
encoding = tiktoken.get_encoding(encoder_name)
def num_tokens_from_string(string: str) -> int:
    """Return the number of tokens in ``string``.

    Uses the module-level ``encoding`` (built from ``encoder_name``) rather
    than looking the encoder up again by a hard-coded name on every call.

    Parameters
    ----------
    string : str
        Text to tokenize.

    Returns
    -------
    int
        Length of the token sequence produced by ``encoding.encode``.
    """
    return len(encoding.encode(string))

## Calculate the text tokens per row
df_combined['num_text_tokens'] = df_combined['text'].map(num_tokens_from_string)

# Summary statistics over the new per-row token counts.
print("Average tokens per row", statistics.mean(df_combined['num_text_tokens']))
print("Total number of tokens", sum(df_combined['num_text_tokens']))

Average tokens per row 167.22851637764933
Total number of tokens 433958


### Write the final dataframe into a csv file

In [53]:

# Persist the combined frame (semicolon-delimited, without the index column)
# next to the source folders, then show the resulting column dtypes.
df_combined.to_csv(
    directory.joinpath('combined.csv'),
    sep=";",
    index=False,
)
print(df_combined.dtypes)

book_number        float64
book_name           object
chapter_name        object
title               object
commentary          object
text                object
section_number     float64
source              object
chapter_number     float64
book                object
description         object
num_text_tokens      int64
dtype: object


## Embed and persist into PG Vector store

### Load Vector store

In [54]:
# OpenAI embedding model used to vectorise the documents.
text_embedding_model = "text-embedding-ada-002"

# Build the Postgres connection string. Database, user and password must be
# present in the environment (a missing one raises KeyError). Host and port
# fall back to the previous hard-coded values ("localhost", 5432) but can be
# overridden with PGVECTOR_HOST / PGVECTOR_PORT, so the notebook also works
# against a non-local database. The port is passed as an int.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    host=os.environ.get("PGVECTOR_HOST", "localhost"),
    port=int(os.environ.get("PGVECTOR_PORT", "5432")),
    database=os.environ["PGVECTOR_DATABASE"],
    user=os.environ["PGVECTOR_USER"],
    password=os.environ["PGVECTOR_PASSWORD"],
)

# Name of the pgvector collection that holds the embeddings.
COLLECTION_NAME = "mh_embeddings_summaries"

embedding = OpenAIEmbeddings(model=text_embedding_model)

# Handle to the vector store; this only configures the store object.
# NOTE(review): no cell visible here adds `docs` to the store — confirm where
# the embeddings are actually written.
store = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=embedding,
)

### Load the dataframe into a loader

In [55]:
# Wrap the combined frame in a LangChain loader, using the "text" column as
# the page content of each document.
# NOTE(review): the remaining columns are presumably carried as document
# metadata — confirm against the DataFrameLoader documentation.
loader = DataFrameLoader(df_combined, page_content_column="text")

In [56]:
# Materialise the documents from the loader.
docs = loader.load()

In [321]:
# docs[0].page_content

## Named Entity recognition 

## Helper functions

In [308]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


In [315]:

## Helper function to combine tokens into names
def combine_tokens(ner_results):
    """Merge sub-word NER pieces into whole names.

    A piece whose ``'word'`` starts with the marker "▁" opens a new name; the
    name takes the ``'entity'`` label of that opening piece. Following pieces
    without the marker are appended to the current name, except for a small
    set of punctuation / possessive pieces that are dropped.

    Parameters
    ----------
    ner_results : iterable of dict
        Items with at least the keys ``'word'`` and ``'entity'``.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'entity': ...}`` per assembled name, in input
        order. Empty input yields an empty list, and empty names are never
        emitted.
    """
    # Continuation pieces that must not become part of a name.
    skipped_pieces = {',', "'", ".", 's', ";", "(", ")"}

    entities = []
    name = ""
    entity = None
    for res in ner_results:
        word = res['word']
        if not word:
            # Nothing to add; also avoids indexing into an empty string.
            continue
        if word.startswith("▁"):
            # A new word begins: flush the name collected so far.
            if name:
                entities.append({'name': name, 'entity': entity})
            name = word[1:]
            entity = res['entity']
        elif word not in skipped_pieces:
            if entity is None:
                # The stream started mid-word (no marker seen yet): label the
                # name with this first piece instead of failing on an
                # unassigned entity.
                entity = res['entity']
            name += word

    ## append the last name (only if one was actually collected)
    if name:
        entities.append({'name': name, 'entity': entity})
    ## Return
    return entities

## Get named entities
def recognise_named_entities(text, pipeline_model):
    """Apply ``pipeline_model`` to ``text`` and return its output unchanged."""
    return pipeline_model(text)


In [316]:
# text = docs[100].page_content
# print(text)

text = """
"Thou hast heard, O Raja, of the greatly powerful men of vast exertions,
spoken of by Vyasa and the wise Narada; men born of great royal families,
resplendent with worthy qualities, versed in the science of celestial
arms, and in glory emblems of Indra; men who having conquered the world
by justice and performed sacrifices with fit offerings (to the
Brahmanas), obtained renown in this world and at last succumbed to the
sway of time. Such were Saivya; the valiant Maharatha; Srinjaya, great
amongst conquerors. Suhotra; Rantideva, and Kakshivanta, great in glory;
Valhika, Damana, Saryati, Ajita, and Nala; Viswamitra the destroyer of
foes; Amvarisha, great in strength; Marutta, Manu, Ikshaku, Gaya, and
Bharata; Rama the son of Dasaratha; Sasavindu, and Bhagiratha;
Kritavirya, the greatly fortunate, and Janamejaya too; and Yayati of good
deeds who performed sacrifices, being assisted therein by the celestials
themselves, and by whose sacrificial altars and stakes this earth with
her habited and uninhabited regions hath been marked all over. These
twenty-four Rajas were formerly spoken of by the celestial Rishi Narada
unto Saivya when much afflicted for the loss of his children. Besides
these, other Rajas had gone before, still more powerful than they, mighty
charioteers noble in mind, and resplendent with every worthy quality.
These were Puru, Kuru, Yadu, Sura and Viswasrawa of great glory; Anuha,
Yuvanaswu, Kakutstha, Vikrami, and Raghu; Vijava, Virihorta, Anga, Bhava,
Sweta, and Vripadguru; Usinara, Sata-ratha, Kanka, Duliduha, and Druma;
Dambhodbhava, Para, Vena, Sagara, Sankriti, and Nimi; Ajeya, Parasu,
Pundra, Sambhu, and holy Deva-Vridha; Devahuya, Supratika, and
Vrihad-ratha; Mahatsaha, Vinitatma, Sukratu, and Nala, the king of the
Nishadas; Satyavrata, Santabhaya, Sumitra, and the chief Subala;
Janujangha, Anaranya, Arka, Priyabhritya, Chuchi-vrata, Balabandhu,
Nirmardda, Ketusringa, and Brhidbala; Dhrishtaketu, Brihatketu,
Driptaketu, and Niramaya; Abikshit, Chapala, Dhurta, Kritbandhu, and
Dridhe-shudhi; Mahapurana-sambhavya, Pratyanga, Paraha and Sruti. These,
O chief, and other Rajas, we hear enumerated by hundreds and by
thousands, and still others by millions, princes of great power and
wisdom, quitting very abundant enjoyments met death as thy sons have
done! Their heavenly deeds, valour, and generosity, their magnanimity,
faith, truth, purity, simplicity and mercy, are published to the world in
the records of former times by sacred bards of great learning. Though
endued with every noble virtue, these have yielded up their lives. Thy
sons were malevolent, inflamed with passion, avaricious, and of very
evil-disposition. Thou art versed in the Sastras, O Bharata, and art
intelligent and wise; they never sink under misfortunes whose
understandings are guided by the Sastras. Thou art acquainted, O prince,
with the lenity and severity of fate; this anxiety therefore for the
safety of thy children is unbecoming. Moreover, it behoveth thee not to
grieve for that which must happen: for who can avert, by his wisdom, the
decrees of fate? No one can leave the way marked out for him by
Providence. Existence and non-existence, pleasure and pain all have Time
for their root. Time createth all things and Time destroyeth all
creatures. It is Time that burneth creatures and it is Time that
extinguisheth the fire. All states, the good and the evil, in the three
worlds, are caused by Time. Time cutteth short all things and createth
them anew. Time alone is awake when all things are asleep: indeed, Time
is incapable of being overcome. Time passeth over all things without
being retarded. Knowing, as thou dost, that all things past and future
and all that exist at the present moment, are the offspring of Time, it
behoveth thee not to throw away thy reason.'
"""

## Roberta Named Entity

In [317]:
## Roberta based NER

# Load tokenizer and token-classification weights from the same checkpoint,
# wrap them in an NER pipeline, and report the model size.
# NOTE(review): the "mn-" prefix in the checkpoint id may indicate a
# Mongolian-language fine-tune — confirm it is suitable for this English text.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    "2rtl3/mn-xlm-roberta-base-named-entity"
)
roberta_model = AutoModelForTokenClassification.from_pretrained(
    "2rtl3/mn-xlm-roberta-base-named-entity"
)
nlp_roberta = pipeline(
    "ner",
    model=roberta_model,
    tokenizer=roberta_tokenizer,
)
roberta_model.num_parameters()



277456901

In [318]:
# Tag the sample passage with the RoBERTa pipeline, merge the sub-word pieces
# into names, and tabulate the result.
ner_results = recognise_named_entities(
    text=text,
    pipeline_model=nlp_roberta,
)
entities = combine_tokens(ner_results)
df_roberta = pd.DataFrame(data=entities)

# print(text)
# for entity in entities:
#     print(entity)
# for res in ner_results:
#     print(res)


## ALBERT NER model (ArBert/albert-base-v2-finetuned-ner)

In [319]:
# Load tokenizer and token-classification weights from the same checkpoint,
# wrap them in a token-classification pipeline, and report the model size.
arbert_tokenizer = AutoTokenizer.from_pretrained(
    "ArBert/albert-base-v2-finetuned-ner"
)
arbert_model = AutoModelForTokenClassification.from_pretrained(
    "ArBert/albert-base-v2-finetuned-ner"
)
nlp_arbert = pipeline(
    "token-classification",
    model=arbert_model,
    tokenizer=arbert_tokenizer,
)
arbert_model.num_parameters()

11099913

In [320]:
# Tag the sample passage with the ALBERT pipeline, merge the sub-word pieces
# into names, and keep only rows whose label is not 'LABEL_0'.
ner_results = recognise_named_entities(
    text=text,
    pipeline_model=nlp_arbert,
)
entities = combine_tokens(ner_results)
df_arbert = pd.DataFrame(data=entities).loc[
    lambda frame: frame['entity'] != 'LABEL_0'
]

# print(text)
# for res in ner_results:
#     print(res)
# for entity in entities:
#     print(entity)


## IndicBert Model

In [274]:
# from transformers import AutoModel, AutoTokenizer
# import torch

# tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
# model = AutoModel.from_pretrained('ai4bharat/indic-bert')

# inputs = tokenizer("After Abhimanyu's marriage, there was royal festival and everyone was pleased", return_tensors="pt")

# with torch.no_grad():
#     outputs = model(**inputs)

# outputs.pooler_output.squeeze()
