In [38]:
import os
import glob
from PyPDF2 import PdfReader
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import re
import torch
import openai

In [40]:
api_key = "<key>"
openai.api_key = api_key

In [2]:
def filepathextractor(filetype="*.pdf"):
    file_paths = []
    for root, dirs, files in os.walk("."):
        for path in glob.glob(os.path.join(root, filetype)):
            file_paths.append(os.path.abspath(path))
    return file_paths

In [3]:
filepaths = filepathextractor()

In [4]:
filepaths

['/app/Download/Indian Society British Empire.pdf',
 '/app/Download/Portuguese India.pdf',
 '/app/Download/Vijayanagara.pdf',
 '/app/Download/Rural Bengal.pdf',
 '/app/Download/Commercial Enterprise India.pdf',
 '/app/Download/Architecture Mughal  India.pdf',
 '/app/Download/Economy Modern India.pdf',
 '/app/Download/Ideologies of the Raj.pdf',
 '/app/Download/Women in Modern India.pdf',
 '/app/Download/Mughal Empire.pdf',
 '/app/Download/Bengal_India.pdf',
 '/app/Download/Socio-religious reform movements in British India.pdf',
 '/app/Download/Sikhs of Punjab.pdf',
 '/app/Download/Marathas 1600 1818.pdf',
 '/app/Download/Caste_Society_Politics_India.pdf',
 '/app/Download/Science, Technology and Medicine in Colonial India.pdf',
 '/app/Download/Architecture and Art of Southern India.pdf']

In [5]:
# Keeping only unicode characters
def keep_unicode(text):
    unicode_pattern = re.compile('[^\u0000-\uD7FF\uE000-\uFFFF]', flags=re.UNICODE)
    return unicode_pattern.sub('', text)

In [6]:
# Extracting text from pdfs and creating the dataframe
def textdfcreator(filepaths):
    df = pd.DataFrame(columns=['title', 'text', 'path'])
    no_text_paths = []  # initialize list to store file paths with no extracted text
    print(f"Processed 0/{len(filepaths)} PDFs", end="\r")
    
    for i, fpath in enumerate(filepaths):
        main_text = ''
        reader = PdfReader(fpath)
        number_of_pages = len(reader.pages)
        for j in range(number_of_pages):
            page = reader.pages[j]
            text = page.extract_text()
            if text is not None:  # check if text was extracted
                main_text += text
            else:
                print(f"No text was extracted from {fpath}")
                no_text_paths.append(fpath)  # add file path to list of paths with no extracted text
        df_row = {'title': os.path.basename(fpath),
                  'text': main_text,
                  'path': fpath}
        df = pd.concat([df, pd.DataFrame(df_row, index=[0])], ignore_index=True)
        print(f"Processed {i+1}/{len(filepaths)} PDFs", end="\r")
    df['text'] = df['text'].apply(keep_unicode)
    print(f"\nThe following files had no extracted text: {no_text_paths}")
    df = df.drop(df[df['text'].isnull()].index)
    return df

`%store` – Show list of all variables and their current values
`%store spam bar` – Store the current value of the variables spam and bar to disk
`%store -d spam` – Remove the variable and its value from storage
`%store -z` – Remove all variables from storage
`%store -r` – Refresh all variables, aliases and directory history from store (overwrite current values)
`%store -r spam bar` – Refresh specified variables and aliases from store (delete current variables)
`%store foo >a.txt` – Store value of foo to new file a.txt
`%store foo >>a.txt` – Append value of foo to file a.txt

In [7]:
# Run only for new set of documents
# df = textdfcreator(filepaths)

In [8]:
%store

Stored variables and their in-db values:
df             ->                                                 ti


In [9]:
%store -r

In [10]:
df.shape

(17, 3)

In [11]:
df.head()

Unnamed: 0,title,text,path
0,Indian Society British Empire.pdf,THE NEW CAMBRIDGE HISTORY\nOF INDIA\nIndian so...,/app/Download/Indian Society British Empire.pdf
1,Portuguese India.pdf,THE NEW CAMBRIDGE HISTORY\nOF INDIA\nThe Portu...,/app/Download/Portuguese India.pdf
2,Vijayanagara.pdf,THE NEW CAMBRIDGE HISTORY\nOF INDIA\nVijayanag...,/app/Download/Vijayanagara.pdf
3,Rural Bengal.pdf,The last two decades have witnessed 'the retur...,/app/Download/Rural Bengal.pdf
4,Commercial Enterprise India.pdf,European traders first appeared in India at th...,/app/Download/Commercial Enterprise India.pdf


In [11]:
df.to_csv('history_of_india.csv')

In [12]:
df.columns

Index(['title', 'text', 'path'], dtype='object')

In [13]:
# compile regular expressions
sentence_end_re = re.compile('[.?!][\'"\)\]]*$|[.?!]\n')
sentence_start_re = re.compile('^[A-Z][a-z]*|[\'"\(\[\n]')

# define a function to split text into chunks of 190 words each
def split_text(text):
    words = text.split()
    chunks = []
    start_idx = 0
    while start_idx < len(words):
        end_idx = min(start_idx + 190, len(words))
        chunk = ' '.join(words[i] for i in range(start_idx, end_idx))
        # check if the chunk ends with a full sentence
        if sentence_end_re.search(chunk):
            chunks.append(chunk)
            start_idx = end_idx
        else:
            # if the chunk doesn't end with a full sentence, expand it to the next sentence
            while end_idx < len(words):
                if sentence_start_re.search(words[end_idx]):
                    chunk = ' '.join(words[i] for i in range(start_idx, end_idx+1))
                    chunks.append(chunk)
                    start_idx = end_idx + 1
                    break
                else:
                    end_idx += 1
            else:
                # if no sentence boundary is found, add the remaining words as a chunk
                chunk = ' '.join(words[i] for i in range(start_idx, end_idx))
                chunks.append(chunk)
                start_idx = end_idx
    return chunks

In [14]:
# apply the split_text function to the text column
df['text'] = df['text'].apply(split_text)

In [15]:
# flatten the list of chunks and remove leading/trailing whitespace characters and '\n'
df['text'] = df['text'].apply(lambda x: [s.strip().replace('\n', '') for s in x])

In [16]:
# create new rows with the split text
df = df.explode('text').reset_index(drop=True)

In [17]:
df.shape

(7510, 3)

In [18]:
# Dropping empty text rows
df = df.drop(df[df['text'].isnull()].index)

In [19]:
df.shape

(7509, 3)

In [20]:
df.to_csv('processed_history_of_india.csv')

In [25]:
sentences = df.text.to_list()

In [26]:
len(sentences)

7509

In [29]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [30]:
with torch.no_grad():
    embeddings = model.encode(sentences, device='cuda')

In [31]:
len(embeddings)

7509

In [33]:
df['embeddings'] = list(embeddings)

In [34]:
df.to_pickle('history_india.pkl.gz', compression='gzip')

In [35]:
df_final = df.copy()

In [36]:
%store df_final

Stored 'df_final' (DataFrame)


In [37]:
%store

Stored variables and their in-db values:
df                   ->                                                 ti
df_final             ->                                            title  


In [41]:
# Load pickle from disk
df = pd.read_pickle('history_india.pkl.gz', compression='gzip')

In [42]:
df.head()

Unnamed: 0,title,text,path,embeddings
0,Indian Society British Empire.pdf,THE NEW CAMBRIDGE HISTORY OF INDIA Indian soci...,/app/Download/Indian Society British Empire.pdf,"[-0.041346293, -0.06831048, 0.00954161, -0.028..."
1,Indian Society British Empire.pdf,four parts planned are as follows: I The Mugha...,/app/Download/Indian Society British Empire.pdf,"[-0.03494044, -0.0021976398, -0.0027277428, 0...."
2,Indian Society British Empire.pdf,C. A. Indian society and the making of the Bri...,/app/Download/Indian Society British Empire.pdf,"[-0.027762491, -0.038931817, -0.036458507, -0...."
3,Indian Society British Empire.pdf,The first age of colonialism in India 200 Glos...,/app/Download/Indian Society British Empire.pdf,"[-0.036183123, -0.07188498, 0.014481022, -0.04..."
4,Indian Society British Empire.pdf,"History was a great success, and it was follow...",/app/Download/Indian Society British Empire.pdf,"[-0.041959874, -0.10230567, -0.0008908011, -0...."


In [43]:
from chromadb.config import Settings
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="embedDB" # Optional, defaults to .chromadb/ in the current directory
))

Using embedded DuckDB with persistence: data will be stored in: embedDB


In [44]:
# Get a collection object from an existing collection, by name. If it doesn't exist, create it.
collection = client.get_or_create_collection(name="history_of_india")

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [45]:
# Preparing data for chromadb function
def prep_data(df, text_source_column = 'text', embeddings_source = 'embeddings', columns_meta=['title','path']):
    #generating docs
    docs = df[text_source_column].to_list()
    
    #Generating ids
    ids = df[text_source_column].index.to_list()
    ids = [f'id{i}' for i in ids]
    
    #generate embeddings list
    embeddings = df[embeddings_source].to_list()
    
    # convert the dataframe to a dictionary
    data_dict = df[columns_meta].to_dict('records')
    # create a list of dictionaries with only the columns you need
    metadata = [{ columns_meta[0]: row[columns_meta[0]], columns_meta[1]: row[columns_meta[1]] } for row in data_dict]
    return docs, embeddings, metadata, ids

In [46]:
docs, embeddings, metadata, ids = prep_data(df)

In [47]:
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metadata,
    ids=ids
)

In [None]:
client.persist()

In [49]:
client.heartbeat()

1683926171010016466000

In [50]:
# Function to search results
def v_search(query_text, num=5):
    # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
    collection = client.get_or_create_collection(name="history_of_india")
    resp = collection.query(
    query_texts=[query_text],
    n_results=num,  
    )
    rep = [''.join(sent) for sent in resp['documents']][0]
    return rep

In [74]:
question = "who was abdali"

In [78]:
resp = collection.query(
    query_texts=[question],
    n_results=2,  
)

In [79]:
rep = [''.join(sent) for sent in resp['documents']][0]

In [80]:
rep

"into idiomatic Urdu. The teachings of Shah Wali 'Ullah and the Delhi School of Islamic thought plus the doctrines of al-Wahhab of Sa'udi Arabia constituted a basic frame of reference for socio-religious movements among South Asian Muslims, as exempli\xad fied by the Fara'izis of Bengal. Shari'at 'Ullah, the founder of the Fara'izis, was born in 1781 in th e village of Shmail in eastern Bengal. He received his elementary education in Calcutta and Hughly. In 1799, at the age of eighteen, Shari'at 'Ullah left for Mecca. The first two years he studied under an emigrant Bengali, Maulana Murad, and for the next fourteen years became the student of the Hanafi scholar, Tahir Sombal. Shari'at 'Ullah was also initiated into the Qadiriyah order of Sufism during this period. In addition he spent two years at al-Azhar University in Cairo. When he returned to Bengal in 1818 as a scholar of Islamic law and philosophy,1 he began preaching, but soon returned to Mecca, where he obtained the formal perm

In [81]:
query = f"Summarize answer to following question: {question}, from following response: {rep}"
query

"Summarize answer to following question: who was abdali, from following response: into idiomatic Urdu. The teachings of Shah Wali 'Ullah and the Delhi School of Islamic thought plus the doctrines of al-Wahhab of Sa'udi Arabia constituted a basic frame of reference for socio-religious movements among South Asian Muslims, as exempli\xad fied by the Fara'izis of Bengal. Shari'at 'Ullah, the founder of the Fara'izis, was born in 1781 in th e village of Shmail in eastern Bengal. He received his elementary education in Calcutta and Hughly. In 1799, at the age of eighteen, Shari'at 'Ullah left for Mecca. The first two years he studied under an emigrant Bengali, Maulana Murad, and for the next fourteen years became the student of the Hanafi scholar, Tahir Sombal. Shari'at 'Ullah was also initiated into the Qadiriyah order of Sufism during this period. In addition he spent two years at al-Azhar University in Cairo. When he returned to Bengal in 1818 as a scholar of Islamic law and philosophy,1 

In [83]:
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": query}
  ]
)

In [84]:
print(completion.choices[0].message['content'])

The answer to the question is not mentioned in the provided response.
