In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re
import nltk
from nltk.corpus import stopwords
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)
context = ''' CHRIST (Deemed to be University) was born out of the educational
vision of St Kuriakose Elias Chavara, an educationalist and social reformer of
the nineteenth century in South India. He founded the first Catholic
indigenous congregation, Carmelites of Mary Immaculate (CMI), in 1831
which administers CHRIST (Deemed to be University). “CHRIST (Deemed to
be University) was established as ‘Christ College’ in 1969. It undertook path-
breaking initiatives in Indian higher education with the introduction of
innovative and modern curricula, insistence on academic discipline,
imparting of Holistic Education and adoption of global higher education
practices with the support of creative and dedicated staff.” '''

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize words
    words = text.split()
    
    # Remove stopwords (optional)
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Join words back into a single string
    processed_text = ' '.join(words)
    
    return processed_text
context_split = text_splitter.create_documents([preprocess_text(context)])

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [4]:
context_split

[Document(metadata={'start_index': 0}, page_content='christ deemed university born educational vision'),
 Document(metadata={'start_index': 30}, page_content='educational vision st kuriakose elias chavara'),
 Document(metadata={'start_index': 62}, page_content='elias chavara educationalist social reformer'),
 Document(metadata={'start_index': 91}, page_content='social reformer nineteenth century south india'),
 Document(metadata={'start_index': 118}, page_content='century south india founded first catholic'),
 Document(metadata={'start_index': 146}, page_content='first catholic indigenous congregation carmelites'),
 Document(metadata={'start_index': 185}, page_content='carmelites mary immaculate cmi administers christ'),
 Document(metadata={'start_index': 216}, page_content='administers christ deemed university christ'),
 Document(metadata={'start_index': 242}, page_content='university christ deemed university established'),
 Document(metadata={'start_index': 278}, page_content='establ

In [5]:
chunks = text_splitter.split_text(preprocess_text(context))
print(chunks)

['christ deemed university born educational vision', 'educational vision st kuriakose elias chavara', 'elias chavara educationalist social reformer', 'social reformer nineteenth century south india', 'century south india founded first catholic', 'first catholic indigenous congregation carmelites', 'carmelites mary immaculate cmi administers christ', 'administers christ deemed university christ', 'university christ deemed university established', 'established christ college undertook path', 'undertook path breaking initiatives indian higher', 'indian higher education introduction innovative', 'innovative modern curricula insistence academic', 'insistence academic discipline imparting holistic', 'imparting holistic education adoption global', 'adoption global higher education practices', 'education practices support creative dedicated', 'creative dedicated staff']


[nltk_data] Downloading package stopwords to /home/kelvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device},
)

  embed_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
2025-03-30 08:53:34.641209: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743324814.756053   18282 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743324814.780018   18282 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743324814.974788   18282 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743324814.974853   18282 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once

In [7]:
embeddings = embed_model.embed_documents(chunks)
print('lenght',len(embeddings))
print('dimensionality',embeddings[0])

lenght 18
dimensionality [0.08701148629188538, 0.06722182035446167, -0.022495025768876076, -0.011229769326746464, 0.03059849515557289, 0.0006962335901334882, 0.01364214438945055, -0.005227210931479931, 0.04084157571196556, 0.07418236881494522, 0.04943130910396576, -0.03290313109755516, -0.0488317646086216, -0.034486349672079086, -0.05321263149380684, -0.010268495418131351, -0.09371199458837509, -0.015773264691233635, 0.049736376851797104, -0.05655002221465111, -0.047613516449928284, 0.047575995326042175, -0.014775598421692848, -0.03463368862867355, 0.05689002573490143, 0.11817744374275208, 0.08308535069227219, -0.04827636852860451, 0.04178490489721298, -0.03138696029782295, 0.00635608471930027, -0.010772302746772766, 0.005325792357325554, -0.02653210237622261, -0.004914780613034964, 0.06823436915874481, 0.0792388990521431, 0.08523501455783844, 0.025733236223459244, -0.022617699578404427, -0.0360519103705883, 0.027239978313446045, 0.006809340324252844, 0.0073181940242648125, 0.083108283

In [None]:
import chromadb
from langchain.  import Chroma
chroma_client = chromadb.Client()
vector_db = Chroma(persist_directory="./chroma_db", embedding_function=embed_model, client=chroma_client)


  vector_db = Chroma(persist_directory="./chroma_db", embedding_function=embed_model, client=chroma_client)


In [15]:
vector_db.add_texts(chunks)

['fd2ec927-3cb3-456f-a6e8-19d60647fe7f',
 'c2244f79-7707-4560-b453-762d61c32c15',
 'cbdb94ea-4a9b-4cb6-b3c3-c56af4f235f3',
 '1e6c988f-000d-4f4e-8145-5dccc790f165',
 '40771d9f-c73a-4fc0-9fea-012eee1b27fe',
 '871573b3-ba79-48d4-a3d9-260b1abd404f',
 'eb61a52c-719c-466b-a0b9-08c135e5ea60',
 '9cd04a08-5127-4204-ab23-9664b5073dbb',
 'e61c73c5-27b2-4cb0-aa7b-1c747d505bb3',
 '38c57428-c32e-4a97-8607-559f0f28aa09',
 '51e71f2a-00d9-497d-ac00-865c47a16d66',
 '914d2603-e2ca-4bc5-a5a1-7927b3e9c9d4',
 'c0b5ec7b-bd7f-4ba9-8899-5fccedc0537c',
 '53557406-0fcb-414f-8018-aeade9b5adb0',
 '7665175d-415b-448e-9427-4ec7542201ae',
 'be36cb13-0fac-4186-9647-ae6854589b47',
 'a0bfffe0-15db-4a92-bc0f-c65c627f2cac',
 'ba4c214a-d1dd-49e3-ba2d-48ef060fef38']

In [21]:

questions = [
    'Who was the educational visionary behind CHRIST (Deemed to be University)?',
    'What congregation did St Kuriakose Elias Chavara establish in 1831?',
    'In which year was CHRIST (Deemed to be University) originally established as "Christ College"?',
    'What were some of the path-breaking initiatives taken by the university in Indian higher education?',
    'How did the university ensure the adoption of global higher education practices?'
]
answer = ""
for question in questions:
    results = vector_db.similarity_search(question, k=6)
    print(f"Question: {question}")
    print("Results:")
    for i, result in enumerate(results):
        # print(f"{i+1}. {result.page_content}")
        answer += result.page_content
    print(f"Answer: {answer}")
    answer = ""
    print("\n")




Question: Who was the educational visionary behind CHRIST (Deemed to be University)?
Results:
Answer: christ deemed university born educational visionuniversity christ deemed university establishedadministers christ deemed university christestablished christ college undertook patheducational vision st kuriakose elias chavarainnovative modern curricula insistence academic


Question: What congregation did St Kuriakose Elias Chavara establish in 1831?
Results:
Answer: educational vision st kuriakose elias chavaraelias chavara educationalist social reformerfirst catholic indigenous congregation carmelitescentury south india founded first catholicadministers christ deemed university christuniversity christ deemed university established


Question: In which year was CHRIST (Deemed to be University) originally established as "Christ College"?
Results:
Answer: university christ deemed university establishedestablished christ college undertook pathadministers christ deemed university christchr

In [None]:

questions = [
    'Who was the educational visionary behind CHRIST (Deemed to be University)?',
    'What congregation did St Kuriakose Elias Chavara establish in 1831?',
    'In which year was CHRIST (Deemed to be University) originally established as "Christ College"?',
    'What were some of the path-breaking initiatives taken by the university in Indian higher education?',
    'How did the university ensure the adoption of global higher education practices?'
]
for question in questions:
    results = collection.query(
        query_texts=[question],  # Chroma will embed this for you
        n_results=5  # how many results to return
    )
    print(f"Question: {question}")
    print("Results:")
    for i, result in enumerate(results['documents'][0]):
        print(f"{i+1}. {result}")
    print("\n")



In [26]:
from langchain.chains import RetrievalQA
from langchain_community.llms.ollama import Ollama
llm = Ollama(
    model="mistral", 
)

# Set up Retrieval-based QA
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_db.as_retriever())


In [27]:
response = llm.invoke("hello, how are you?")
print(response)

 I am just a computer program, so I don't have feelings or emotions. How can I help you today?

Computers like me exist to assist humans by processing information quickly and accurately, but remember that I don't have personal experiences or a sense of humor like a real person would. If you have any questions or need help with something, feel free to ask!

How can I assist you today?


In [None]:
for question in questions:
    print(question)
    answer = qa_chain.run(question)
    print(answer)
    print("\n")

Who was the educational visionary behind CHRIST (Deemed to be University)?


  answer = qa_chain.run(question)


 The educational visionary behind CHRIST (Deemed to be University) is not explicitly stated in the provided context. However, since it was "born" or conceptualized by Christ and the university was established under his name, it can be inferred that Christ might have been a significant individual associated with the University. To get an accurate answer, further research would be required to find more information about this individual.


What congregation did St Kuriakose Elias Chavara establish in 1831?
 St Kuriakose Elias Chavara established the first Catholic indigenous congregation, which was the Carmelites. However, it should be noted that the specific congregation he founded is typically referred to as the Carmelites of Mary Immaculate (CMI), not the Carmelites in the traditional sense. This congregation was founded in 1831 but gained official recognition from the Church much later in 1855, making it one of the earliest indigenous Catholic religious congregations in South India.



In [5]:
from transformers import pipeline
qa_pipe = pipeline("question-answering",model="deepset/roberta-base-squad2")

for question in questions:
    print(question)
    answer = qa_pipe(question=question,context=context)
    print(answer)
    print("\n")

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /deepset/roberta-base-squad2/resolve/main/config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))"), '(Request ID: fa507423-306f-4304-b078-ecc3050ca958)')

In [4]:
%pip install tf-keras

Note: you may need to restart the kernel to use updated packages.
