## Context based Question Answering System

### Loading the context and question

In [34]:
contexts = "CHRIST (Deemed to be University) was born out of the educational vision of St Kuriakose Elias Chavara, an educationalist and social reformer of the nineteenth century in South India. He founded the first Catholic indigenous congregation, Carmelites of Mary Immaculate (CMI), in 1831 which administers CHRIST (Deemed to be University). CHRIST (Deemed to be University) was established as Christ College in 1969. It undertook path- breaking initiatives in Indian higher education with the introduction of innovative and modern curricula, insistence on academic discipline, imparting of Holistic Education and adoption of global higher education practices with the support of creative and dedicated staff."

### Preprocessing

In [35]:
contexts=contexts.lower()

In [36]:
import string
for punctuation in string.punctuation:
    contexts=contexts.replace(punctuation, '')

### Name Entity Recognition

In [31]:
import spacy
nlp = spacy.load("en_core_web_sm")
tokenized_sentences = []
ner_labels = []

for context in contexts:
    doc = nlp(context)
    tokenized_sentence = []
    labels = []
    for token in doc:
        tokenized_sentence.append(token.text)
        labels.append(token.ent_iob_ + "-" + token.ent_type_ if token.ent_iob_ != "O" else "O")
    tokenized_sentences.append(tokenized_sentence)
    ner_labels.append(labels)

label_to_index = {
    "O": 0,
    "B-PER": 1, "I-PER": 2,
    "B-ORG": 3, "I-ORG": 4,
    "B-DATE": 5, "I-DATE": 6,
    "B-GPE": 7, "I-GPE": 8,
    "B-ORDINAL": 9, "B-NORP":10,
    "B-CARDINAL":11
}
indexed_labels = [[label_to_index[label] for label in labels] for labels in ner_labels]

### Padding and Tokenizing

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tokenized_sentences)

# Padding sequences
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

### One hot encoding

In [33]:
num_classes = len(label_to_index)
labels_one_hot = np.eye(num_classes)[padded_labels]

### Building LSTM model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense

embedding_dim = 50
vocab_size = len(word_index) + 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(100, return_sequences=True),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(padded_sequences, labels_one_hot, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x31c7ab5d0>

### Predicting the answer

In [25]:
user_input = "What year was CHRIST university established?"

user_input_tokens = user_input.split()
user_input_sequences = tokenizer.texts_to_sequences([user_input_tokens])
user_input_padded = pad_sequences(user_input_sequences, maxlen=max_sequence_length)

predicted_labels_one_hot = model.predict(user_input_padded)
predicted_labels_indices = np.argmax(predicted_labels_one_hot, axis=-1)
index_to_label = {v: k for k, v in label_to_index.items()}
predicted_labels = [index_to_label[idx] for idx in predicted_labels_indices[0]]

answer = ""
for token, label in zip(user_input_tokens, predicted_labels):
    if label != 'O':
        answer += token + " "
corrected_answer = ''

for word in answer.split():

    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

print(corrected_answer)

 1969
