In [11]:
import json
import pandas as pd
import glob, os
import re
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Dataset

In [12]:
# # Read the XLSX file
# df = pd.read_excel("/content/dataset_mergedpdf_8th_std_ALL_SCIENCE.xlsx")

# # Save the DataFrame as a CSV file
# df.to_csv("/content/dataset_mergedpdf_8th_std_ALL_SCIENCE.csv", index=False)

In [13]:
# data = pd.read_csv("/content/dataset_mergedpdf_8th_std_ALL_SCIENCE.csv")
# data.head(1)

## Convert dataset to .csv files and merge files.

1.   Read all .xlsx files
2.   merge All csv files into one

In [14]:
final_csv_path = '/content/combined_dataset.csv'

In [15]:
import pandas as pd
import glob
def merge_csv_files(folder_path : str):
    # Path to the folder containing the .csv files
    # folder_path = '/content'

    # Use glob to get all .csv files in the folder
    file_paths = glob.glob(f"{folder_path}/*.csv")
    print(file_paths)
    if final_csv_path in file_paths:
        file_paths.remove(final_csv_path)
        os.remove(final_csv_path)
        print(f"File {final_csv_path} removed successfully.")

    # Initialize an empty list to store DataFrames
    data_frames = []

    # Loop through the file paths and read each file
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        data_frames.append(df)

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(data_frames, ignore_index=True)

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv('combined_dataset.csv', index=False)

    print("Combined dataset saved to 'combined_dataset.csv'.")

### split distractors array into multiple columns

In [16]:
import ast  # For literal evaluation of the list
def split_distractors(data: pd.DataFrame):
    # Assuming there are 3 distractors
    if 'Distractor' in data.columns:
        data['Distractor'] = data['Distractor'].apply(ast.literal_eval)
        data['Distractor1'] = data['Distractor'].apply(lambda x: x[0] if len(x) > 0 else None)
        data['Distractor2'] = data['Distractor'].apply(lambda x: x[1] if len(x) > 1 else None)
        data['Distractor3'] = data['Distractor'].apply(lambda x: x[2] if len(x) > 2 else None)
        data.drop(columns=['Distractor'], inplace=True)
    return data

### data cleaning like remove non-alphabetes

In [17]:
def remove_nonalphabetes(text: str):
    pattern = r"\d|[^\w\s]"   # Matches one or more characters that are not a-z or A-Z
    try:
        # Substitute matched characters with an empty string (removal)
        processed_text = re.sub(pattern, "", text)
    except:
        print("Error in data cleaning")
    return processed_text

In [18]:
folder_path = '/content'
file_paths = glob.glob(f"{folder_path}/*.xlsx")
if len(file_paths) > 0:
    for file_path in file_paths:
        try:
            new_file_path = f"{os.path.splitext(file_path)[0]}.csv"
            # Read the XLSX file
            df = pd.read_excel(file_path)

            df = split_distractors(df)
            df['Question'] = df['Question'].apply(remove_nonalphabetes)

            # Save the DataFrame as a CSV file
            df.to_csv(new_file_path, index=False)
        except:
            print(f"Error reading file or saving to csv format: {file_path}")
        os.remove(file_path);
        print(f"File {file_path} removed successfully.")
else:
    print("No Excel files found.")
merge_csv_files(folder_path)

No Excel files found.
['/content/dataset_chapter9.csv', '/content/dataset_Cell.csv', '/content/combined_dataset.csv', '/content/dataset_CropProd_corrected.csv', '/content/dataset_Ch_7_Reaching the Age of Adolescence.csv', '/content/dataset_Microorganisms.csv', '/content/dataset_ConservationOfPlants.csv']
File /content/combined_dataset.csv removed successfully.
Combined dataset saved to 'combined_dataset.csv'.


# Load and preprocess the SciQ dataset

In [19]:
dataset_filepath = '/content/combined_dataset.csv'
cdf = pd.read_csv(dataset_filepath)
print(cdf.shape)
cdf.head()

(695, 5)


Unnamed: 0,Question,Answer,D1,D2,D3
0,1. What life process ensures a species will no...,Reproduction is the life process that ensures ...,Humans reproduce by cloning method.,Humans reproduce by spore formation method.,Humans reproduce by photosynthesis method.
1,2. What are the names of the reproductive proc...,The reproductive process involving two parents...,Humans reproduce by baking method.,Humans reproduce by telepathic method.,Humans reproduce by spontaneous generation met...
2,3. Which animals reproduce sexually and which ...,Animals that reproduce sexually include humans...,Humans reproduce by spore formation method.,Humans reproduce by photosynthesis method.,Humans reproduce by teleportation method.
3,4. Do human beings reproduce by sexual or asex...,Human beings reproduce by sexual methods.,Humans reproduce by spore formation.,Humans reproduce by budding.,Humans reproduce by fragmentation.
4,5. What are two examples of animals that repro...,Two examples of animals that reproduce sexuall...,Humans reproduce by cloning cells.,Humans reproduce by self-fertilization.,Humans reproduce by spore formation.


In [31]:
import re

text = "This string has numbers (123) and special characters!@#$%^&*."

# Regex pattern to match digits and special characters
pattern = r"\d|[^\w\s]"

# Substitute matched characters with an empty string (removal)
processed_text = re.sub(pattern, "", text)

print(processed_text)


This string has numbers  and special characters


In [32]:
import re
str_val = cdf['Question'][0]
re.sub(r'\d|[^\w\s]', '', str_val)

' What life process ensures a species will not disappear from the earth'

##Load sciQ dataset

In [None]:
# Function to load SciQ dataset (same as before)
def load_sciq_dataset(path):
    with open(path, 'r') as file:
        sciq_data = json.load(file)

    data = []
    for item in sciq_data:
        question = item['question']
        correct_answer = item['correct_answer']
        distractors = [item['distractor1'], item['distractor2'], item['distractor3']]

        # Positive pair
        data.append({'question': question, 'answer': correct_answer, 'label': 1})

        # Negative pairs
        for distractor in distractors:
            data.append({'question': question, 'answer': distractor, 'label': 0})

    return pd.DataFrame(data)

# Paths to the datasets
train_path = '/content/train.json'
val_path = '/content/valid.json'
test_path = '/content/test.json'

# Load the datasets
train_df = load_sciq_dataset(train_path)
val_df = load_sciq_dataset(val_path)
test_df = load_sciq_dataset(test_path)

# Display the dataframes
print("Training Data")
print(train_df.head())
print("Validation Data")
print(val_df.head())
print("Test Data")
print(test_df.head())


Training Data
                                            question                answer  \
0  What type of organism is commonly used in prep...  mesophilic organisms   
1  What type of organism is commonly used in prep...              protozoa   
2  What type of organism is commonly used in prep...           gymnosperms   
3  What type of organism is commonly used in prep...               viruses   
4  What phenomenon makes global winds blow northe...       coriolis effect   

   label  
0      1  
1      0  
2      0  
3      0  
4      1  
Validation Data
                                            question    answer  label
0  Who proposed the theory of evolution by natura...    darwin      1
1  Who proposed the theory of evolution by natura...  Linnaeus      0
2  Who proposed the theory of evolution by natura...      shaw      0
3  Who proposed the theory of evolution by natura...    Scopes      0
4  Each specific polypeptide has a unique linear ...     amino      1
Test Data
     

In [None]:
# Function to preprocess data
# Tokenize and pad sequences
def preprocess_data(df, tokenizer, max_sequence_length):
    questions = df['question'].values
    answers = df['answer'].values
    labels = df['label'].values

    question_sequences = tokenizer.texts_to_sequences(questions)
    answer_sequences = tokenizer.texts_to_sequences(answers)

    question_data = pad_sequences(question_sequences, maxlen=max_sequence_length)
    answer_data = pad_sequences(answer_sequences, maxlen=max_sequence_length)

    return question_data, answer_data, labels

# Prepare data for training

In [None]:
# Parameters
max_vocab_size = 20000
max_sequence_length = 100
embedding_dim = 100


In [None]:
# Preprocess training, validation, and test data
tokenizer = Tokenizer(num_words=max_vocab_size)
train_question_data, train_answer_data, train_labels = preprocess_data(train_df, tokenizer, max_sequence_length)
val_question_data, val_answer_data, val_labels = preprocess_data(val_df, tokenizer, max_sequence_length)
test_question_data, test_answer_data, test_labels = preprocess_data(test_df, tokenizer, max_sequence_length)

 # Define the Siamese network

In [None]:
# Define the Siamese network
def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = Embedding(max_vocab_size, embedding_dim)(input)
    x = LSTM(64)(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)


In [None]:
input_shape = (max_sequence_length,)
base_network = create_base_network(input_shape)
# Create the two inputs
input_question = Input(shape=input_shape, name='input_question')
input_answer = Input(shape=input_shape, name='input_answer')
# Encode each of the two inputs into a vector with the base network
encoded_question = base_network(input_question)
encoded_answer = base_network(input_answer)

In [None]:
# Merge the two vectors with a dot product
merged = Concatenate()([encoded_question, encoded_answer])
merged = Dense(128, activation='relu')(merged)
merged = Dense(1, activation='sigmoid')(merged)
# Create the model
model = Model(inputs=[input_question, input_answer], outputs=merged)

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Train model

In [None]:
# Train the model with validation data
model.fit(
    [train_question_data, train_answer_data], train_labels,
    epochs=10,
    batch_size=32,
    validation_data=([val_question_data, val_answer_data], val_labels)
)
print("Model training complete!")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79ddb28cbd30>

In [None]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate([test_question_data, test_answer_data], test_labels)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 0.5631213784217834
Test Accuracy: 0.75


In [None]:
# Function to preprocess new data
def preprocess_texts(texts, tokenizer, max_sequence_length):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
    return padded_sequences


In [None]:
# Function to predict the best answer for a new question
def predict_best_answer(question, answer_options, tokenizer, model, max_sequence_length):
    question_data = preprocess_texts([question] * len(answer_options), tokenizer, max_sequence_length)
    answer_data = preprocess_texts(answer_options, tokenizer, max_sequence_length)

    predictions = model.predict([question_data, answer_data])

    best_answer_index = np.argmax(predictions)
    best_answer = answer_options[best_answer_index]

    return best_answer, predictions


In [None]:
# Example usage
new_question = "What type of organism is commonly used in preparation of foods such as cheese and yogurt?"
answer_options = ["mesophilic organisms", "viruses", "gymnosperms", "protozoa"]

best_answer, predictions = predict_best_answer(new_question, answer_options, tokenizer, model, max_sequence_length)

print(f"Best Answer: {best_answer}")
print(f"Predictions: {predictions}")

Best Answer: mesophilic organisms
Predictions: [[0.26742664]
 [0.26742664]
 [0.26742664]
 [0.26742664]]
