In [1]:
pip install datasets transformers

Note: you may need to restart the kernel to use updated packages.


In Python, the transformers library is associated with the Hugging Face Transformers library, which provides a collection of pre-trained models for natural language processing (NLP) tasks.

In [2]:
pip install transformers[torch]

Collecting torch!=1.12.0,>=1.9 (from transformers[torch])
  Obtaining dependency information for torch!=1.12.0,>=1.9 from https://files.pythonhosted.org/packages/d6/a8/43e5033f9b2f727c158456e0720f870030ad3685c46f41ca3ca901b54922/torch-2.1.1-cp311-cp311-win_amd64.whl.metadata
  Using cached torch-2.1.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting accelerate>=0.20.3 (from transformers[torch])
  Obtaining dependency information for accelerate>=0.20.3 from https://files.pythonhosted.org/packages/13/9e/ee987874058f2d93006961f6ff49e0bcb60ab9c26709ebe06bfa8707a4d8/accelerate-0.24.1-py3-none-any.whl.metadata
  Using cached accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Using cached accelerate-0.24.1-py3-none-any.whl (261 kB)
Using cached torch-2.1.1-cp311-cp311-win_amd64.whl (192.3 MB)
Installing collected packages: torch, accelerate
Successfully installed accelerate-0.24.1 torch-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
!apt install git-lfs

'apt' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
#importing transformers
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [83]:
# Let's import the public training set and take a look
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset

train_df = pd.read_csv('Data/train.csv')
print(type(train_df))
train_df.head()
train_ds = Dataset.from_pandas(train_df)
train_ds

<class 'pandas.core.frame.DataFrame'>


Dataset({
    features: ['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 200
})

These lines import the necessary libraries:

'pandas' for data manipulation using DataFrames.
'AutoTokenizer' from the transformers library, which is part of the Hugging Face Transformers library for working with pre-trained models.
Dataset from the datasets library, which is also part of Hugging Face and provides a convenient way to work with datasets.

Python code reads a CSV file into a pandas DataFrame and then converts that DataFrame into a Hugging Face dataset using the datasets library

Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our task is about mutliple choice, we use the AutoModelForMultipleChoice class. Like with the tokenizer, the from_pretrained method will download and cache the model for us.

In [4]:
model_checkpoint = "bert-base-uncased"
model_dir = "bert-base-uncased"
batch_size = 16

from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Firstly we initialize the model checkpoint ,directory and the bsatch size.
 these variables determine the directories where the model is stored,
 batch size refers to the number of training examples used in one iteration.
 Then we import three classes from the transformers library
 'model' variable loads the pre trained weights using the specified model checkpoints.
 then the code imports auto tokenizers
 variable initializes a tokenizer (tokenizer) using the AutoTokenizer class. It loads the pre-trained tokenizer corresponding to the specified model_checkpoint. The use_fast=True parameter activates the fast tokenizer, which is a more efficient implementation.

 Code sets up variables for model checkpoint, model directory, and batch size. It then imports necessary classes from the transformers library and initializes a model and tokenizer for multiple-choice tasks based on the specified model checkpoint

The AutoModel class is part of the Hugging Face Transformers library and is designed to automatically load any pre-trained model based on its name or path

In [5]:
model_name = model_checkpoint.split("/")[-1]

In [56]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    print(example)
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
print(type(tokenized_train_ds))

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'id': 0, 'prompt': 'Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?', 'A': 'MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."', 'B': 'MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.', 'C': 'MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.', 'D': 'MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.', 'E': 'MOND is a t


options = 'ABCDE': This line creates a string containing the options for a multiple-choice question. In this case, the options are 'A', 'B', 'C', 'D', and 'E'.
indices = list(range(5)): This line creates a list of indices corresponding to the options. In this case, it creates the list [0, 1, 2, 3, 4].
option_to_index = {option: index for option, index in zip(options, indices)}: This line creates a dictionary (option_to_index) mapping each option to its corresponding index.
index_to_option = {index: option for option, index in zip(options, indices)}: This line creates a dictionary (index_to_option) mapping each index to its corresponding option.
These dictionaries (option_to_index and index_to_option) are  used for converting between option names and numerical indices

def preprocess(example):: This line defines a function named preprocess that takes an example as input. first_sentence = [example['prompt']] * 5: This line creates a list (first_sentence) by repeating the prompt of the example 5 times. This is  because the model expects a set of question/answer pairs, and here, each option will be paired with the same question.

second_sentence = []: This line initializes an empty list

(second_sentence), which will be filled with the text of each option. for option in options:: This line starts a loop over each option ('A', 'B', 'C', 'D', 'E'). second_sentence.append(example[option]): Inside the loop, the text of each option is added to the second_sentence list.

tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True): This line tokenizes the pairs of questions and options using a tokenizer (assumed to be defined elsewhere in the code) and sets truncation=True to handle cases where the tokenized text exceeds a certain length. tokenized_example['label'] = option_to_index[example['answer']]: This line adds a 'label' key to the tokenized_example dictionary, representing the index of the correct answer option.

return tokenized_example: The function returns the tokenized example

In [14]:
# Following datacollator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# will dynamically pad our questions at batch-time so we don't have to make every question the length
# of our longest question.
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

Importing various classes like 'dataclass' ,pytorch library, padding strategy etc.

'Call' function is called as an instance of class because it flattens the features to make them suitable for processing multiple-choice datasets. It creates a list of dictionaries where each dictionary represents a choice for each example.

Then it uses the 'pad' method of the tokenizer to pad the flattened features. This is where the padding strategy specified earlier is applied. The resulting batch is returned as PyTorch tensors.
then the tensors are reshaped in the batch to have correct dimensions.

It creates a new key 'labels' in the batch, containing the labels converted to a PyTorch tensor.
The final batch is then returned
This DataCollatorForMultipleChoice class is designed to be used with Hugging Face Transformers library for processing multiple-choice datasets during training. It takes a list of features, extracts labels, flattens the features, pads them, and returns a batch suitable for training a model on a multiple-choice task


In [15]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
model = AutoModelForMultipleChoice.from_pretrained(model_dir)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AutoModelForMultipleChoice: This class allows you to load a pre-trained model specifically designed for multiple-choice tasks. It automatically identifies the correct model architecture based on the provided identifier.
TrainingArguments: This class holds hyperparameters and other settings for training a model. It is used to configure the training process.
Trainer: This class facilitates the training of a model using Hugging Face's training loop. It takes care of handling data loading, optimization, and other training-related tasks.
AutoModelForMultipleChoice.from_pretrained(model_dir): initializes a model for multiple-choice tasks by loading pre-trained weights from the directory specified by model_dir. The AutoModelForMultipleChoice class automatically identifies the model architecture based on the content of the directory

these lines of code set up the necessary classes and instantiate a pre-trained model for multiple-choice tasks. The from_pretrained method loads the pre-trained weights of the model, making it ready for further fine-tuning or evaluation on multiple-choice datasets.

In [16]:
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none'
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

Trainer object, which is part of the Hugging Face Transformers library. The Trainer class facilitates the training of a mode
initializing various parameters which will specify the model that will be trained also the training arguments ,tokenizers,provides the evaluation dataset to the 'trainer' method

The Trainer object is configured with the necessary components, and you can use it to train the model by calling the trainer.train() method. The Trainer will handle the training loop, data loading, and other aspects of the training process based on the provided configuration.

In [18]:
trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.5099483728408813, 'eval_runtime': 128.2979, 'eval_samples_per_second': 1.559, 'eval_steps_per_second': 0.39, 'epoch': 1.0}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.1821080446243286, 'eval_runtime': 127.545, 'eval_samples_per_second': 1.568, 'eval_steps_per_second': 0.392, 'epoch': 2.0}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.993130624294281, 'eval_runtime': 120.739, 'eval_samples_per_second': 1.656, 'eval_steps_per_second': 0.414, 'epoch': 3.0}
{'train_runtime': 1607.0091, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.093, 'train_loss': 1.4085071818033854, 'epoch': 3.0}


TrainOutput(global_step=150, training_loss=1.4085071818033854, metrics={'train_runtime': 1607.0091, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.093, 'train_loss': 1.4085071818033854, 'epoch': 3.0})

In [76]:
tokenized_train_ds

Dataset({
    features: ['id', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 200
})

In [19]:
# Now we can actually make predictions on our questions
predictions = trainer.predict(tokenized_train_ds)

  0%|          | 0/50 [00:00<?, ?it/s]

In [93]:
def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    print(example)
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example
    
example = {};
question = "What is the PH of H2O? "
A = ["7"]
B = ["8"]
C = ["9"]
D = ["10"]
E = ["6"]
answer = []
example['id'] = 0
example['prompt'] = question
example['A'] = A 
example['B'] = B 
example['C'] = C 
example['D'] = D
example['E'] = E 
example['answer'] = 'A'
example
exam = pd.DataFrame(example)
x = Dataset.from_pandas(exam)
tokenized_data = x.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
result = trainer.predict(tokenized_data)
# print(type(tokenized_data))
# print(result)

import numpy as np
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

predictions_to_map_output(result.predictions)



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

{'id': 0, 'prompt': 'What is the PH of H2O? ', 'A': '7', 'B': '8', 'C': '9', 'D': '10', 'E': '6', 'answer': 'A'}


  0%|          | 0/1 [00:00<?, ?it/s]

array(['C B E'], dtype='<U5')

code is used to obtain predictions based on the provided dataset
predict(tokenized_train_ds): The predict method of the Trainer class is called, and it takes a dataset (tokenized_train_ds) as input. This method is used for making predictions on a dataset using the trained model.

predictions: The result of the predict method, which typically contains the model's predictions on the input dataset. The format of predictions depends on the task and the model. It might be a tuple or dictionary containing various information, such as predicted labels, probabilities, or logits.
code is generating predictions on the training dataset using the model that was previously trained using the Trainer. The specific format of predictions would need to be examined based on the task and model used in your case.

In [20]:
predictions

PredictionOutput(predictions=array([[-3.29500943e-01,  8.42755377e-01, -5.62532723e-01,
         9.99689579e-01, -3.91041785e-01],
       [ 2.35375857e+00,  2.11091256e+00,  2.19587922e+00,
         2.16951275e+00,  2.10738993e+00],
       [ 1.73730016e+00, -7.69336224e-01, -6.22683167e-01,
        -8.28998029e-01, -7.34583318e-01],
       [ 2.03671145e+00,  8.51344347e-01,  1.95951366e+00,
         1.33506691e+00,  6.72835946e-01],
       [ 6.53480411e-01,  6.23172998e-01,  2.70098776e-01,
         7.87054658e-01,  8.25151324e-01],
       [ 1.75513899e+00,  2.30984902e+00,  2.10423636e+00,
         1.26888466e+00,  1.93883777e+00],
       [ 2.05725241e+00,  1.88370109e+00,  2.07871509e+00,
         2.05981088e+00,  7.39846110e-01],
       [ 7.42096603e-01,  1.41246998e+00,  4.02848542e-01,
         1.43272853e+00,  1.28911662e+00],
       [-8.59663606e-01, -7.96784520e-01, -2.79428482e-01,
        -9.24487710e-01, -9.00720775e-01],
       [ 2.39825177e+00, -6.09537363e-01, -2.61861905

In [21]:
import numpy as np
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

defines a function predictions_to_map_output that takes a 2D array of predictions and converts them into a formatted output
 the function takes a 2D array of predictions, sorts the indices of the predictions, selects the top three indices for each row, converts the indices to their corresponding option names, and finally, joins the option names for each row into a space-separated string. The output is an array of strings representing the top predictions for each example.

In [23]:
predictions_to_map_output(predictions.predictions)

array(['D B A', 'A C D', 'A C E', 'A C D', 'E D A', 'B C E', 'C D A',
       'D B E', 'C B A', 'A E C', 'E A B', 'A D B', 'C B A', 'D E C',
       'B A E', 'B C A', 'E B C', 'E A B', 'D A B', 'E D B', 'D B C',
       'E D B', 'C D B', 'B A C', 'E A D', 'E B A', 'A D E', 'D C B',
       'E B A', 'C B E', 'B D E', 'E C D', 'E B D', 'D E A', 'C B A',
       'B D E', 'D E A', 'A D C', 'E D A', 'A E D', 'E A D', 'E C D',
       'B D A', 'D C B', 'A B D', 'A B D', 'B C D', 'C A B', 'D C E',
       'B A D', 'B C A', 'E D A', 'C A B', 'A C D', 'B A E', 'B E C',
       'C E B', 'C B A', 'D A E', 'B C A', 'B C E', 'D B E', 'B C E',
       'C A B', 'A E D', 'E D C', 'C D A', 'E A D', 'D C E', 'D E C',
       'A C E', 'A E D', 'E D C', 'B E A', 'D B E', 'B C D', 'C D B',
       'B E D', 'C B E', 'E A C', 'C D E', 'A C E', 'B D C', 'A E C',
       'C B A', 'D C A', 'D A E', 'B D A', 'E A C', 'D A B', 'B D A',
       'B C D', 'B D C', 'E D B', 'E D B', 'C D A', 'C D B', 'D B E',
       'D B A', 'D E

code takes the model predictions, processes them using the specified function, and likely returns a formatted output representing the top predicted answers for each example. The details of the output format depend on the specifics of the predictions_to_map_output

In [25]:
test_df = pd.read_csv('Data/test.csv')
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...


converting the csv test file into pandas data frame and
then displaying the first few rows using 'head' method

In [26]:
# There are more verbose/elegant ways of doing this, but if we give our test set a random `answer` column
# we can make predictions directly with our trainer.
test_df['answer'] = 'A'

# Other than that we'll preprocess it in the same way we preprocessed test.csv
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [79]:
# tokenized_test_ds
test_ds

Dataset({
    features: ['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 200
})


firstly,
code adds a new column named 'answer' to the test DataFrame (test_df) and assigns the constant value 'A' to every row in that column. The purpose of this step is to create a placeholder for the answer. This is done to align the test dataset structure with the structure used during training
code applies a preprocessing function (preprocess) to the test dataset using the map method. The preprocess function is  to perform tasks such as tokenization and removing unnecessary columns. Here, the batched=False parameter shows that the mapping is done element-wise (not in batches), and remove_columns is used to remove specified columns from the dataset. In this case, columns like 'prompt', 'A', 'B', 'C', 'D', 'E', and 'answer' are removed, leaving only the relevant information needed for making predictions.

After these steps, tokenized_test_ds is prepared for making predictions with the model. The 'answer' column, even though it was assigned a constant value, serves the purpose of maintaining the structure and consistency of the test dataset with the training data.

In [27]:
# Here we'll generate our "real" predictions on the test set
test_predictions = trainer.predict(tokenized_test_ds)

  0%|          | 0/50 [00:00<?, ?it/s]

.predict(tokenized_test_ds): This calls the predict method of the Trainer class, which is used for making predictions on a dataset. In this case, the dataset passed to it is tokenized_test_ds, which is the test dataset that has been tokenized and preprocessed.

test_predictions: This variable is assigned the result of the predict method. The specific format of test_predictions will depend on the task and the model architecture. Typically, it contains information such as predicted labels, probabilities, or logits.

After running this line, test_predictions should contain the model's predictions on the test dataset, and you can use this information for further analysis

In [28]:
# Now we can create our submission using the id column from test.csv
submission_df = test_df[['id']]
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)

submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)


Unnamed: 0,id,prediction
0,0,D B A
1,1,A C D
2,2,A C E
3,3,A C D
4,4,E D A


creating a submission DataFrame that pairs the 'id' column from the test dataset with the corresponding model predictions, which have been processed and formatted using the predictions_to_map_output function. This submission_df is  for submission in a competition or as part of the evaluation process

In [40]:
# Once we write our submission file we're good to submit!
submission_df.to_csv('submission.csv', index=False)
submission_df

Unnamed: 0,id,prediction
0,0,D B A
1,1,A C D
2,2,A C E
3,3,A C D
4,4,E D A
...,...,...
195,195,C A E
196,196,B C E
197,197,B C A
198,198,C B D


In [62]:
length = len(submission_df)
totalCorrectPrediction = 0

for i in range(length):
    # print(train_df['answer'][i])
    if(submission_df['prediction'][i][0] == train_df['answer'][i]):
        totalCorrectPrediction += 1

print("Accuracy of Model is : ",totalCorrectPrediction/length)



Accuracy of Model is :  0.815


In [94]:
import pickle

with open('model_pickle','wb') as f:
    pickle.dump(trainer,f)


In [95]:
with open('model_pickle','rb') as f:
    model = pickle.load(f)

In [96]:
def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    print(example)
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example
    
example = {};
question = "What is the PH of H2O? "
A = ["7"]
B = ["8"]
C = ["9"]
D = ["10"]
E = ["6"]
answer = []
example['id'] = 0
example['prompt'] = question
example['A'] = A 
example['B'] = B 
example['C'] = C 
example['D'] = D
example['E'] = E 
example['answer'] = 'A'
example
exam = pd.DataFrame(example)
x = Dataset.from_pandas(exam)
tokenized_data = x.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
result = model.predict(tokenized_data)
# print(type(tokenized_data))
# print(result)

import numpy as np
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

predictions_to_map_output(result.predictions)



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

{'id': 0, 'prompt': 'What is the PH of H2O? ', 'A': '7', 'B': '8', 'C': '9', 'D': '10', 'E': '6', 'answer': 'A'}


  0%|          | 0/1 [00:00<?, ?it/s]

array(['C B E'], dtype='<U5')

converting submission data frame csv file naming it 'submission.csv'