In [122]:
import numpy as np
import pandas as pd 
import seaborn as sns 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder  



# Read the dataset
MCQ = pd.read_csv('results.csv', encoding='latin1')
MCQ.head(18)

Unnamed: 0,Student_ID,Question,Round,Category,Difficulty_Level,Keywords,Correctness
0,660e224f2577d04ccfbecc71,Which one of the following statements is true ...,1,Organic,Easy,alcohol,0
1,660e224f2577d04ccfbecc71,Which one of the following reagents could be u...,1,Organic,Medium,"CH3CH2CHO ,HCHO",0
2,660e224f2577d04ccfbecc71,"Which arrangement of compounds given below, gi...",1,Organic,Hard,"increasing ,acid",0
3,660e224f2577d04ccfbecc71,Which of the following statements is true with...,1,Inorganic,Easy,"s-block ,Group I , Group II ,periodic table",1
4,660e224f2577d04ccfbecc71,What is the electron configuration of chromium...,1,Inorganic,Easy,"Electron configuration, Chromium",1
5,660e224f2577d04ccfbecc71,Which of the following compounds gives NO? on ...,1,Inorganic,Medium,"Heating, Nitrites, Nitrate compounds, NO?",0
6,660e224f2577d04ccfbecc71,An ideally behaving gas of relative molecular ...,1,Physical,Medium,"relative molecular mass, temperature, ideally...",1
7,660e224f2577d04ccfbecc71,At a temperature of 300 K and under a pressure...,1,Physical,Medium,"relative atomic mass,gas,temperature",0
8,660e224f2577d04ccfbecc71,1 mol of a gas at 27°C and under a certain pre...,1,Physical,Hard,"gas,temperature , pressure,volume",1
9,661e3d0b1741a36959ce1624,On heating the product obtained from the react...,1,Organic,Medium,"organic ,ammonia,P2O5,alkyl cyanide",1


In [123]:
    
# Using pandas get_dummies for one-hot encoding
MCQ = pd.get_dummies(MCQ, columns=['Category', 'Difficulty_Level'])

# You might want to drop the original categorical columns if not needed
# MCQ.drop(['Category', 'Difficulty_Level'], axis=1, inplace=True)

In [124]:
print(MCQ.columns)

Index(['Student_ID', 'Question', 'Round', 'Keywords', 'Correctness',
       'Category_Inorganic', 'Category_Organic', 'Category_Physical',
       'Difficulty_Level_Easy', 'Difficulty_Level_Hard',
       'Difficulty_Level_Medium'],
      dtype='object')


In [125]:
# Print the entire DataFrame
print(MCQ)

                  Student_ID  \
0   660e224f2577d04ccfbecc71   
1   660e224f2577d04ccfbecc71   
2   660e224f2577d04ccfbecc71   
3   660e224f2577d04ccfbecc71   
4   660e224f2577d04ccfbecc71   
..                       ...   
94  660e224f2577d04ccfbecc71   
95  660e224f2577d04ccfbecc71   
96  660e224f2577d04ccfbecc71   
97  660e224f2577d04ccfbecc71   
98  660e224f2577d04ccfbecc71   

                                             Question  Round  \
0   Which one of the following statements is true ...      1   
1   Which one of the following reagents could be u...      1   
2   Which arrangement of compounds given below, gi...      1   
3   Which of the following statements is true with...      1   
4   What is the electron configuration of chromium...      1   
..                                                ...    ...   
94  Which one of the following can be used to dist...      3   
95  Which of the following react / reacts with aqu...      3   
96  The concentration of a gas at press

In [126]:
from sklearn.model_selection import train_test_split

In [127]:
# Separate features (X) and labels (y)
X = MCQ_encoded.drop('Correctness', axis=1) 
y = MCQ_encoded['Correctness']


In [128]:
# Splitting the data into 60% train and 40% test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

# Splitting 40% test into 50% validation and 50% final test
x_val, x_final_test, y_val, y_final_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [129]:
#checking if the data is split correctly
for dataset in [y_train,y_val,y_test]:
    print(round(len(dataset)/len(labels),2))

0.6
0.2
0.4


In [130]:
#writing all the data into csv files seperately

x_train.to_csv('train_features.csv',index=False)
x_val.to_csv('val_features.csv',index=False)
x_test.to_csv('test_features.csv',index=False)



y_train.to_csv('train_labels.csv',index=False)
y_val.to_csv('val_labels.csv',index=False)
y_test.to_csv('test_labels.csv',index=False)

In [131]:
# Convert all columns to numeric, coercing errors
numeric_MCQ = MCQ.apply(pd.to_numeric, errors='coerce')

# Find rows with NaN values (indicating non-numeric data)
non_numeric_rows = numeric_MCQ[numeric_MCQ.isnull().any(axis=1)]

print("Rows with non-numeric data:")
print(non_numeric_rows)

Rows with non-numeric data:
    Student_ID  Question  Round  Keywords  Correctness  Category_Inorganic  \
0          NaN       NaN      1       NaN            0               False   
1          NaN       NaN      1       NaN            0               False   
2          NaN       NaN      1       NaN            0               False   
3          NaN       NaN      1       NaN            1                True   
4          NaN       NaN      1       NaN            1                True   
..         ...       ...    ...       ...          ...                 ...   
94         NaN       NaN      3       NaN            1                True   
95         NaN       NaN      3       NaN            1                True   
96         NaN       NaN      3       NaN            1               False   
97         NaN       NaN      3       NaN            1               False   
98         NaN       NaN      3       NaN            1               False   

    Category_Organic  Category_Phys

In [132]:
MCQ_encoded.isnull().sum()

Question                               0
Round                                  0
Keywords                               0
Correctness                            0
Category_Inorganic                     0
Category_Organic                       0
Category_Physical                      0
Difficulty_Level_Easy                  0
Difficulty_Level_Hard                  0
Difficulty_Level_Medium                0
Student_ID_660e224f2577d04ccfbecc71    0
Student_ID_661e3d0b1741a36959ce1624    0
Student_ID_661e3d581741a36959ce162a    0
Student_ID_661e3da01741a36959ce1630    0
Student_ID_661e3dc01741a36959ce1634    0
dtype: int64