In [30]:
import numpy as np
import pandas as pd 
import seaborn as sns 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder  



# Read the dataset
MCQ = pd.read_csv('results.csv', encoding='latin1')
MCQ.head(18)

Unnamed: 0,Student_ID,Question,Round,Category,Difficulty_Level,Keywords,Correctness
0,660e224f2577d04ccfbecc71,Which one of the following statements is true ...,1,Organic,Easy,alcohol,0
1,660e224f2577d04ccfbecc71,Which one of the following reagents could be u...,1,Organic,Medium,"CH3CH2CHO ,HCHO",0
2,660e224f2577d04ccfbecc71,"Which arrangement of compounds given below, gi...",1,Organic,Hard,"increasing ,acid",0
3,660e224f2577d04ccfbecc71,Which of the following statements is true with...,1,Inorganic,Easy,"s-block ,Group I , Group II ,periodic table",1
4,660e224f2577d04ccfbecc71,What is the electron configuration of chromium...,1,Inorganic,Easy,"Electron configuration, Chromium",1
5,660e224f2577d04ccfbecc71,Which of the following compounds gives NO? on ...,1,Inorganic,Medium,"Heating, Nitrites, Nitrate compounds, NO?",0
6,660e224f2577d04ccfbecc71,An ideally behaving gas of relative molecular ...,1,Physical,Medium,"relative molecular mass, temperature, ideally...",1
7,660e224f2577d04ccfbecc71,At a temperature of 300 K and under a pressure...,1,Physical,Medium,"relative atomic mass,gas,temperature",0
8,660e224f2577d04ccfbecc71,1 mol of a gas at 27°C and under a certain pre...,1,Physical,Hard,"gas,temperature , pressure,volume",1
9,661e3d0b1741a36959ce1624,On heating the product obtained from the react...,1,Organic,Medium,"organic ,ammonia,P2O5,alkyl cyanide",1


In [31]:
    
# Using pandas get_dummies for one-hot encoding
MCQ = pd.get_dummies(MCQ, columns=['Category', 'Difficulty_Level','Student_ID'])


In [32]:
# Print the entire DataFrame
print(MCQ)

                                             Question  Round  \
0   Which one of the following statements is true ...      1   
1   Which one of the following reagents could be u...      1   
2   Which arrangement of compounds given below, gi...      1   
3   Which of the following statements is true with...      1   
4   What is the electron configuration of chromium...      1   
..                                                ...    ...   
94  Which one of the following can be used to dist...      3   
95  Which of the following react / reacts with aqu...      3   
96  The concentration of a gas at pressure of 1 at...      3   
97  Describe the relationship between temperature ...      3   
98  250 cm^3 of oxygen was collected by the downwa...      3   

                                             Keywords  Correctness  \
0                                             alcohol            0   
1                                     CH3CH2CHO ,HCHO            0   
2                    

In [33]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'Question' column
question_tfidf = tfidf_vectorizer.fit_transform(MCQ['Question'])

# Convert the TF-IDF matrix into a DataFrame
question_tfidf_df = pd.DataFrame(question_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the new TF-IDF DataFrame
MCQ = pd.concat([MCQ, question_tfidf_df], axis=1)

# Drop the original 'Question' column if needed
MCQ.drop('Question', axis=1, inplace=True)

# Now repeat the same process for the 'Keywords' column if needed
keywords_tfidf = tfidf_vectorizer.fit_transform(MCQ['Keywords'])
keywords_tfidf_df = pd.DataFrame(keywords_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
MCQ = pd.concat([MCQ, keywords_tfidf_df], axis=1)
# Drop the original 'Keywords' column if needed
MCQ.drop('Keywords', axis=1, inplace=True)


In [34]:
print(MCQ.columns)

Index(['Round', 'Correctness', 'Category_Inorganic', 'Category_Organic',
       'Category_Physical', 'Difficulty_Level_Easy', 'Difficulty_Level_Hard',
       'Difficulty_Level_Medium', 'Student_ID_660e224f2577d04ccfbecc71',
       'Student_ID_661e3d0b1741a36959ce1624',
       ...
       'transitions', 'vapor', 'vaporization', 'variable', 'vessel', 'volume',
       'water', 'weak', 'white', 'with'],
      dtype='object', length=818)


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Separate features (X) and labels (y)
features = MCQ.drop('Correctness', axis=1)
labels = MCQ['Correctness']

In [40]:
# Splitting the data into 60% train and 40% test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

# Splitting 40% test into 50% validation and 50% final test
x_val, x_final_test, y_val, y_final_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [41]:
#checking if the data is split correctly
for dataset in [y_train,y_val,y_test]:
    print(round(len(dataset)/len(labels),2))

0.6
0.2
0.4


In [42]:
#writing all the data into csv files seperately

x_train.to_csv('train_features.csv',index=False)
x_val.to_csv('val_features.csv',index=False)
x_test.to_csv('test_features.csv',index=False)



y_train.to_csv('train_labels.csv',index=False)
y_val.to_csv('val_labels.csv',index=False)
y_test.to_csv('test_labels.csv',index=False)

In [43]:
# Convert all columns to numeric, coercing errors
numeric_MCQ = MCQ.apply(pd.to_numeric, errors='coerce')

# Find rows with NaN values (indicating non-numeric data)
non_numeric_rows = numeric_MCQ[numeric_MCQ.isnull().any(axis=1)]

print("Rows with non-numeric data:")
print(non_numeric_rows)

Rows with non-numeric data:
Empty DataFrame
Columns: [Round, Correctness, Category_Inorganic, Category_Organic, Category_Physical, Difficulty_Level_Easy, Difficulty_Level_Hard, Difficulty_Level_Medium, Student_ID_660e224f2577d04ccfbecc71, Student_ID_661e3d0b1741a36959ce1624, Student_ID_661e3d581741a36959ce162a, Student_ID_661e3da01741a36959ce1630, Student_ID_661e3dc01741a36959ce1634, 00, 08g, 10, 100, 1120, 12, 13, 14, 15, 16, 164, 18, 19, 20, 200, 2260, 23, 24, 25, 250, 27, 2b, 2chch2oh, 2chcho, 2choh, 30, 300, 32, 34g, 393, 3ccooch, 40, 4375, 50, 511, 60, 75, 750, 76g, 80, 88g, 98, about, above, acetone, acetyl, acid, acidic, acidified, acids, activation, added, after, ai, al, al2o3, alcohols, aldehydes, alkyl, all, aluminium, amalgam, ammonia, ammoniacal, among, amount, an, and, anhydride, another, appropriate, apprpriate, aqueous, are, arrangement, arrhenius, as, assume, at, atm, atmosphere, atmospheric, atomic, attain, ba, base, be, ...]
Index: []

[0 rows x 818 columns]


In [45]:
MCQ.isnull().sum()

Round                 0
Correctness           0
Category_Inorganic    0
Category_Organic      0
Category_Physical     0
                     ..
volume                0
water                 0
weak                  0
white                 0
with                  0
Length: 818, dtype: int64