In [80]:
import numpy as np
import pandas as pd 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder  



# Read the dataset
MCQ = pd.read_csv('results.csv', encoding='latin1')
MCQ.head(18)

Unnamed: 0,Student_ID,Round,Category,Difficulty_Level,Keywords,Correctness
0,660e224f2577d04ccfbecc71,1,Organic,Easy,alcohol,0
1,660e224f2577d04ccfbecc71,1,Organic,Medium,"CH3CH2CHO ,HCHO",0
2,660e224f2577d04ccfbecc71,1,Organic,Hard,"increasing ,acid",0
3,660e224f2577d04ccfbecc71,1,Inorganic,Easy,"s-block ,Group I , Group II ,periodic table",1
4,660e224f2577d04ccfbecc71,1,Inorganic,Easy,"Electron configuration, Chromium",1
5,660e224f2577d04ccfbecc71,1,Inorganic,Medium,"Heating, Nitrites, Nitrate compounds, NO?",0
6,660e224f2577d04ccfbecc71,1,Physical,Medium,"relative molecular mass, temperature, ideally...",1
7,660e224f2577d04ccfbecc71,1,Physical,Medium,"relative atomic mass,gas,temperature",0
8,660e224f2577d04ccfbecc71,1,Physical,Hard,"gas,temperature , pressure,volume",1
9,661e3d0b1741a36959ce1624,1,Organic,Medium,"organic ,ammonia,P2O5,alkyl cyanide",1


In [81]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Label encode 'Category' column
MCQ['Category_encoded'] = label_encoder.fit_transform(MCQ['Category'])
MCQ.drop('Category', axis=1, inplace=True)

# Label encode 'Difficulty_Level' column
MCQ['Difficulty_Level_encoded'] = label_encoder.fit_transform(MCQ['Difficulty_Level'])
MCQ.drop('Difficulty_Level', axis=1, inplace=True)

# Label encode 'Student_ID' column
MCQ['Student_ID_encoded'] = label_encoder.fit_transform(MCQ['Student_ID'])
MCQ.drop('Student_ID', axis=1, inplace=True)


In [82]:
print(MCQ.head(18))

    Round                                           Keywords  Correctness  \
0       1                                            alcohol            0   
1       1                                    CH3CH2CHO ,HCHO            0   
2       1                                   increasing ,acid            0   
3       1        s-block ,Group I , Group II ,periodic table            1   
4       1                   Electron configuration, Chromium            1   
5       1         Heating, Nitrites, Nitrate compounds, NO?             0   
6       1   relative molecular mass, temperature, ideally...            1   
7       1               relative atomic mass,gas,temperature            0   
8       1                  gas,temperature , pressure,volume            1   
9       1                organic ,ammonia,P2O5,alkyl cyanide            1   
10      1  Butanone,Dehydrogenation,Butan-2-ol,Oxidation,...            0   
11      1                                            alcohol            1   

In [83]:
# Split the 'Keywords' column into individual keywords
keywords_split = MCQ['Keywords'].str.split(',')

# Get unique keywords
unique_keywords = sorted(set(keyword for sublist in keywords_split for keyword in sublist))

# Create a DataFrame with one-hot encoded keywords
keyword_MCQ = pd.DataFrame(0, index=MCQ.index, columns=unique_keywords)

# Fill the one-hot encoded DataFrame
for i, keywords in enumerate(keywords_split):
    keyword_MCQ.loc[i, keywords] = 1

# Concatenate the one-hot encoded DataFrame with the original DataFrame
MCQ = pd.concat([MCQ, keyword_MCQ], axis=1)

# Drop the original 'Keywords' column
MCQ.drop(columns=['Keywords'], inplace=True)



In [84]:
print(MCQ.columns)

Index(['Round', 'Correctness', 'Category_encoded', 'Difficulty_Level_encoded',
       'Student_ID_encoded', ' ', '  N2', ' (CH3)2CHCHO', ' (CH3)2CHOH',
       ' (CH3)3CCOOCH ',
       ...
       'sodium amalgam', 'solubility', 'solution', 'system', 'temperature',
       'temperature ', 'titration', 'volume', 'weak base',
       'weak monobasic acids'],
      dtype='object', length=310)


In [85]:
from sklearn.model_selection import train_test_split

In [86]:
# Separate features (X) and labels (y)
features = MCQ.drop('Correctness', axis=1)
labels = MCQ['Correctness']

In [87]:
# Splitting the data into 60% train and 40% test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

# Splitting 40% test into 50% validation and 50% final test
x_val, x_final_test, y_val, y_final_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [88]:
#checking if the data is split correctly
for dataset in [y_train,y_val,y_test]:
    print(round(len(dataset)/len(labels),2))

0.6
0.2
0.4


In [89]:
#writing all the data into csv files seperately

x_train.to_csv('train_features.csv',index=False)
x_val.to_csv('val_features.csv',index=False)
x_test.to_csv('test_features.csv',index=False)



y_train.to_csv('train_labels.csv',index=False)
y_val.to_csv('val_labels.csv',index=False)
y_test.to_csv('test_labels.csv',index=False)

In [90]:
# Convert all columns to numeric, coercing errors
numeric_MCQ = MCQ.apply(pd.to_numeric, errors='coerce')

# Find rows with NaN values (indicating non-numeric data)
non_numeric_rows = numeric_MCQ[numeric_MCQ.isnull().any(axis=1)]

print("Rows with non-numeric data:")
print(non_numeric_rows)

Rows with non-numeric data:
Empty DataFrame
Columns: [Round, Correctness, Category_encoded, Difficulty_Level_encoded, Student_ID_encoded,  ,   N2,  (CH3)2CHCHO,  (CH3)2CHOH,  (CH3)3CCOOCH ,  Al,  Arrhenius equation,  Ba(NO3)2,  C,  C6H5CH2CH2CH3,  C6H5CHO,  C6H5COCH2CH3,  CH3CH2CHO,  CH3CH2CHO ,  CO2,  Chromium,  Color,  Compounds,  Electronic transitions,  Elements,  F,  Fe,  FeBr3,  Gibbs free energy,  Group I,  Group II,  Group II ,  H,  H2O,  HBr,  HCHO,  HCl,  He,  Heating,  Highest,  ICH2COOH,  K ,  KI,  Li,  LiAlH4,  Mg,  Mg(NO3)2,  N,  NH3,  NO2,  NO? ,  Na,  NaBH4,  NaHCO3,  NaNH2,  Nitrate compounds,  Nitrites,  O,  Oxidation state,  STP,  Si,  Tollen's reagent,  acetyl chloride,  acidic strength,  activation energy,  alcohol,  alcohols,  aldehydes,  appropriateness,  appropriateness
,  aqueous KHCO3,  aqueous KOH,  aqueous NaOH,  aqueous sodium hydroxide,  bromine,  butane-1,  calcium carbide,  calculation,  carbon,  carbon dioxide,  carboxylic acids,  chemical tests,  chlor

In [91]:
MCQ.isnull().sum()

Round                       0
Correctness                 0
Category_encoded            0
Difficulty_Level_encoded    0
Student_ID_encoded          0
                           ..
temperature                 0
titration                   0
volume                      0
weak base                   0
weak monobasic acids        0
Length: 310, dtype: int64

In [94]:
import joblib

# Save the keyword_MCQ object using joblib
joblib.dump(keyword_MCQ, 'encoded_keywords.pkl')

['encoded_keywords.pkl']

['training_feature_names.pkl']