In [23]:
from sklearn.svm import SVC 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr,pointbiserialr,iqr
from sklearn.preprocessing  import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix,classification_report,root_mean_squared_error, r2_score
import numpy as np
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import shap
import json
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [24]:
# Relative locations of the challenge files; they resolve against the kernel's
# current working directory, so the notebook must be run from the folder that
# contains `challenge_2_dataset/`.
training_path="challenge_2_dataset/training.json"
testing_path="challenge_2_dataset/test.json"
ground_truth_path="challenge_2_dataset/ground_truth.json"


In [25]:
# Load the training set: the file is parsed as one JSON document per line.
# The first line is skipped, exactly as before (presumably a header or record
# count rather than a record -- TODO confirm against the dataset description).
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default.
with open(training_path, "r", encoding="utf-8") as file:
    next(file, None)  # discard the first line; the default avoids StopIteration on an empty file
    # Stream the remaining lines instead of materialising them with readlines(),
    # and ignore blank lines (e.g. a trailing newline) that json.loads would reject.
    training_data = [json.loads(line) for line in file if line.strip()]
training_df = pd.DataFrame(training_data)


In [26]:
# Preview the first 10 loaded records; subjects a student did not take show up as NaN.
training_df.head(10)

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,serial,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
0,8.0,7.0,3.0,4,6,195490,,,,,
1,1.0,1.0,1.0,3,3,190869,,,,,
2,1.0,2.0,2.0,1,2,3111,,,,,
3,8.0,7.0,6.0,7,7,47738,,,,,
4,1.0,1.0,1.0,3,2,85520,,,,,
5,2.0,1.0,,4,8,182318,2.0,,,,
6,3.0,4.0,5.0,5,8,77482,,,,,
7,,,,6,7,152940,,2.0,5.0,3.0,
8,5.0,6.0,,3,8,132620,7.0,,,,
9,,,,3,2,179461,,2.0,1.0,1.0,


In [27]:
# Subject triples a record must fully cover to be kept. The order matters to
# code that labels rows by position in this list (first entry -> group 1, ...).
required_subject_combinations = [
    set(subject_triple)
    for subject_triple in (
        ("Physics", "Chemistry", "ComputerScience"),
        ("Physics", "Chemistry", "PhysicalEducation"),
        ("Physics", "Chemistry", "Economics"),
        ("Physics", "Chemistry", "Biology"),
        ("Economics", "Accountancy", "BusinessStudies"),
    )
]


def matches_refined_combination(row, combinations):
    """Tell whether a row covers at least one of the given subject combinations.

    Parameters
    ----------
    row : pandas.Series
        A single record; every index label whose value is non-null counts as a
        subject that is present, except ``"English"``, which is always ignored.
    combinations : iterable of set
        Candidate subject sets to test against the row.

    Returns
    -------
    bool
        True if any combination is a subset of the row's present subjects,
        False otherwise (including when ``combinations`` is empty).
    """
    present_subjects = set()
    for subject in row.index:
        # Same evaluation order as before: null check first, then the English exclusion.
        if pd.isnull(row[subject]) or subject == "English":
            continue
        present_subjects.add(subject)
    return any(required.issubset(present_subjects) for required in combinations)

# Boolean mask: True for rows whose non-null subjects cover a required combination.
row_is_eligible = training_df.apply(
    matches_refined_combination,
    axis=1,
    args=(required_subject_combinations,),
)
filtered_training_df = training_df[row_is_eligible]


filtered_training_df

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,serial,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
0,8.0,7.0,3.0,4,6,195490,,,,,
1,1.0,1.0,1.0,3,3,190869,,,,,
2,1.0,2.0,2.0,1,2,3111,,,,,
3,8.0,7.0,6.0,7,7,47738,,,,,
4,1.0,1.0,1.0,3,2,85520,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
79460,2.0,2.0,,3,1,705,,,,,3.0
79461,6.0,4.0,,5,5,155605,3.0,,,,
79462,,,,3,5,55508,,2.0,3.0,3.0,
79463,2.0,2.0,1.0,1,1,136471,,,,,


In [28]:
# Remove the `serial` column (presumably a record identifier rather than a
# subject score -- TODO confirm). errors="ignore" keeps this self-referential
# cell re-runnable: a second execution would otherwise raise KeyError because
# the column has already been dropped.
filtered_training_df = filtered_training_df.drop(columns=["serial"], errors="ignore")


In [29]:
# Check the remaining columns; `serial` should no longer be listed.
filtered_training_df.columns

Index(['Physics', 'Chemistry', 'PhysicalEducation', 'English', 'Mathematics',
       'Biology', 'Accountancy', 'BusinessStudies', 'Economics',
       'ComputerScience'],
      dtype='object')

In [30]:
def find_group(row, combinations=None):
    """Label a row with the first subject combination it fully covers.

    Parameters
    ----------
    row : pandas.Series
        A single record; every index label whose value is not NaN counts as a
        subject that is present.
    combinations : iterable of set, optional
        Ordered subject combinations to test. When omitted, the notebook-level
        ``required_subject_combinations`` is used, so existing calls such as
        ``df.apply(find_group, axis=1)`` behave exactly as before.

    Returns
    -------
    str
        ``"group_<n>"`` where ``n`` is the 1-based position of the first
        matching combination, or ``"unknown"`` when none matches.

    Notes
    -----
    Matching is first-wins: a row that covers several combinations is assigned
    to the earliest one in the list.
    """
    if combinations is None:
        # Resolved at call time so the default tracks the current notebook value.
        combinations = required_subject_combinations

    # Labels of the columns that hold a non-NaN value in this row.
    non_nan_columns = set(row.index[row.notna()])

    for idx, combination in enumerate(combinations, start=1):
        if combination.issubset(non_nan_columns):
            return f"group_{idx}"
    return "unknown"

# Add a `group` column holding the label find_group returns for each row.
filtered_training_df["group"] = filtered_training_df.apply(find_group, axis="columns")

In [31]:
# Spot-check the first rows together with their assigned `group` label.
filtered_training_df.head()

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,Biology,Accountancy,BusinessStudies,Economics,ComputerScience,group
0,8.0,7.0,3.0,4,6,,,,,,group_2
1,1.0,1.0,1.0,3,3,,,,,,group_2
2,1.0,2.0,2.0,1,2,,,,,,group_2
3,8.0,7.0,6.0,7,7,,,,,,group_2
4,1.0,1.0,1.0,3,2,,,,,,group_2


In [32]:
# List the distinct group labels; an "unknown" entry here would mean a row matched no combination.
filtered_training_df["group"].unique()

array(['group_2', 'group_4', 'group_5', 'group_1', 'group_3'],
      dtype=object)