In [None]:
"""
Hakan Güldal, hguldal@trakya.edu.tr, hguldal@gmail.com, 2024
Information Gain Analysis

Hardware Infrastructure: Google Compute Engine backend (GPU); System RAM: 51.0 GB; NVIDIA T4 GPU
Software Infrastructure: Python, Scikit-Learn & Pandas Libraries

"""

import pandas as pd

# Load the data set from 'data.csv' (path relative to the working directory)
# into a DataFrame; the later cells read their columns from `df`.
df = pd.read_csv(filepath_or_buffer='data.csv')

from sklearn.feature_selection import mutual_info_classif

# Column names of the noncognitive features to be scored, one row of ten
# names per line (order is preserved from the original single-line list).
featureList = [
    'ST003D02T', 'ST003D03T', 'GRADE', 'ST001D01T', 'AGE', 'ST004D01T', 'MISCED', 'FISCED', 'HISCED', 'PAREDINT',
    'OCOD1', 'OCOD2', 'BMMJ1', 'BFMJ2', 'HISEI', 'DURECEC', 'ISCEDP', 'PROGN', 'REPEAT', 'MISSSC',
    'SKIPPING', 'TARDYSD', 'EXPECEDU', 'OCOD3', 'BSMJ', 'SISCO', 'COBN_S', 'COBN_M', 'COBN_F', 'IMMIG',
    'LANGN', 'MATHMOT', 'MATHEASE', 'MATHPREF', 'EXERPRAC', 'STUDYHMW', 'WORKPAY', 'WORKHOME', 'ST250D06JA', 'ST250D07JA',
    'ST251D08JA', 'ST251D09JA', 'ST330D10WA', 'HOMEPOS', 'ICTRES', 'INFOSEEK', 'BULLIED', 'FEELSAFE', 'TEACHSUP', 'RELATST',
    'SCHRISK', 'BELONG', 'GROSAGR', 'ANXMAT', 'MATHEFF', 'MATHEF21', 'MATHPERS', 'FAMCON', 'ASSERAGR', 'COOPAGR',
    'CURIOAGR', 'EMOCOAGR', 'EMPATAGR', 'PERSEVAGR', 'STRESAGR', 'EXPOFA', 'EXPO21ST', 'COGACRCO', 'COGACMCO', 'DISCLIM',
    'FAMSUP', 'CREATFAM', 'CREATSCH', 'CREATEFF', 'CREATOP', 'IMAGINE', 'OPENART', 'CREATAS', 'CREATOOS', 'FAMSUPSL',
    'FEELLAH', 'PROBSELF', 'SDLEFF', 'SCHSUST', 'LEARRES', 'ESCS', 'ICTAVSCH', 'ICTAVHOM', 'ICTDISTR', 'ICTSCH',
    'ICTHOME', 'ICTQUAL', 'ICTSUBJ', 'ICTENQ', 'ICTFEED', 'ICTOUT', 'ICTWKDY', 'ICTWKEND', 'ICTREG', 'ICTINFO',
    'ICTEFFIC', 'STUBMI', 'BODYIMA', 'SOCONPA', 'LIFESAT', 'PSYCHSYM', 'SOCCON', 'EXPWB', 'PAREXPT', 'CURSUPP',
    'PQMIMP', 'PQMCAR', 'PARINVOL', 'PQSCHOOL', 'PASCHPOL', 'ATTIMMP', 'CREATHME', 'CREATACT', 'CREATOPN', 'CREATOR',
    'FCFMLRTY', 'FLSCHOOL', 'FLMULTSB', 'FLFAMILY', 'ACCESSFP', 'FLCONFIN', 'FLCONICT', 'ACCESSFA', 'ATTCONFM', 'FRINFLFM',
]

# Name of the class-label column (the dependent variable)
classLabel = 'MS2_LEV'


In [None]:
# Compute the information gain (mutual information) between every
# noncognitive feature and the class label, one feature at a time, and
# collect the scores in the DataFrame `dfResults`
# (columns: 'Instance' = number of complete rows used, 'Feature', 'Score').

# Convert the numeric codes in the dependent variable to text labels.
# Only the codes 1 and 3 are mapped; any other value is left unchanged.
df[classLabel] = df[classLabel].replace({1: 'Low', 3: 'High'})

# mutual_info_classif adds small random noise to continuous features, so a
# fixed seed is required for the scores (and their ranking) to be reproducible.
RANDOM_SEED = 42

# Sentinel score recorded for a feature that has no complete rows at all
NO_DATA_SCORE = -99999

# Create a list to store one [instance count, feature name, score] row per feature
results = []

# Loop through all noncognitive variables
for feature in featureList:

  # Progress output: name of the feature currently being scored
  print(feature)

  # Pair the feature with the dependent variable and drop every row in which
  # either of the two values is missing
  dfTemp = df[[feature, classLabel]].dropna()

  # X: the single noncognitive feature, kept 2-D (n_samples, 1) as sklearn expects
  # y: the dependent variable (student math performance) as a 1-D Series;
  #    a one-column DataFrame here makes sklearn emit a DataConversionWarning
  X = dfTemp[[feature]]
  y = dfTemp[classLabel]

  # NOTE(review): with the default discrete_features='auto' a dense X is
  # treated as continuous - confirm this is intended for code-like columns.
  if len(y) > 0:
    mutual_info = mutual_info_classif(X, y, random_state=RANDOM_SEED)
    # The estimator returns one value per column of X; X has exactly one
    # column, so unwrap it to a plain float to keep 'Score' numeric/sortable.
    score = float(mutual_info[0])
  else:
    score = NO_DATA_SCORE

  # Add the number of rows used, the feature name and its score to the list
  results.append([len(dfTemp), feature, score])

# Convert the list to a pandas DataFrame
dfResults = pd.DataFrame(results, columns=['Instance','Feature', 'Score'])


In [None]:
# Display the information gain table ordered by score, highest first.
# The sorted copy is only shown as the cell output; dfResults itself is not modified.
dfResults.sort_values(by=['Score'], ascending=[False])