In [1]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing

In [2]:
# Load the exam results; the relative path is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='exams.csv')

In [3]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group D,high school,free/reduced,none,55,61,62
1,male,group D,some high school,standard,none,60,56,54
2,male,group C,some high school,standard,none,84,77,70
3,female,group C,associate's degree,free/reduced,none,49,62,59
4,female,group E,associate's degree,standard,none,100,100,100


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,67.108,70.065,68.882
std,15.057459,14.578179,15.237856
min,19.0,27.0,26.0
25%,57.0,60.0,58.0
50%,67.0,70.0,69.0
75%,78.0,81.0,80.0
max,100.0,100.0,100.0


In [5]:
# Exploring the data set: compare each group's mean scores with the overall
# means to see whether a column affects the target variables to a noticeable degree.
# numeric_only=True restricts the aggregation to the numeric columns; without it
# pandas >= 2.0 raises a TypeError when it meets the remaining text columns.

data.groupby(['gender']).mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,64.782,73.794,73.53
male,69.434,66.336,64.234


In [6]:
# Mean of the numeric columns per race/ethnicity group. numeric_only=True keeps
# the text columns out of the aggregation (required on pandas >= 2.0).
data.groupby(['race/ethnicity']).mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
race/ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
group A,66.506494,70.194805,68.480519
group B,62.964286,66.331633,64.693878
group C,64.583333,69.07716,67.25
group D,68.289683,71.265873,71.857143
group E,76.238411,74.960265,73.059603


In [7]:
# Mean of the numeric columns per parental education level. numeric_only=True keeps
# the text columns out of the aggregation (required on pandas >= 2.0).
data.groupby(['parental level of education']).mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
parental level of education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
associate's degree,68.103261,72.217391,71.059783
bachelor's degree,71.327586,74.465517,75.034483
high school,64.93956,67.428571,65.549451
master's degree,72.648649,77.283784,76.864865
some college,67.109705,69.544304,68.64135
some high school,63.782609,66.019324,63.850242


In [8]:
# Mean of the numeric columns per lunch type. numeric_only=True keeps
# the text columns out of the aggregation (required on pandas >= 2.0).
data.groupby(['lunch']).mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,58.804598,64.195402,62.028736
standard,71.539877,73.197853,72.539877


In [9]:
# Mean of the numeric columns per test-preparation status. numeric_only=True keeps
# the text columns out of the aggregation (required on pandas >= 2.0).
data.groupby(['test preparation course']).mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,70.621701,74.55132,75.454545
none,65.289833,67.743551,65.481032


In [10]:
# calculating mutual information for target variables 

def calc_MI(X,Y,bins):
    """Estimate the mutual information between X and Y from histograms.

    Both variables are binned, the entropies of the two marginal histograms
    and of the joint histogram are computed with shan_entropy, and the
    result is H(X) + H(Y) - H(X, Y).

    Parameters
    ----------
    X : array-like
        Samples of the first variable: either a flat vector or a 2-D array,
        in which case only its first column is used.
    Y : array-like
        Samples of the second variable; flattened to a vector.
    bins : int or sequence
        Bin specification forwarded to np.histogram2d and np.histogram.

    Returns
    -------
    The estimated mutual information, in the units produced by shan_entropy.
    """
    # Normalise X once so the joint and the marginal histogram are built from
    # the very same samples, and so a flat vector is accepted as well as a
    # column matrix (indexing X[:, 0] unconditionally fails on 1-D input).
    x = np.asarray(X)
    if x.ndim > 1:
        x = x[:, 0]
    y = np.asarray(Y).ravel()

    c_XY = np.histogram2d(x, y, bins)[0]
    c_X = np.histogram(x, bins)[0]
    c_Y = np.histogram(y, bins)[0]

    H_X = shan_entropy(c_X)
    H_Y = shan_entropy(c_Y)
    H_XY = shan_entropy(c_XY)

    return H_X + H_Y - H_XY

def shan_entropy(c):
    """Return the Shannon entropy (base 2) of an array of counts.

    The counts are divided by their total to obtain probabilities, zero
    entries are discarded so log2 is never evaluated at 0, and the terms
    p * log2(p) are added up one after another before the sign is flipped.
    """
    probs = c / float(np.sum(c))
    probs = probs[probs != 0]
    acc = 0
    for term in probs * np.log2(probs):
        acc = acc + term
    return -acc

# Bin specification passed to calc_MI in the cells below; an integer makes
# np.histogram / np.histogram2d use that many equal-width bins per variable.
bins = 10

In [11]:
# x = gender, y = math score

x = data['gender']
y = data['math score']

# Encode the category labels as integers. fit_transform both fits and encodes,
# so no separate fit() call is needed.
le = preprocessing.LabelEncoder()
x_1 = le.fit_transform(x).reshape(-1, 1)  # single column, as read by calc_MI

matMI = calc_MI(x_1, y, bins)
matMI


0.0288763725751493

In [12]:
# Mutual information: x = gender (encoded as x_1), y = reading score
y = data.loc[:, 'reading score']
matMI = calc_MI(X=x_1, Y=y, bins=bins)
matMI

0.058822530443375154

In [13]:
# Mutual information: x = gender (encoded as x_1), y = writing score
y = data.loc[:, 'writing score']
matMI = calc_MI(X=x_1, Y=y, bins=bins)
matMI

0.08240336046011043

In [14]:
# x = race/ethnicity, y = math score

x = data['race/ethnicity']
y = data['math score']

# Encode the category labels as integers. fit_transform both fits and encodes,
# so no separate fit() call is needed.
le = preprocessing.LabelEncoder()
x_2 = le.fit_transform(x).reshape(-1, 1)  # single column, as read by calc_MI

matMI = calc_MI(x_2, y, bins)
matMI

0.08370058914989809

In [15]:
# Mutual information: x = race/ethnicity (encoded as x_2), y = reading score
y = data.loc[:, 'reading score']
matMI = calc_MI(X=x_2, Y=y, bins=bins)
matMI

0.04641904437207511

In [16]:
# Mutual information: x = race/ethnicity (encoded as x_2), y = writing score
y = data.loc[:, 'writing score']
matMI = calc_MI(X=x_2, Y=y, bins=bins)
matMI

0.05863806767121815

In [18]:
# x = parental level of education, y = math score

x = data['parental level of education']
y = data['math score']

# Encode the category labels as integers. fit_transform both fits and encodes,
# so no separate fit() call is needed.
le = preprocessing.LabelEncoder()
x_3 = le.fit_transform(x).reshape(-1, 1)  # single column, as read by calc_MI

matMI = calc_MI(x_3, y, bins)
matMI

0.04854356733786869

In [19]:
# Mutual information: x = parental level of education (encoded as x_3), y = reading score
y = data.loc[:, 'reading score']
matMI = calc_MI(X=x_3, Y=y, bins=bins)
matMI

0.060942298108336956

In [20]:
# Mutual information: x = parental level of education (encoded as x_3), y = writing score
y = data.loc[:, 'writing score']
matMI = calc_MI(X=x_3, Y=y, bins=bins)
matMI

0.08030174412931412

In [21]:
# x = lunch, y = math score

x = data['lunch']
y = data['math score']

# Encode the category labels as integers. fit_transform both fits and encodes,
# so no separate fit() call is needed.
le = preprocessing.LabelEncoder()
x_4 = le.fit_transform(x).reshape(-1, 1)  # single column, as read by calc_MI

matMI = calc_MI(x_4, y, bins)
matMI

0.12909788096779318

In [22]:
# Mutual information: x = lunch (encoded as x_4), y = reading score
y = data.loc[:, 'reading score']
matMI = calc_MI(X=x_4, Y=y, bins=bins)
matMI

0.07145065289743258

In [23]:
# Mutual information: x = lunch (encoded as x_4), y = writing score
y = data.loc[:, 'writing score']
matMI = calc_MI(X=x_4, Y=y, bins=bins)
matMI

0.08216904086589238

In [24]:
# x = test preparation course, y = math score

x = data['test preparation course']
y = data['math score']

# Encode the category labels as integers. fit_transform both fits and encodes,
# so no separate fit() call is needed.
le = preprocessing.LabelEncoder()
x_5 = le.fit_transform(x).reshape(-1, 1)  # single column, as read by calc_MI

matMI = calc_MI(x_5, y, bins)
matMI

0.026565821358135455

In [25]:
# Mutual information: x = test preparation course (encoded as x_5), y = reading score
y = data.loc[:, 'reading score']
matMI = calc_MI(X=x_5, Y=y, bins=bins)
matMI

0.04797702334524123

In [26]:
# Mutual information: x = test preparation course (encoded as x_5), y = writing score
y = data.loc[:, 'writing score']
matMI = calc_MI(X=x_5, Y=y, bins=bins)
matMI

0.08331076156483963