In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')


data = pd.read_csv('LondonGCSEData1.csv') # import csv data here
print(data.head())

#get rid of first column
drop_nan = data.drop('URN', axis=1)
data = drop_nan


#Binarize dependent variable
meangrade = data['ATT8SCR'].mean()
sdgrade = data['ATT8SCR'].std()


data['GCSEgrade'] = np.where(data['ATT8SCR'] >= meangrade , '1', '0')
data.head()

      URN  SCHOOLTYPE  RELDENOM  ADMPOL_PT  EGENDER  TOTPUPS  TOTAL EXP  \
0  100049           4         1          2        3      950    9569373   
1  100050           4         1          2        2     1170    8525007   
2  100054           4         1          2        2     1025    7152177   
3  100056           4         1          2        1      836    6431289   
4  100051           4         1          2        3     1003    8306574   

         EXPPUP  PSEN_ALL  PTFSM6CLA1A  ...  PTEBACMAT_E_PTQ_EE  \
0  10073.024210      0.12         0.59  ...                0.98   
1   7286.330769      0.10         0.54  ...                0.99   
2   6977.733659      0.24         0.36  ...                1.00   
3   7692.929426      0.08         0.52  ...                0.98   
4   8281.728814      0.12         0.73  ...                1.00   

   PTEBAC2SCI_E_PTQ_EE  PTEBACHUM_E_PTQ_EE  PTEBACLAN_E_PTQ_EE  PTEALGRP2  \
0                 0.92                0.70                0.32       

Unnamed: 0,SCHOOLTYPE,RELDENOM,ADMPOL_PT,EGENDER,TOTPUPS,TOTAL EXP,EXPPUP,PSEN_ALL,PTFSM6CLA1A,PTEBACENG_E_PTQ_EE,...,PTEBAC2SCI_E_PTQ_EE,PTEBACHUM_E_PTQ_EE,PTEBACLAN_E_PTQ_EE,PTEALGRP2,KS2APS,ATT8SCR_17,P8MEA_17,P8MEA,ATT8SCR,GCSEgrade
0,4,1,2,3,950,9569373,10073.02421,0.12,0.59,0.94,...,0.92,0.7,0.32,0.71,27.6,38.4,-0.65,-0.11,42.4,0
1,4,1,2,2,1170,8525007,7286.330769,0.1,0.54,0.98,...,0.98,0.89,0.63,0.42,29.7,55.2,0.42,0.77,58.7,1
2,4,1,2,2,1025,7152177,6977.733659,0.24,0.36,0.99,...,0.99,0.65,0.87,0.55,30.8,63.5,0.65,0.73,63.4,1
3,4,1,2,1,836,6431289,7692.929426,0.08,0.52,0.99,...,0.99,0.98,0.87,0.59,29.7,47.6,-0.25,-0.44,46.3,0
4,4,1,2,3,1003,8306574,8281.728814,0.12,0.73,0.99,...,1.0,0.9,0.61,0.74,28.0,39.4,-0.16,-0.19,42.2,0


In [81]:
#Divide data set in train and validation
train, other = train_test_split(data, test_size=0.2, random_state=0)
validation, test = train_test_split(other, test_size=0.5, random_state=0)

print('The sizes for train, test, and validation is {}'.format((len(train), len(test), len(validation))))
data.head()


x_train = train.drop(columns=['GCSEgrade'])
y_train = train['GCSEgrade']

x_val = validation.drop(columns=['GCSEgrade'])
y_val = validation['GCSEgrade']

x_test = test.drop(columns=['GCSEgrade'])
y_test = test['GCSEgrade']


The sizes for train, test, and validation is (268, 34, 34)


In [82]:
#Get mean and stds for train set
x_means = x_train.mean(axis=0)
x_stds = x_train.std(axis=0)

# Standardise the splits.
x_train = (x_train - x_means) / x_stds
x_val = (x_val - x_means) / x_stds

print(x_train.head())


     SCHOOLTYPE  RELDENOM  ADMPOL_PT   EGENDER   TOTPUPS  TOTAL EXP    EXPPUP  \
17          NaN  0.249032  -0.055234  0.487758 -0.620295   0.066767  1.420624   
219         NaN -0.704404  -0.055234  0.487758 -0.306471  -0.787168 -1.066182   
92          NaN -0.704404  -0.055234  0.487758  0.459712   0.625267  0.193276   
278         NaN  1.202468  -0.055234  0.487758  0.408822  -0.261381 -1.121788   
191         NaN  1.202468  -0.055234  0.487758  0.335313   0.322518 -0.089939   

     PSEN_ALL  PTFSM6CLA1A  PTEBACENG_E_PTQ_EE  PTEBACMAT_E_PTQ_EE  \
17   1.664465     2.705018            0.859243            0.813731   
219 -0.647562     0.126520            0.135101           -0.536609   
92   1.230960    -0.066868           -0.226970            0.813731   
278 -0.647562    -1.807354            0.859243            0.813731   
191  0.652953    -0.002405           -1.313182           -1.211779   

     PTEBAC2SCI_E_PTQ_EE  PTEBACHUM_E_PTQ_EE  PTEBACLAN_E_PTQ_EE  PTEALGRP2  \
17           

In [83]:
#number of samples with GCSEgrade equal to 1
high_grade = sum(train['GCSEgrade'] == 1)
# total number of samples in the training set
total = len(train)

print(train['GCSEgrade'].head())
print(high_grade)
print(total)

17     0
219    0
92     0
278    1
191    0
Name: GCSEgrade, dtype: object
0
268


In [84]:
#Compute the Gini Impurity of decision node GCSEgrade
p = high_grade/total

gini = 2 * p * (1 - p)

print('Gini Impurity for the decision node defined by `GCSEgrade`: {}'.format(gini))

Gini Impurity for the decision node defined by `GCSEgrade`: 0.0
