In [3]:
# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [42]:
def load_data(path='data/StudentsPerformance.csv'):
    """Read the student-performance CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to 'data/StudentsPerformance.csv',
        resolved relative to the current working directory, so existing
        zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, exactly as ``pd.read_csv`` returns them.
    """
    return pd.read_csv(path)

# Load the raw table once; the cells below read and overwrite columns of `data`.
data = load_data()
# Show the first rows as the cell output to check column names and label formats.
data.head()

# Uncomment to list the distinct labels of a categorical column:
# data["test preparation course"].unique()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Convert all features into numerical values so we can calculate R values

In [12]:
# Convert everything into numbers for calculating Pearson's R
def genderToNum(row):
    """Encode the ``gender`` field of a row as a number.

    Returns 0 when the value equals "male" and 1 for every other value.
    """
    return 0 if row["gender"] == "male" else 1

def raceToNum(row):
    """Encode the ``race/ethnicity`` field of a row as a number.

    Only the last character of the value is inspected (the group letter).
    A maps to 1, B to 0.75, C to 0.5, D to 0.25 and E to 0; any other
    trailing character yields None.
    """
    groupLetter = row["race/ethnicity"][-1]
    letterToValue = {"A": 1, "B": 0.75, "C": 0.5, "D": 0.25, "E": 0}
    return letterToValue.get(groupLetter)

def eduToNum(row):
    """Encode the ``parental level of education`` field of a row as a number.

    The six known labels are mapped onto evenly spaced values from 0
    ("some high school") to 1 ("master's degree"); an unrecognised label
    yields None.
    """
    levelToValue = {
        "some high school": 0,
        "high school": 0.2,
        "some college": 0.4,
        "associate's degree": 0.6,
        "bachelor's degree": 0.8,
        "master's degree": 1,
    }
    return levelToValue.get(row["parental level of education"])

def lunchToNum(row):
    """Encode the ``lunch`` field of a row as a number.

    Returns 0 when the value equals "standard" and 1 for every other value.
    """
    return 1 if row["lunch"] != "standard" else 0

def prepToNum(row):
    """Encode the ``test preparation course`` field of a row as a number.

    Returns 0 when the value equals "none" and 1 for every other value.
    """
    courseStatus = row["test preparation course"]
    if courseStatus != "none":
        return 1
    return 0
        
# def binIt(row):
#     print(row["math score"])
#     if (row["math score"] >= 70):
#         return "A"
#     elif (row["math score"] >= 60):
#         return "B"
#     elif (row["math score"] >= 50):
#         return "C"
#     elif (row["math score"] >= 40):
#         return "D"
#     else:
#         return "F"

# Replace each categorical column of `data` with its numeric encoding.
# The converters expect the raw string labels, so a column that is already
# numeric is skipped. Without that check, re-running this cell would set every
# gender to 1 and then raise inside raceToNum, which indexes the value.
columnEncoders = {
    "gender": genderToNum,
    "race/ethnicity": raceToNum,
    "parental level of education": eduToNum,
    "lunch": lunchToNum,
    "test preparation course": prepToNum,
}
for columnName, encoder in columnEncoders.items():
    if not pd.api.types.is_numeric_dtype(data[columnName]):
        # axis=1 hands the converter one row at a time.
        data[columnName] = data.apply(encoder, axis=1)


data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,1,0.75,0.8,0,0,72,72,74
1,1,0.50,0.4,0,1,69,90,88
2,1,0.75,1.0,0,0,90,95,93
3,0,1.00,0.6,1,0,47,57,44
4,0,0.50,0.4,0,0,76,78,75
...,...,...,...,...,...,...,...,...
995,1,0.00,1.0,0,1,88,99,95
996,0,0.50,0.2,1,0,62,55,55
997,1,0.50,0.2,1,1,59,71,65
998,1,0.25,0.4,0,1,68,78,77


Compute the absolute Pearson's R of every feature against each of the math, reading, and writing scores, and rank the features from strongest to weakest correlation

In [13]:
import math
import statistics

def printDict(list):
    """Print one "name:  value" line per entry, followed by blank lines.

    Parameters
    ----------
    list : iterable
        Entries whose element 0 is the label and element 1 is the value to
        print. (The parameter name shadows the built-in ``list`` inside this
        function; it is kept so existing callers are unaffected.)
    """
    for entry in list:
        label, value = entry[0], entry[1]
        print("{}:  {}".format(label, value))
    # Trailing separator so consecutive listings are visually apart.
    print("\n")

def rankByCorrelation(frame, targetColumn):
    """Rank every other column of `frame` by |Pearson's R| against `targetColumn`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are all numeric.
    targetColumn : str
        Name of the column the others are correlated against; it is excluded
        from the result.

    Returns
    -------
    list of (str, float)
        (column name, absolute correlation) pairs, strongest first.
    """
    strengths = {
        column: abs(np.corrcoef(frame[column], frame[targetColumn])[0][1])
        for column in frame.keys()
        if column != targetColumn
    }
    return sorted(strengths.items(), key=lambda item: item[1], reverse=True)


# Get the Pearson's R value between each feature and the different scores.
# The sign is discarded: only the strength of the relationship is ranked.
mathlist = rankByCorrelation(data, "math score")
print("Features against Math scores")
printDict(mathlist)

readinglist = rankByCorrelation(data, "reading score")
print("Features against Reading scores")
printDict(readinglist)

writinglist = rankByCorrelation(data, "writing score")
print("Features against Writing scores")
printDict(writinglist)

Features against Math scores
reading score:  0.8175796636720544
writing score:  0.8026420459498085
lunch:  0.3508766455918607
race/ethnicity:  0.21641544829808895
test preparation course:  0.17770246930439465
gender:  0.1679822381003558
parental level of education:  0.15943181815735608


Features against Reading scores
writing score:  0.9545980771462478
math score:  0.8175796636720544
gender:  0.24431260787747192
test preparation course:  0.24178043354875134
lunch:  0.2295603216622811
parental level of education:  0.1909082647642037
race/ethnicity:  0.14525262214153506


Features against Writing scores
reading score:  0.9545980771462478
math score:  0.8026420459498085
test preparation course:  0.31294628448595596
gender:  0.3012249355007125
lunch:  0.2457686763842185
parental level of education:  0.23671517332205244
race/ethnicity:  0.16569051050724565




Univariate Selection for top features

In [14]:
# This was all essentially from the recommended webpage on feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Select the target and the candidate features by column name rather than by
# position, so a change in column order or count cannot silently pick the
# wrong target.
targetColumn = "math score"
numTopFeatures = 6  # how many of the best-scoring features to keep and report

x = data.drop(columns=[targetColumn])  # every column except the target is a candidate feature
y = data[targetColumn]                 # the target column is the math score

# NOTE(review): chi2 is a classification statistic, so each distinct math score
# is treated as its own class; it also requires non-negative feature values.
# Left unchanged so the ranking shown below stays comparable.
bestfeatures = SelectKBest(score_func=chi2, k=numTopFeatures)
fit = bestfeatures.fit(x, y)

# Pair each feature name with its score and show the best ones, highest first.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['features', 'score']
print(featureScores.nlargest(numTopFeatures, 'score'))

                  features        score
6            writing score  2289.670475
5            reading score  2136.299236
3                    lunch   118.122326
4  test preparation course    56.923719
0                   gender    44.949761
1           race/ethnicity    20.140823


Notably, test preparation course is considered to have a stronger relationship to math score in the Univariate Selection than in the Pearson's R method

Assuming we are focusing on the math score as the target attribute, we will create data sets containing the 6, 4, and 2 features that correlate most strongly with the math score. Each new data set is written to a separate CSV file

This uses the correlation scores from Univariate Selection

In [15]:
import csv

# Features ordered strongest-first by the Univariate Selection scores above.
# Every subset is sliced from this one list so no subset can repeat a column
# or leave one out ("reading score" was previously missing from the top-4 and
# top-6 sets because "writing score" was listed twice).
rankedFeatures = [
    "writing score",
    "reading score",
    "lunch",
    "test preparation course",
    "gender",
    "race/ethnicity",
]

# Top 2
# writing, reading
topTwo = data[rankedFeatures[:2]]
topTwo.to_csv('data/topTwoStudentPerf.csv', index=False)

# Top 4
# writing, reading, lunch, test prep
topFour = data[rankedFeatures[:4]]
topFour.to_csv('data/topFourStudentPerf.csv', index=False)

# Top 6
# writing, reading, lunch, test prep, gender, ethnicity
topSix = data[rankedFeatures[:6]]
topSix.to_csv('data/topSixStudentPerf.csv', index=False)

Use a train/test split to separate the data, then fit a linear regression model on each feature set to predict the math score

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

target = data["math score"]
# Baseline with every feature. The target column must be removed first,
# otherwise the model is handed the answer and the score is trivially perfect.
allFeatures = data.drop(columns=["math score"])

diffDataSets = [topTwo, topFour, topSix, allFeatures]
dataSetNames = ["topTwo", "topFour", "topSix", "original data set"]      # This is for easier output interpretation

for dataSetName, features in zip(dataSetNames, diffDataSets):
    # A fixed random_state gives every feature set the same train/test rows,
    # so the scores are directly comparable.
    x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=0)

    lr = LinearRegression().fit(x_train, y_train)   # fit a linear regression model on the training rows
    y_predlr = lr.predict(x_test)   # predict the math score for the held-out rows

    print("Test set predictions for {}:\n{}".format(dataSetName, y_predlr))

    # For a regressor, score() is the coefficient of determination (R^2) on
    # the test rows, not a classification accuracy.
    print("Test set score: {:.2f}\n\n".format(lr.score(x_test, y_test)))

Test set predictions for topTwo:
[69.3173217  69.51001596 47.21659852 60.64406104 73.62176902 68.19293535
 67.61485258 43.49023397 80.52819454 36.35893119 48.95125065 67.13291503
 76.22374722 75.80617569 55.44010464 42.43021365 48.72637338 84.44725334
 53.51275825 74.74615537 61.89677562 56.91769649 73.2363805  57.78502256
 45.8673349  61.89677562 53.93032978 45.25706912 67.06854901 84.02968181
 80.75307181 80.62474357 87.08141455 59.74455196 55.6327989  76.06323597
 93.15269701 55.60061589 73.8788293  69.99195351 62.37871317 54.12302404
 68.3534466  74.29640083 78.40815389 58.26696011 84.67213061 60.64406104
 79.56472325 60.8367553  38.06140031 59.74455196 50.01127098 54.76547283
 41.9482761  70.85927958 53.51275825 60.61187803 71.69442263 62.34653016
 65.17338563 75.16372689 53.89814677 71.40517934 64.53093683 65.36607989
 68.61050688 53.0308207  67.13291503 50.87859704 68.44999563 79.17893092
 69.51001596 66.68316049 54.09084103 62.31434715 68.25730138 67.13291503
 31.79742359 74.93