In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime 
%matplotlib inline
from sklearn import preprocessing
import warnings
warnings.filterwarnings(action = 'ignore')

### Dataset
This project analyses students' achievements in secondary education at two Portuguese schools. The attributes of the dataset include student grades and demographic, social and school-related features, and the data was collected using school reports and questionnaires. Two datasets are provided regarding the performance in two distinct subjects: Mathematics (mat) and Portuguese language (por).

I classified the students into three categories, "excellent", "average", and "poor", according to their final exam performance. Then I analyzed a few features that have a significant influence on students' final performance, including using the internet as a learning resource, romantic status, alcohol consumption, parents' education level, etc. Finally, using the available predictive features, I tried various machine learning models to predict students' final performance classification and compared the models' performance based on the ROC index.

Dataset available at: http://archive.ics.uci.edu/ml/datasets/Student+Performance#

In [2]:
# Load the two subject files; both are semicolon-delimited.
math_csv, portuguese_csv = '../data/student-mat.csv', '../data/student-por.csv'
train_data = pd.read_csv(math_csv, delimiter=";")
train_data_1 = pd.read_csv(portuguese_csv, delimiter=";")

In [3]:
train_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
train_data_1.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [5]:
train_data.shape


(395, 33)

In [6]:
train_data_1.shape

(649, 33)

In [7]:
# Report how many student records each subject file contains.
row_count_captions = {
    "The Number of Mathematics Student Data:": train_data,
    "The Number of Portuguese Student Data:": train_data_1,
}
for caption, frame in row_count_captions.items():
    print(caption, frame.shape[0])


The Number of Mathematics Student Data: 395
The Number of Portuguese Student Data: 649


In [8]:
# merge datasets
# Stack the Mathematics and Portuguese records into one frame.
# ignore_index=True gives every row a unique 0..n-1 label; without it both
# source frames keep their own 0-based labels, so the merged index contains
# duplicates and label-based lookups become ambiguous.
student_df = pd.concat([train_data, train_data_1], ignore_index=True)

In [9]:
student_df.shape

(1044, 33)

In [10]:
student_df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [11]:
# rename column labels
# Rename by name rather than by assigning a positional list, so a change in
# the source column order cannot silently attach a label to the wrong column.
# Columns that are not listed here keep their original name.
COLUMN_RENAMES = {
    'famsize': 'family_size',
    'Pstatus': 'parents_cohabitation_status',
    'Medu': 'mother_education',
    'Fedu': 'father_education',
    'Mjob': 'mother_job',
    'Fjob': 'father_job',
    'traveltime': 'commute_time',
    'studytime': 'study_time',
    'schoolsup': 'school_support',
    'famsup': 'family_support',
    'paid': 'paid_classes',
    'higher': 'desire_higher_edu',
    'famrel': 'family_quality',
    'freetime': 'free_time',
    'goout': 'go_out_with_friends',
    'Dalc': 'weekday_alcohol_usage',
    'Walc': 'weekend_alcohol_usage',
    'G1': 'period_1_score',
    'G2': 'period_2_score',
    'G3': 'final_score',
}
student_df = student_df.rename(columns=COLUMN_RENAMES)

In [12]:
#to look at the numerical fields and their describing mathematical values.
student_df.describe() 

Unnamed: 0,age,mother_education,father_education,commute_time,study_time,failures,family_quality,free_time,go_out_with_friends,weekday_alcohol_usage,weekend_alcohol_usage,health,absences,period_1_score,period_2_score,final_score
count,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0
mean,16.726054,2.603448,2.387931,1.522989,1.970307,0.264368,3.935824,3.201149,3.15613,1.494253,2.284483,3.543103,4.434866,11.213602,11.246169,11.341954
std,1.239975,1.124907,1.099938,0.731727,0.834353,0.656142,0.933401,1.031507,1.152575,0.911714,1.285105,1.424703,6.210017,2.983394,3.285071,3.864796
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,9.0,9.0,10.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


#### From the info available on the data, we can tell that the quality of the data is quite decent, as there aren't any columns with null values and every cell holds a single piece of data. This will significantly simplify the processing stage, as we will not be required to compensate for null values or split dynamic data. However, there are many categorical fields in the data set, and those require some additional processing to generate better results.

In [13]:
# look for missing values
student_df.isnull().sum()

school                         0
sex                            0
age                            0
address                        0
family_size                    0
parents_cohabitation_status    0
mother_education               0
father_education               0
mother_job                     0
father_job                     0
reason                         0
guardian                       0
commute_time                   0
study_time                     0
failures                       0
school_support                 0
family_support                 0
paid_classes                   0
activities                     0
nursery                        0
desire_higher_edu              0
internet                       0
romantic                       0
family_quality                 0
free_time                      0
go_out_with_friends            0
weekday_alcohol_usage          0
weekend_alcohol_usage          0
health                         0
absences                       0
period_1_s

In [14]:
# look for the sum of missing values
student_df.isnull().sum().sum()

0

In [15]:
##checking for duplicates
student_df.duplicated().sum()

0

In [16]:
student_df.columns

Index(['school', 'sex', 'age', 'address', 'family_size',
       'parents_cohabitation_status', 'mother_education', 'father_education',
       'mother_job', 'father_job', 'reason', 'guardian', 'commute_time',
       'study_time', 'failures', 'school_support', 'family_support',
       'paid_classes', 'activities', 'nursery', 'desire_higher_edu',
       'internet', 'romantic', 'family_quality', 'free_time',
       'go_out_with_friends', 'weekday_alcohol_usage', 'weekend_alcohol_usage',
       'health', 'absences', 'period_1_score', 'period_2_score',
       'final_score'],
      dtype='object')

#### Transforming Values and Types
##### Some columns have numbers that represent categorical values. I'm going to change the name in some of these columns to make clearer what they mean.

##### I'm also going to change the type of the columns labeled numeric when they actually are categories

In [17]:
# Swap the integer codes 1-4 for readable duration labels.
duration_labels = {
    'commute_time': {1: '<15m', 2: '15-30m', 3: '30-1h', 4: '>1h'},
    'study_time': {1: '<2h', 2: '2-5h', 3: '5-10h', 4: '>10h'},
}
for column_name, code_to_label in duration_labels.items():
    student_df[column_name] = student_df[column_name].map(code_to_label)

In [18]:
# Store the numerically coded rating columns as 'object' so they are handled
# as categories rather than as quantities.
rating_columns = [
    'mother_education',
    'father_education',
    'family_quality',
    'go_out_with_friends',
    'weekday_alcohol_usage',
    'weekend_alcohol_usage',
    'health',
]
student_df[rating_columns] = student_df[rating_columns].astype('object')

In [19]:
# Work on a copy so that integer-encoding `data` leaves student_df unchanged.
data = student_df.copy()

In [20]:
# data preprocessing
# Integer-encode every string-valued categorical column of `data`.
# One mapping table replaces the per-column copy-pasted assignments; the table
# order matches the order in which the summary lines are printed.
YES_NO_CODES = {'no': 0, 'yes': 1}
JOB_CODES = {'at_home': 0, 'teacher': 1, 'health': 2, 'services': 3, 'other': 4}
LABEL_CODES = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'F': 0, 'M': 1},
    'address': {'U': 0, 'R': 1},
    'family_size': {'LE3': 0, 'GT3': 1},
    'parents_cohabitation_status': {'A': 0, 'T': 1},
    'mother_job': JOB_CODES,
    'father_job': JOB_CODES,
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'school_support': YES_NO_CODES,
    'family_support': YES_NO_CODES,
    'paid_classes': YES_NO_CODES,
    'activities': YES_NO_CODES,
    'nursery': YES_NO_CODES,
    'desire_higher_edu': YES_NO_CODES,
    'internet': YES_NO_CODES,
    'romantic': YES_NO_CODES,
}

for column_name, codes in LABEL_CODES.items():
    for label, code in codes.items():
        # Masked assignment, one label at a time; codes are ints and labels are
        # strings, so an already-written code can never match a later label.
        data.loc[data[column_name] == label, column_name] = code
    # Show the original labels next to their codes so the mapping can be checked.
    print(column_name + ': ', student_df[column_name].unique(), ' -> ', data[column_name].unique())

school:  ['GP' 'MS']  ->  [0 1]
sex:  ['F' 'M']  ->  [0 1]
address:  ['U' 'R']  ->  [0 1]
family_size:  ['GT3' 'LE3']  ->  [1 0]
parents_cohabitation_status:  ['A' 'T']  ->  [0 1]
mother_job:  ['at_home' 'health' 'other' 'services' 'teacher']  ->  [0 2 4 3 1]
father_job:  ['teacher' 'other' 'services' 'health' 'at_home']  ->  [1 4 3 2 0]
reason:  ['course' 'other' 'home' 'reputation']  ->  [2 3 0 1]
guardian:  ['mother' 'father' 'other']  ->  [0 1 2]
school_support:  ['yes' 'no']  ->  [1 0]
family_support:  ['no' 'yes']  ->  [0 1]
paid_classes:  ['no' 'yes']  ->  [0 1]
activities:  ['no' 'yes']  ->  [0 1]
nursery:  ['yes' 'no']  ->  [1 0]
desire_higher_edu:  ['yes' 'no']  ->  [1 0]
internet:  ['no' 'yes']  ->  [0 1]
romantic:  ['no' 'yes']  ->  [0 1]


In [21]:
student_df.sample(5)

Unnamed: 0,school,sex,age,address,family_size,parents_cohabitation_status,mother_education,father_education,mother_job,father_job,...,family_quality,free_time,go_out_with_friends,weekday_alcohol_usage,weekend_alcohol_usage,health,absences,period_1_score,period_2_score,final_score
98,GP,F,16,U,GT3,T,4,4,other,other,...,5,3,4,1,2,1,4,12,13,13
109,GP,F,16,U,LE3,T,4,4,health,health,...,5,4,5,1,1,4,4,14,15,16
368,GP,M,18,U,LE3,T,4,4,other,other,...,4,2,5,3,4,5,2,8,9,11
420,GP,F,18,U,LE3,A,2,2,services,other,...,4,1,4,1,3,4,10,14,17,17
107,GP,M,16,U,GT3,T,3,3,services,other,...,5,3,3,1,1,5,4,13,14,14


#### Categorical Encoding
##### Linear Regression requires that the attribute values be numerical. Therefore, columns with categorical data need to be encoded to a suitable numeric format. Attributes with 2 categories are encoded using binary encoding, which converts the values to either 1 or 0. Attributes with more than 2 categories are encoded using one-hot encoding.

In [None]:
# Two-level categorical columns that are integer-coded with binary_encoder.
binary = ["sex", "family_size", "parents_cohabitation_status", "school_support", "family_support", "paid_classes", "activities", "internet", "romantic"]

In [None]:
# Multi-level categorical columns that are one-hot encoded with pd.get_dummies.
multiple = ["mother_education", "father_education", "father_job", "mother_job", "reason"]

In [None]:
def binary_encoder(dataset, col):
    """Replace a categorical column with its integer category codes, in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that owns the column; it is modified directly.
    col : str
        Name of the column to encode.

    Returns
    -------
    None
        The encoded values are written back to ``dataset[col]``.
    """
    as_category = dataset[col].astype('category')
    # .cat.codes numbers the categories in their sorted order, starting at 0.
    dataset[col] = as_category.cat.codes.astype('int')

In [None]:
# One-hot encode the multi-level columns; each indicator column is named
# "<source column>_<level>", which is the default prefix.
df = pd.get_dummies(data=student_df, columns=multiple)

In [None]:
# Integer-code each column listed in `binary`; binary_encoder edits df in place.
for col in binary:
    binary_encoder(df, col)

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
# Bucket final_score into a categorical grade label.
# Excellent: 15~20, Average: 10~14, Poor: 0~9 (bounds inclusive).
grade_bands = (('excellent', 15, 20), ('average', 10, 14), ('poor', 0, 9))
# Rows whose score falls in none of the bands keep the 'na' placeholder.
df['final_grade'] = 'na'
for grade_label, lowest, highest in grade_bands:
    in_band = df.final_score.between(lowest, highest)
    df.loc[in_band, 'final_grade'] = grade_label
df.head(5)

In [None]:
df.head()

#### Next up, we will take the fields (columns) one by one to analyze their importance and effect on the final score value:

In [None]:
# Plot the distribution of the numeric final score.
# `final_grade` holds the string labels ('poor' / 'average' / 'excellent'),
# which a density plot cannot handle, so the numeric `final_score` is plotted.
sns.distplot(df['final_score'])

In [None]:
# Count urban ('U') and rural ('R') students.
# The column is passed by keyword because recent seaborn releases read the
# first positional argument as `data`, not as `x`.
b = sns.countplot(x='address', data=df)
b.set_title('Number of urban and rural students', fontsize = 30)
b.set_xlabel('Address', fontsize = 20)
b.set_ylabel('Count', fontsize = 20)
plt.show()

In [None]:
# Final-score density for urban vs rural students.
# The score column is `final_score` here (the original G3 label was renamed
# when the frames were merged), and `address` still holds 'U' / 'R' in df.
sns.kdeplot(df.loc[df['address'] == 'U', 'final_score'], label='Urban', shade = True)
sns.kdeplot(df.loc[df['address'] == 'R', 'final_score'], label='Rural', shade = True)
plt.title('Urban students performance vs rural students performance', fontsize = 20)
plt.xlabel('Grade', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
# Draw the legend explicitly so the Urban / Rural labels are always shown.
plt.legend()
plt.show()

#### Not much of a difference between urban and rural students in this particular area

In [None]:
# Distribution of how often students go out with friends.
# The column is called `go_out_with_friends` in df (renamed from `goout`) and
# is passed by keyword because recent seaborn releases read the first
# positional argument as `data`, not as `x`.
b = sns.countplot(x='go_out_with_friends', data=df)
b.set_title('Frequency of students going out with friends', fontsize = 30)
b.set_xlabel('Go out', fontsize = 20)
b.set_ylabel('Count', fontsize = 20)
plt.show()

#### We can see that most students report an average frequency of going out with their friends

#### From this we can tell that the distribution of the grades is decent and doesn't require any further skewness correction yet. We can go with this distribution for now to analyze the data and create a primitive model and measure its error rate first. We can look into data processing of the final grade field afterwards if the results aren't satisfactory.

In [None]:
# Correlation only works on numerical variables, and df still contains string
# columns (e.g. school, address, final_grade). Select the numeric and boolean
# columns explicitly: pandas 2.x raises on non-numeric columns instead of
# silently dropping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr)

In [None]:
print (corr['final_score'].sort_values(ascending = False), '\n')

#### From the correlation graph above, we can look at the numerical fields to know the values that affect the end result the most. Obviously period_1_score and period_2_score are the most correlated fields to final score as they are part of the calculation formula for final score so they will have the greatest effect on our prediction. Another thing we can see is the negative correlation between failures and the final score result. This also makes quite a lot of sense, as more failures tend to negatively affect your end score. Absences and free time seem not to be very relevant in the dataset that we are analyzing, which can be a flag that may help us further understand the data in the future.

#### Now that we have analyzed the numerical data slightly and figured out the most correlated fields, we now have to take a look at the categorical data to figure out how useful the fields may be and how to introduce them into the prediction model. The simplest way to analyze those fields is to compare the means across the categories.

In [None]:
# Categorical columns to group by, using the renamed column labels
# (the original dataset labels such as 'famsize' or 'G3' no longer exist).
groupColumns = ['school', 'sex', 'address', 'family_size', 'parents_cohabitation_status',
                'mother_job', 'father_job', 'reason', 'guardian', 'school_support',
                'family_support', 'paid_classes', 'activities', 'nursery',
                'desire_higher_edu', 'internet', 'romantic']

# Score columns whose means are compared across the groups.
avgColumns = ['final_score', 'period_2_score', 'period_1_score']

In [None]:
# Mean of the avgColumns per school. `data` holds the integer-encoded school
# labels, so the index reads 0 = GP and 1 = MS.
school = data.groupby(groupColumns[0])[avgColumns].mean()
school.head()

#### From this, we see that Gabriel Pereira students generally do better than Mousinho da Silveira students. The same analysis can be done for a few more fields:

In [None]:
# Mean of the avgColumns per sex. `data` holds the integer-encoded labels, so
# the index reads 0 = F and 1 = M.
sex = data.groupby(groupColumns[1])[avgColumns].mean()
sex.head()

#### Correlation between Attributes
##### The heatmap shows the correlation between different attributes. We can use it to find which attributes are highly correlated with the target label and select them, whereas we can also drop the features that are highly correlated with other features.

In [None]:
# Large, labelled correlation heatmap, also saved to heatmap.png.
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(15,12))

# df still contains string columns, so correlate the numeric and boolean
# columns only; pandas 2.x raises on non-numeric columns instead of dropping them.
feature_corr = df.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(data=feature_corr, ax=ax, cmap="Blues")
ax.set_xlabel('Features',fontdict={"fontsize":16})
ax.set_ylabel('Features',fontdict={"fontsize":16})
ax.set_title('Correlation between different Features', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("heatmap.png", bbox_inches="tight")
plt.show()

##### From the above heatmap, columns G1, G2 and G3 are highly correlated to each other. The below plots show this correlation.

In [None]:
# Pairwise scatter plots of the two period scores and the final score
# (G1, G2 and G3 in the original dataset), also saved to pairplot.png.
# The frame is `df` and the columns use their renamed labels; no `palette` is
# passed because there is no `hue` variable for it to colour.
score_columns = ["period_1_score", "period_2_score", "final_score"]
pairplot = sns.pairplot(df[score_columns])

plt.savefig("pairplot.png", bbox_inches="tight")
plt.show()

In [None]:
# Final score against each period score, side by side; saved to lineplot.png.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,8))

# (column in df, short label used on the axis and in the title) for each panel.
# The data comes from `df` and its renamed score columns; no `palette` is
# passed because there is no `hue` variable for it to colour.
panels = [("period_1_score", "G1"), ("period_2_score", "G2")]
for axis, (score_column, short_label) in zip(ax, panels):
    sns.lineplot(x=score_column, y="final_score", data=df, ax=axis)
    axis.set_xlabel(short_label, fontdict={"fontsize":16})
    axis.set_ylabel('G3', fontdict={"fontsize":16})
    axis.set_title('G3 vs ' + short_label, loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("lineplot.png", bbox_inches="tight")
plt.show()

### Analysing the final grade

In [None]:
# Summary statistics of the final score.
# Uses `df` and the renamed `final_score` column; no `df_train` frame or 'G3'
# column exists in this notebook.
df['final_score'].describe()

In [None]:
# Distribution of the final score of students.
# Uses `df` and the renamed `final_score` column, passed by keyword because
# recent seaborn releases read the first positional argument as `data`.
demo = sns.countplot(x='final_score', data=df)
demo.set_title('Distribution of Final grade of students', fontsize = 35)
demo.set_xlabel('Final Grade', fontsize = 20)
demo.set_ylabel('Count', fontsize = 20)
plt.show()

### Apart from the high number of students scoring 0, the distribution is normal as expected. Maybe the value 0 is used in place of null, or maybe the students who did not appear for the exam, or were not allowed to sit for the exam for some reason, are marked as 0. We cannot be sure.

In [None]:
# Serialized representation: persist the fitted model to disk with pickle.
import pickle
model = 'LinearRegressionModel.sav'
# NOTE(review): `regressor` is not defined in any cell visible in this
# notebook; confirm that the model is trained before this cell runs.
# The `with` block closes the file handle even if pickling fails.
with open(model, 'wb') as model_file:
    pickle.dump(regressor, model_file)