In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime 
%matplotlib inline
from sklearn import preprocessing
import warnings
warnings.filterwarnings(action = 'ignore')

### Dataset
This project analyses students' achievement in secondary education at two Portuguese schools. The attributes of the dataset include student grades as well as demographic, social and school-related features, and the data was collected using school reports and questionnaires. Two datasets are provided regarding the performance in two distinct subjects: Mathematics (mat) and Portuguese language (por).

I classified the students into three categories, "excellent", "average", and "poor", according to their final exam performance. Then I analyzed a few features that have significant influence on students' final performance, including using the internet as a learning resource, romantic status, alcohol consumption, parents' education level, etc. Finally, using the available predictive features, I tried various machine learning models to predict students' final performance classification and compared the models' performance based on the ROC index.

Dataset available at: http://archive.ics.uci.edu/ml/datasets/Student+Performance#

In [None]:
# Load both UCI student-performance files (mathematics first, then Portuguese);
# the files are semicolon-delimited.
train_data, train_data_1 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ('../data/student-mat.csv', '../data/student-por.csv')
)

In [None]:
# Preview the first five mathematics records.
train_data.head(n=5)

In [None]:
# Preview the first five Portuguese records.
train_data_1.head(n=5)

In [None]:
# (rows, columns) of the mathematics frame as loaded from CSV.
train_data.shape


In [None]:
# (rows, columns) of the Portuguese frame as loaded from CSV.
train_data_1.shape

In [None]:
# Report how many student records each subject file contributes.
for label, frame in (
    ("The Number of Mathematics Student Data:", train_data),
    ("The Number of Portuguese Student Data:", train_data_1),
):
    print(label, frame.shape[0])


#### Creating a `subject` column to label the records from the Mathematics and Portuguese classes respectively

In [None]:
# Tag every mathematics record so its origin survives the merge below.
train_data = train_data.assign(subject='mathematics')

In [None]:
# Tag every Portuguese record so its origin survives the merge below.
train_data_1 = train_data_1.assign(subject='portuguese')

In [None]:
# Merge datasets: stack the mathematics and Portuguese records row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# both inputs contribute their own 0-based labels and every low label appears
# twice, which makes later label-based lookups ambiguous.
student_df = pd.concat([train_data, train_data_1], axis=0, ignore_index=True)

#### Saving the merged dataset

In [None]:
# Persist the merged frame next to the input files. The previous target,
# '/total_students.csv', pointed at the filesystem root, which is normally not
# writable and is not where the rest of the project data lives.
student_df.to_csv('../data/total_students.csv', index=False)

In [None]:
# (rows, columns) of the merged frame.
student_df.shape

In [None]:
# Per-column dtypes of the merged frame.
student_df.dtypes

In [None]:
# Column labels before they are renamed in the next cell.
student_df.columns

In [None]:
# Rename column labels to descriptive names.
# A name -> name mapping is used instead of assigning a positional list:
# the merged frame also carries the extra 'subject' column, so a 33-item list
# no longer matches the column count and raises a length-mismatch ValueError.
# Columns not listed here (school, sex, age, address, reason, guardian,
# failures, activities, nursery, internet, romantic, health, absences,
# subject) keep their names. Re-running this cell is a no-op.
column_labels = {
    'famsize': 'family_size',
    'Pstatus': 'parents_cohabitation_status',
    'Medu': 'mother_education',
    'Fedu': 'father_education',
    'Mjob': 'mother_job',
    'Fjob': 'father_job',
    'traveltime': 'commute_time',
    'studytime': 'study_time',
    'schoolsup': 'school_support',
    'famsup': 'family_support',
    'paid': 'paid_classes',
    'higher': 'desire_higher_edu',
    'famrel': 'family_quality',
    'freetime': 'free_time',
    'goout': 'go_out_with_friends',
    'Dalc': 'weekday_alcohol_usage',
    'Walc': 'weekend_alcohol_usage',
    'G1': 'first_period_score',
    'G2': 'second_period_score',
    'G3': 'final_score',
}
student_df = student_df.rename(columns=column_labels)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical fields.
student_df.describe() 

#### From the info available on the data, we can tell that the quality of the data is quite decent, as there aren't any columns with null values and every cell holds a single piece of data. This will significantly simplify the processing stage, as we will not need to compensate for null values or split compound fields. However, there are many categorical fields in the data set, and these require some additional processing to generate better results.

In [None]:
# Count missing values per column.
student_df.isna().sum()

In [None]:
# Total number of missing cells across the whole frame.
missing_per_column = student_df.isnull().sum()
missing_per_column.sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate_mask = student_df.duplicated()
duplicate_mask.sum()

In [None]:
# Independent working copy for numeric encoding; student_df keeps the raw labels.
data = student_df.copy(deep=True)

In [None]:
# data preprocessing
#
# Encode every categorical column of `data` as integer codes. The codes are
# declared once in a table instead of one hand-written .loc assignment per
# category, and each encoded column is stored with an integer dtype (the
# previous in-place replacement left the columns as object dtype).
# Values are always read from the untouched `student_df`, so re-running this
# cell gives the same result.

NO_YES = {'no': 0, 'yes': 1}
JOB_CODES = {'at_home': 0, 'teacher': 1, 'health': 2, 'services': 3, 'other': 4}

CATEGORY_CODES = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'F': 0, 'M': 1},
    'address': {'U': 0, 'R': 1},
    'family_size': {'LE3': 0, 'GT3': 1},
    'parents_cohabitation_status': {'A': 0, 'T': 1},
    'mother_job': JOB_CODES,
    'father_job': JOB_CODES,
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'school_support': NO_YES,
    'family_support': NO_YES,
    'paid_classes': NO_YES,
    'activities': NO_YES,
    'nursery': NO_YES,
    'desire_higher_edu': NO_YES,
    'internet': NO_YES,
    'romantic': NO_YES,
}

for column, codes in CATEGORY_CODES.items():
    encoded = student_df[column].map(codes)
    # Fail loudly on a category the table does not know, rather than carrying
    # a stray string (or NaN) into the encoded frame.
    if encoded.isna().any():
        unknown = sorted(set(student_df.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f'Unmapped categories in {column!r}: {unknown}')
    # Assign positionally so the result does not depend on index alignment.
    data[column] = encoded.astype(int).to_numpy()
    print(column + ': ', student_df[column].unique(), ' -> ', data[column].unique())

In [None]:
# Five randomly drawn rows as a spot check (unseeded, so it differs per run).
student_df.sample(n=5)

#### Convert the first period score, second period score and final score into grades


#### Excellent:14~20 Average:7~13 Poor:0~6

In [None]:
# Band the first-period score: excellent 14-20, average 7-13, poor 0-6.
# Anything outside 0-20 keeps the 'na' placeholder.
first_scores = student_df['first_period_score']
student_df['first_period_grade'] = 'na'
for band, (low, high) in {'excellent': (14, 20), 'average': (7, 13), 'poor': (0, 6)}.items():
    student_df.loc[first_scores.between(low, high), 'first_period_grade'] = band
student_df.head(n=5)

In [None]:
# Band the second-period score: excellent 14-20, average 7-13, poor 0-6.
# Anything outside 0-20 keeps the 'na' placeholder.
second_scores = student_df['second_period_score']
student_df['second_period_grade'] = 'na'
for band, (low, high) in {'excellent': (14, 20), 'average': (7, 13), 'poor': (0, 6)}.items():
    student_df.loc[second_scores.between(low, high), 'second_period_grade'] = band
student_df.head(n=5)

In [None]:
# Turn final_score into a categorical band.
# Excellent: 14-20, Average: 7-13, Poor: 0-6; the stored labels are lowercase
# and anything outside 0-20 keeps the 'na' placeholder.
final_scores = student_df['final_score']
student_df['final_grade'] = 'na'
for band, (low, high) in {'excellent': (14, 20), 'average': (7, 13), 'poor': (0, 6)}.items():
    student_df.loc[final_scores.between(low, high), 'final_grade'] = band
student_df.head(n=5)

In [None]:
# Preview the frame with the three grade columns added.
student_df.head(n=5)

#### Next up, we will take the fields (columns) one by one to analyze their importance and effect on the final score value:

In [None]:
# Plot the distribution of the final grade bands.
# `df` was never defined in this notebook (the merged frame is `student_df`),
# and final_grade holds category labels, so a count plot is used instead of a
# density plot, which needs numeric input.
sns.countplot(x='final_grade', data=student_df, order=['poor', 'average', 'excellent'])

In [None]:
# Number of urban ('U') and rural ('R') students.
# Uses the merged `student_df` (`df` was never defined) and names the column
# by keyword, matching the other count plots in this notebook.
b = sns.countplot(x='address', data=student_df)
b.axes.set_title('Number of urban and rural students', fontsize = 30)
b.set_xlabel('Address', fontsize = 20)
b.set_ylabel('Count', fontsize = 20)
plt.show()

In [None]:
# Final-score density for urban vs rural students.
# Reads from `student_df` (`df` was never defined) and from 'final_score',
# the name the G3 column carries after the rename cell.
urban_scores = student_df.loc[student_df['address'] == 'U', 'final_score']
rural_scores = student_df.loc[student_df['address'] == 'R', 'final_score']
# `fill` is the current name of the deprecated `shade` option.
sns.kdeplot(urban_scores, label='Urban', fill=True)
sns.kdeplot(rural_scores, label='Rural', fill=True)
plt.title('Urban students performance vs rural students performance', fontsize = 20)
plt.xlabel('Grade', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.legend()  # without this the Urban/Rural labels are not guaranteed to be drawn
plt.show()

#### Not much of a difference between urban and rural students in this particular area

In [None]:
# Distribution of how often students go out with friends.
# Uses `student_df` (`df` was never defined) and 'go_out_with_friends', the
# name the 'goout' column carries after the rename cell.
b = sns.countplot(x='go_out_with_friends', data=student_df)
b.axes.set_title('Frequency of students going out with friends', fontsize = 30)
b.set_xlabel('Go out', fontsize = 20)
b.set_ylabel('Count', fontsize = 20)
plt.show()

#### We can see that most students go out with their friends an average amount

#### Comparing the students' grades by Sex

In [None]:
# Share of each sex that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
sex_index = pd.crosstab(index=student_df.final_grade, columns=student_df.sex)
# Capitalise only after ordering, purely for the tick labels.
sex_index = sex_index.apply(perc).reindex(index).rename(index=str.capitalize)
sex_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Sex', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Address

In [None]:
# Share of each address type that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
address_index = pd.crosstab(index=student_df.final_grade, columns=student_df.address)
# Capitalise only after ordering, purely for the tick labels.
address_index = address_index.apply(perc).reindex(index).rename(index=str.capitalize)
address_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Address', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Mother's Job

In [None]:
# Share of each mother's-job group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
mother_job_index = pd.crosstab(index=student_df.final_grade, columns=student_df.mother_job)
# Capitalise only after ordering, purely for the tick labels.
mother_job_index = mother_job_index.apply(perc).reindex(index).rename(index=str.capitalize)
mother_job_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title("Grade vs Mother's Job", fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Father's Job

In [None]:
# Share of each father's-job group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
father_job_index = pd.crosstab(index=student_df.final_grade, columns=student_df.father_job)
# Capitalise only after ordering, purely for the tick labels.
father_job_index = father_job_index.apply(perc).reindex(index).rename(index=str.capitalize)
father_job_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title("Grade vs Father's Job", fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Mother's Education

In [None]:
# Share of each mother's-education level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
mother_education_index = pd.crosstab(index=student_df.final_grade, columns=student_df.mother_education)
# Capitalise only after ordering, purely for the tick labels.
mother_education_index = mother_education_index.apply(perc).reindex(index).rename(index=str.capitalize)
mother_education_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title("Grade vs Mother's Education", fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Father's Education

In [None]:
# Share of each father's-education level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
father_education_index = pd.crosstab(index=student_df.final_grade, columns=student_df.father_education)
# Capitalise only after ordering, purely for the tick labels.
father_education_index = father_education_index.apply(perc).reindex(index).rename(index=str.capitalize)
father_education_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title("Grade vs Father's Education", fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Desire higher Education

In [None]:
# Share of each desire-for-higher-education group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
desire_higher_edu_index = pd.crosstab(index=student_df.final_grade, columns=student_df.desire_higher_edu)
# Capitalise only after ordering, purely for the tick labels.
desire_higher_edu_index = desire_higher_edu_index.apply(perc).reindex(index).rename(index=str.capitalize)
desire_higher_edu_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title("Grade vs Desire Higher Education", fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Study time

In [None]:
# Share of each study-time level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
study_time_index = pd.crosstab(index=student_df.final_grade, columns=student_df.study_time)
# Capitalise only after ordering, purely for the tick labels.
study_time_index = study_time_index.apply(perc).reindex(index).rename(index=str.capitalize)
study_time_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Study Time', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Internet Access

In [None]:
# Share of each internet-access group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
internet_index = pd.crosstab(index=student_df.final_grade, columns=student_df.internet)
# Capitalise only after ordering, purely for the tick labels.
internet_index = internet_index.apply(perc).reindex(index).rename(index=str.capitalize)
internet_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Internet Access', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by relationship status

In [None]:
# Share of each relationship-status group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
# relationship_index keeps the raw counts; romantic_index is the normalised, ordered table.
relationship_index = pd.crosstab(index=student_df.final_grade, columns=student_df.romantic)
# Capitalise only after ordering, purely for the tick labels.
romantic_index = relationship_index.apply(perc).reindex(index).rename(index=str.capitalize)
romantic_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Relationship Status', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Family support status

In [None]:
# Share of each family-support group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
family_support_index = pd.crosstab(index=student_df.final_grade, columns=student_df.family_support)
# Capitalise only after ordering, purely for the tick labels.
family_support_index = family_support_index.apply(perc).reindex(index).rename(index=str.capitalize)
family_support_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Family Support Status', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Parent cohabiting Status

In [None]:
# Share of each parents'-cohabitation group that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
parents_cohabitation_status_index = pd.crosstab(index=student_df.final_grade, columns=student_df.parents_cohabitation_status)
# Capitalise only after ordering, purely for the tick labels.
parents_cohabitation_status_index = parents_cohabitation_status_index.apply(perc).reindex(index).rename(index=str.capitalize)
parents_cohabitation_status_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Parents Cohabitation Status', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Age

In [None]:
# Share of each age that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
age_index = pd.crosstab(index=student_df.final_grade, columns=student_df.age)
# Capitalise only after ordering, purely for the tick labels.
age_index = age_index.apply(perc).reindex(index).rename(index=str.capitalize)
age_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Age', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Failures

In [None]:
# Share of each past-failure count that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
failures_index = pd.crosstab(index=student_df.final_grade, columns=student_df.failures)
# Capitalise only after ordering, purely for the tick labels.
failures_index = failures_index.apply(perc).reindex(index).rename(index=str.capitalize)
failures_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Failures', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Free Time

In [None]:
# Share of each free-time level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
free_time_index = pd.crosstab(index=student_df.final_grade, columns=student_df.free_time)
# Capitalise only after ordering, purely for the tick labels.
free_time_index = free_time_index.apply(perc).reindex(index).rename(index=str.capitalize)
free_time_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Free Time', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Weekday alcohol consumption

In [None]:
# Share of each weekday-alcohol level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
weekday_alcohol_usage_index = pd.crosstab(index=student_df.final_grade, columns=student_df.weekday_alcohol_usage)
# Capitalise only after ordering, purely for the tick labels.
weekday_alcohol_usage_index = weekday_alcohol_usage_index.apply(perc).reindex(index).rename(index=str.capitalize)
weekday_alcohol_usage_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Weekday Alcohol Usage', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Weekend alcohol consumption

In [None]:
# Share of each weekend-alcohol level that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
weekend_alcohol_usage_index = pd.crosstab(index=student_df.final_grade, columns=student_df.weekend_alcohol_usage)
# Capitalise only after ordering, purely for the tick labels.
weekend_alcohol_usage_index = weekend_alcohol_usage_index.apply(perc).reindex(index).rename(index=str.capitalize)
weekend_alcohol_usage_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Weekend Alcohol Usage', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by health

In [None]:
# Share of each health rating that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
health_index = pd.crosstab(index=student_df.final_grade, columns=student_df.health)
# Capitalise only after ordering, purely for the tick labels.
health_index = health_index.apply(perc).reindex(index).rename(index=str.capitalize)
health_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Health', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Comparing the students' grades by Going out with friends

In [None]:
# Share of each going-out frequency that falls into every final-grade band (each column sums to 1).
perc = (lambda col: col/col.sum())
# final_grade stores lowercase labels; reindexing with capitalised keys matches
# nothing and produces all-NaN rows, i.e. an empty chart.
index = ['poor','average','excellent']
go_out_with_friends_index = pd.crosstab(index=student_df.final_grade, columns=student_df.go_out_with_friends)
# Capitalise only after ordering, purely for the tick labels.
go_out_with_friends_index = go_out_with_friends_index.apply(perc).reindex(index).rename(index=str.capitalize)
go_out_with_friends_index.plot.bar(fontsize=16, figsize=(14,8))
plt.title('Grade vs Frequency of Going Out With Friends', fontsize=20)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xlabel('Final Grade', fontsize=16)
plt.show()

#### Gender Distribution

In [None]:
# Distinct sex labels; these drive the `order` of the count plot below.
student_df.sex.unique()

In [None]:
# Count of students per sex, male first.
f, ax = plt.subplots()
figure = sns.countplot(data=student_df, x='sex', order=['M', 'F'])
figure.grid(False)
ax = ax.set(xlabel="gender", ylabel="Count")
plt.title('Gender Distribution')

#### School Distribution

In [None]:
# Distinct school codes; these drive the `order` of the count plot below.
student_df.school.unique()

In [None]:
# Count of students per school, GP first.
f, ax = plt.subplots()
figure = sns.countplot(data=student_df, x='school', order=['GP', 'MS'])
figure.grid(False)
ax = ax.set(xlabel="school", ylabel="Count")
plt.title('School Distribution')

#### Address Distribution

In [None]:
# Distinct address codes; these drive the `order` of the count plot below.
student_df.address.unique()

In [None]:
# Count of students per address type, urban first.
f, ax = plt.subplots()
figure = sns.countplot(data=student_df, x='address', order=['U', 'R'])
figure.grid(False)
ax = ax.set(xlabel="address", ylabel="Count")
plt.title('Address Distribution')

#### Age Distribution

In [None]:
# Distinct ages; these drive the `order` of the count plot below.
student_df.age.unique()

In [None]:
# Count of students per age, youngest to oldest (15 through 22).
f, ax = plt.subplots()
figure = sns.countplot(data=student_df, x='age', order=list(range(15, 23)))
figure.grid(False)
ax = ax.set(xlabel="age", ylabel="Count")
plt.title('Age Distribution')

#### Family Distribution

In [None]:
# Distinct family-size codes; these drive the `order` of the count plot below.
student_df.family_size.unique()

In [None]:
# Count of students per family-size code, GT3 first.
f, ax = plt.subplots()
figure = sns.countplot(data=student_df, x='family_size', order=['GT3', 'LE3'])
figure.grid(False)
ax = ax.set(xlabel="family_size", ylabel="Count")
plt.title('Family Size Distribution')

#### From this we can tell that the distribution of the grades is decent and doesn't require any further skewness correction yet. We can go with this distribution for now to analyze the data and create a primitive model and measure its error rate first. We can look into further processing of the final grade field afterwards if the results aren't satisfactory.

In [None]:
# Pairwise correlation between the numerical columns.
# `df` was never defined in this notebook; the merged frame is `student_df`.
# Numeric columns are selected explicitly because the frame still holds
# string-valued categorical columns, which correlation cannot use.
corr = student_df.select_dtypes(include='number').corr()
sns.heatmap(corr)

In [None]:
# Rank every numerical field by its correlation with the final score, strongest first.
final_score_corr = corr['final_score'].sort_values(ascending=False)
print(final_score_corr, '\n')

#### From the correlation graph above, we can look at the numerical fields to see which values affect the end result the most. Obviously first_period_score and second_period_score are the fields most correlated with the final score, as they are part of the calculation formula for the final score, so they will have the greatest effect on our prediction. Another thing we can see is the negative correlation between failures and the final score. This also makes sense, as more failures tend to negatively affect your end score. Absences and free time do not seem to be very relevant in the dataset that we are analyzing, which is a flag that may help us further understand the data in the future.

#### Now that we have analyzed the numerical data slightly and figured out the most correlated fields, we have to take a look at the categorical data to figure out how useful the fields may be and how to introduce them into the prediction model. The simplest way to analyze those fields is to compare the means across the categories.

In [None]:
# Categorical columns to group by, and the score columns to average per group.
# Both lists use the descriptive names assigned in the rename cell; the raw
# dataset names (famsize, Pstatus, ..., G1/G2/G3) no longer exist in `data`
# and would raise a KeyError in the groupby cells below.
groupColumns = ['school', 'sex', 'address', 'family_size', 'parents_cohabitation_status',
                'mother_job', 'father_job', 'reason', 'guardian', 'school_support', 'family_support',
                'paid_classes', 'activities', 'nursery', 'desire_higher_edu', 'internet', 'romantic']

avgColumns = ['final_score', 'second_period_score', 'first_period_score']

In [None]:
# Mean period and final scores per school (first grouping column).
school = data.groupby(by=groupColumns[0])[avgColumns].mean()
school.head(n=5)

#### From this, we see that Gabriel Pereira students generally do better than Mousinho da Silveira students. The same analysis can be done for a few more fields:

In [None]:
# Mean period and final scores per sex (second grouping column).
sex = data.groupby(by=groupColumns[1])[avgColumns].mean()
sex.head(n=5)

#### Correlation between Attributes
##### The heatmap shows the correlation between different attributes. We can use it to find which attributes are highly correlated with the target label and select them, and we can also drop features that are highly correlated with other features

In [None]:
# Larger, labelled correlation heatmap, also written to heatmap.png.
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(15,12))

# `df` was never defined in this notebook; use the merged frame and restrict
# it to numeric columns, since correlation cannot use the string columns.
feature_corr = student_df.select_dtypes(include='number').corr()
ax = sns.heatmap(data=feature_corr, ax=ax, cmap="Blues")
ax.set_xlabel('Features',fontdict={"fontsize":16})
ax.set_ylabel('Features',fontdict={"fontsize":16})
ax.set_title('Correlation between different Features', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("heatmap.png", bbox_inches="tight")
plt.show()

##### From the above heatmap, the period scores G1, G2 and G3 (renamed above to first_period_score, second_period_score and final_score) are highly correlated with each other. The plots below show this correlation.

In [None]:
# Pairwise scatter plots of the three score columns, also written to pairplot.png.
# `dataset` was never defined in this notebook; use the merged frame and the
# names G1/G2/G3 carry after the rename cell. `palette` is dropped because it
# only applies when a `hue` variable is given.
score_columns = ["first_period_score", "second_period_score", "final_score"]
pairplot = sns.pairplot(student_df[score_columns])

plt.savefig("pairplot.png", bbox_inches="tight")
plt.show()

In [None]:
# Final score against each period score, side by side, also written to lineplot.png.
# `dataset` was never defined in this notebook; use the merged frame and the
# names G1/G2/G3 carry after the rename cell. `palette` is dropped because it
# only applies when a `hue` variable is given.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,8))

ax[0] = sns.lineplot(x="first_period_score", y="final_score", data=student_df, ax=ax[0])
ax[0].set_xlabel('G1',fontdict={"fontsize":16})
ax[0].set_ylabel('G3',fontdict={"fontsize":16})
ax[0].set_title('G3 vs G1', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

ax[1] = sns.lineplot(x="second_period_score", y="final_score", data=student_df, ax=ax[1])
ax[1].set_xlabel('G2',fontdict={"fontsize":16})
ax[1].set_ylabel('G3',fontdict={"fontsize":16})
ax[1].set_title('G3 vs G2', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("lineplot.png", bbox_inches="tight")
plt.show()

### Analysing the final grade

In [None]:
# Describe the final score.
# `df_train` was never defined in this notebook; the merged frame is
# `student_df`, where the G3 column is named 'final_score'.
student_df['final_score'].describe()

In [None]:
# Distribution of the students' final score (one bar per score value).
# `df_train` was never defined in this notebook; the merged frame is
# `student_df`, where the G3 column is named 'final_score'. The column is
# named by keyword, matching the other count plots in this notebook.
demo = sns.countplot(x='final_score', data=student_df)
demo.axes.set_title('Distribution of Final grade of students', fontsize = 35)
demo.set_xlabel('Final Grade', fontsize = 20)
demo.set_ylabel('Count', fontsize = 20)
plt.show()

### Apart from the high number of students scoring 0, the distribution is roughly normal, as expected. The value 0 may be used in place of null, or it may mark students who did not appear for the exam or were not allowed to sit it for some reason. We cannot be sure.

In [None]:
# Serialize the fitted model to disk.
import pickle
model = 'LinearRegressionModel.sav'
# NOTE(review): `regressor` is not defined in any cell visible in this
# notebook - confirm that a model-training cell runs before this one.
# The context manager closes the file even if pickling fails; the previous
# bare open() never closed its handle.
with open(model, 'wb') as model_file:
    pickle.dump(regressor, model_file)