In [73]:
# importing required libraries
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline 
# Let pandas show up to 500 rows and 500 columns before truncating output
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [74]:
# Read the merged student CSV (path is relative to the notebook's directory)
DATA_PATH = "../data/student-merged.csv"
df = pd.read_csv(DATA_PATH)

In [75]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(672, 30)

In [76]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,age,traveltime,studytime,failures,famrel,freetime,health,absences,G1,G2,G3,Pedu,alc
count,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0
mean,16.8125,1.566964,1.928571,0.33631,3.938988,3.191964,3.534226,4.875,10.715774,10.690476,10.684524,2.449405,1.938988
std,1.264189,0.745565,0.824884,0.722219,0.939685,0.961429,1.435765,6.823897,3.082742,3.538508,4.144398,1.050356,0.992144
min,15.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0
25%,16.0,1.0,1.0,0.0,4.0,2.0,2.0,0.0,8.0,9.0,9.0,2.0,1.0
50%,17.0,1.0,2.0,0.0,4.0,3.0,4.0,2.0,10.0,11.0,11.0,2.0,2.0
75%,18.0,2.0,2.0,0.0,5.0,4.0,5.0,7.25,13.0,13.0,13.0,3.0,2.0
max,22.0,4.0,4.0,3.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0,4.0,5.0


In [77]:
# Data preprocessing - convert all classified attributes to numeric
#
# Each categorical column is recoded through a lookup table in a single
# vectorized pass per column. Compared with writing cells one at a time via
# df.at inside an iterrows() loop, this is much faster and lets pandas infer a
# real integer dtype for the column instead of leaving it as `object`.
YES_NO_CODES = {'no': 0, 'yes': 1}
JOB_CODES = {'at_home': 0, 'health': 1, 'services': 2, 'teacher': 3, 'other': 4}

# NOTE(review): Mjob, Fjob, reason and guardian look like unordered categories;
# integer codes impose an arbitrary ordering on them for a linear model.
# Consider one-hot encoding -- left as-is here to keep the results unchanged.
CATEGORY_CODES = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'F': 0, 'M': 1},
    'address': {'U': 0, 'R': 1},
    'famsize': {'GT3': 0, 'LE3': 1},
    'Pstatus': {'A': 0, 'T': 1},
    'Mjob': JOB_CODES,
    'Fjob': JOB_CODES,
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup': YES_NO_CODES,
    'famsup': YES_NO_CODES,
    'paid': YES_NO_CODES,
    'activities': YES_NO_CODES,
    'nursery': YES_NO_CODES,
    'higher': YES_NO_CODES,
    'internet': YES_NO_CODES,
    'romantic': YES_NO_CODES,
}

for column, codes in CATEGORY_CODES.items():
    # Values with no entry in `codes` (including already-encoded integers when
    # this cell is re-run) pass through unchanged, so the cell stays idempotent.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,health,absences,G1,G2,G3,Pedu,alc
0,0,0,18,0,0,0,0,3,2,0,2,2,0,1,0,0,0,1,1,0,0,4,4,3,6,5,6,6,4,1
1,0,0,17,0,0,1,0,4,2,1,1,2,0,0,1,0,0,0,1,1,0,5,3,3,4,5,5,6,1,1
2,0,0,15,0,1,1,0,4,3,0,1,2,3,1,0,1,0,1,1,1,0,4,2,3,10,7,8,10,1,2
3,0,0,15,0,0,1,1,2,0,0,1,3,0,0,1,1,1,1,1,1,1,3,2,5,2,15,14,15,3,1
4,0,0,16,0,0,1,4,4,0,1,1,2,0,0,1,1,0,1,1,0,0,4,2,5,4,6,10,10,3,2


In [78]:
# Separate the target column (G3) from the remaining predictor columns
y = df['G3']
X = df.drop(columns=['G3'])
y.head()

0     6
1     6
2    10
3    15
4    10
Name: G3, dtype: int64

In [79]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [80]:
# Fit a linear regression on the training split; fit() returns the estimator,
# which is then displayed as the cell's output
lnr = LinearRegression().fit(X_train, y_train)
lnr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
# Show the fitted intercept (b) followed by the per-feature coefficients (a)
for label, value in (('\nb=', lnr.intercept_), ('\na=', lnr.coef_)):
    print(label, value)


b= -1.3422111311583311

a= [ 0.35887724  0.01331654 -0.03199873 -0.11072316  0.17638956 -0.10011708
  0.04717198  0.00924762  0.09604736 -0.07417629  0.22618526 -0.13872062
 -0.13500358  0.29310014  0.18651396 -0.12608928 -0.32537586 -0.18511271
 -0.01559576 -0.24802096 -0.11642806  0.06418092  0.08441478  0.01022808
  0.033178    0.16335347  0.95886555 -0.00678052 -0.03694797]


In [82]:
# Predict the target for the held-out test features with the fitted model
y_pred = lnr.predict(X_test)

In [83]:
# Sanity check: print the container type of the predictions and of the test targets
for array_like in (y_pred, y_test.to_numpy()):
    print(type(array_like))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [84]:
# Round predictions to the nearest whole number and store them as ints
y_pred = y_pred.round().astype(int)

In [85]:

# Peek at the first 30 actual target values in the test split (positional slice)
y_test.iloc[:30]

582     8
310     0
14     16
642     8
76     10
165    12
498    11
468    16
651    18
401    12
222    17
306    18
367     0
548    10
631     8
75     10
501    12
649    13
181    12
597    10
460     9
285    11
312    11
412     8
400    13
303    18
162     0
414    13
402    11
482    14
Name: G3, dtype: int64

In [86]:
# Put actual and predicted values side by side for inspection.
# NOTE(review): this rebinds `df`, replacing the student dataset loaded earlier;
# re-running the cells that read columns such as 'G3' from `df` after this one
# will fail until the data is reloaded. A distinct name would avoid that, but
# the accuracy cell further down currently reads this frame through `df`.
actual_values = y_test.to_numpy().flatten()
predicted_values = y_pred.flatten()
df = pd.DataFrame({'Actual': actual_values, 'Predicted': predicted_values})
df

Unnamed: 0,Actual,Predicted
0,8,8
1,0,8
2,16,16
3,8,7
4,10,11
5,12,12
6,11,9
7,16,16
8,18,19
9,12,11


In [87]:
# How good is the model? Start with the mean absolute error on the test split
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 0.9703703703703703


In [88]:
# Mean squared error on the test split
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 2.4518518518518517


In [89]:
# Root of the mean squared error, which is in the same units as the target
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 1.5658390248846947


In [90]:
# Fraction of test rows whose rounded prediction equals the actual value exactly.
# The arrays are compared element-wise instead of looping over rows with
# iterrows(), and the inputs are read from y_test / y_pred directly so the
# result does not depend on what `df` happens to be bound to at this point.
# np.ravel flattens both sides to 1-D so the comparison cannot broadcast.
exact_matches = np.ravel(y_test) == np.ravel(y_pred)
correct_predictions = int(exact_matches.sum())
total_counts = int(exact_matches.size)

print(correct_predictions/total_counts)

0.3925925925925926
