In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os


In [3]:
csv_path = 'data/Student_Performance.csv'
data = pd.read_csv(csv_path)


In [4]:
data.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [6]:
data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [7]:
data["Extracurricular Activities"].value_counts()

Extracurricular Activities
No     5052
Yes    4948
Name: count, dtype: int64

In [8]:
data['Extracurricular Activities'] = data['Extracurricular Activities'].map({'Yes': 1, 'No': 0})

In [9]:
data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [10]:
data["Extracurricular Activities"].value_counts()

Extracurricular Activities
0    5052
1    4948
Name: count, dtype: int64

In [11]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [14]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 9254 to 7270
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     8000 non-null   int64  
 1   Previous Scores                   8000 non-null   int64  
 2   Extracurricular Activities        8000 non-null   int64  
 3   Sleep Hours                       8000 non-null   int64  
 4   Sample Question Papers Practiced  8000 non-null   int64  
 5   Performance Index                 8000 non-null   float64
dtypes: float64(1), int64(5)
memory usage: 437.5 KB


In [15]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 6252 to 6929
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     2000 non-null   int64  
 1   Previous Scores                   2000 non-null   int64  
 2   Extracurricular Activities        2000 non-null   int64  
 3   Sleep Hours                       2000 non-null   int64  
 4   Sample Question Papers Practiced  2000 non-null   int64  
 5   Performance Index                 2000 non-null   float64
dtypes: float64(1), int64(5)
memory usage: 109.4 KB


In [18]:

# sepcify the features
indpdnt_vars = ['Hours Studied','Previous Scores','Extracurricular Activities','Sleep Hours','Sample Question Papers Practiced']

# specify the feature and label of train set
x_train = train_set[indpdnt_vars].to_numpy()
y_train = np.array(train_set['Performance Index'])

# specify the feature and label of test set
x_test = test_set[indpdnt_vars].to_numpy()
y_test = np.array(test_set['Performance Index'])

print("train metrix shape : ",x_train.shape)
print("outputs array shape",y_train.shape)

print("test metrix shape : ",x_test.shape)
print("outputs array shape",y_test.shape)

train metrix shape :  (8000, 5)
outputs array shape (8000,)
test metrix shape :  (2000, 5)
outputs array shape (2000,)


In [19]:

# scale function
def scale_data(x_train):
    x = x_train.copy()
    # initialize the standard scaler
    scaler = StandardScaler()

    # transform the data using the scaler
    x_scaled = scaler.fit_transform(x)
    return x_scaled

# linear model train function
def train_linear_model(x, y):
    # make linear model
    model = LinearRegression()

    # fit model to the data
    model.fit(x, y)

    return model

# polynomial model train function
def train_polynomial_model(x_train, y_train, deg):
    # make a polynomial with degree 3
    model_polynomial_features = PolynomialFeatures(degree=deg)

    # make a polynomial model with a pipeline
    model = make_pipeline(model_polynomial_features, LinearRegression())

    # fit data to the model
    model.fit(x_train, y_train)

    return model


## Training Linear model

In [20]:
model1 = train_linear_model(x_train, y_train)

print('Linear model score',r2_score(y_train, model1.predict(x_train)))
print('Linear model score on test set',r2_score(y_test, model1.predict(x_test)))

Linear model score 0.9886898790682355
Linear model score on test set 0.9889832909573145


## Training polynomial model

In [21]:
model2 = train_polynomial_model(x_train, y_train, 6)

print('polynomial model score',r2_score(y_train, model2.predict(x_train)))
print('polynomial model score on test set',r2_score(y_test, model2.predict(x_test)))


polynomial model score 0.98914115641452
polynomial model score on test set 0.9885505410648592


### Scaling the data

In [23]:
x_train_scaled = scale_data(x_train)
x_test_scaled = scale_data(x_test)

## Training Linear model on scaled data

In [24]:
model1 = train_linear_model(x_train_scaled, y_train)
print('Linear model score',r2_score(y_train, model1.predict(x_train_scaled)))
print('Linear model score on test set',r2_score(y_test, model1.predict(x_test_scaled)))


Linear model score 0.9886898790682355
Linear model score on test set 0.9884832861331777


## Training Polynomial model on scaled data

In [25]:
model2 = train_polynomial_model(x_train_scaled, y_train, 2)
print('polynomial model score',r2_score(y_train, model2.predict(x_train_scaled)))
print('polynomial model score',r2_score(y_test, model2.predict(x_test_scaled)))

polynomial model score 0.9887046501595589
polynomial model score -21977306.754903104
