In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('student_pass.csv')

## Data Preparation

In [3]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
Hours_Studied,6.39,0.25,2.23,6.77,8.92,0.87,0.3,5.05,0.27,6.5,...,1.63,9.9,6.4,5.57,6.85,8.43,7.76,2.29,3.15,2.11
Pass,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [4]:
df.columns = df.columns.str.lower()

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
hours_studied,6.39,0.25,2.23,6.77,8.92
pass,1.0,0.0,1.0,1.0,1.0


In [6]:
df.dtypes

hours_studied    float64
pass               int64
dtype: object

In [7]:
df.isnull().sum()

hours_studied    0
pass             0
dtype: int64

In [8]:
df_sz = df.shape[0]
df_sz

50

Since the dataset size is rather small, we will not split the data.

In [9]:
X = pd.DataFrame(df['hours_studied'])
y = df['pass']

## Linear Regression using ScikitLearn

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
model = LinearRegression().fit()

[1;31mInit signature:[0m
[0mLinearRegression[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mfit_intercept[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mcopy_X[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mpositive[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Ordinary least squares Linear Regression.

LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
to minimize the residual sum of squares between the observed targets in
the dataset, and the targets predicted by the linear approximation.

Parameters
----------
fit_intercept : bool, default=True
    Whether to calculate the intercept for this model. If set
    to False, no intercept will be used in calculations
    (i.e. data is expected to be centered).

copy_X : bool, default=True
    If True, X will be copie

---

## Logistic Regression using ScikitLearn

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
model = LogisticRegression(max_iter=1000, solver='liblinear')

In [12]:
model.fit(X, y)

---

## Model Testing

Take 10 random samples from the dataset for model testing.

In [13]:
np.random.seed(0)
random_idx = np.random.randint(df_sz, size=10) 

In [14]:
random_idx

array([44, 47,  0,  3,  3, 39,  9, 19, 21, 36])

In [15]:
random_sample = df.iloc[random_idx]

In [16]:
random_sample.T

Unnamed: 0,44,47,0,3,3.1,39,9,19,21,36
hours_studied,6.85,2.29,6.39,6.77,6.77,1.71,6.5,6.04,7.3,9.37
pass,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
X_rand = pd.DataFrame(df['hours_studied'])
y_rand = df['pass']

In [18]:
# probability of passing
y_pred = model.predict_proba(X_rand)[:, 1]

In [19]:
pass_decision = y_pred > 0.5 #threshold
pass_decision = pass_decision.astype(int)

Compare the model prediction to the actual values.

In [20]:
(y_rand == pass_decision).mean()

0.76

Our model has 76% accuracy. But this is not a strict indicator of the model's robustness, as we have only little data to work with.