In [None]:
%autosave 0
from IPython.core.display import HTML, display
display(HTML('<style>.container { width:100% !important; } </style>'))

# Logistic Regression with SciKit-Learn

In [None]:
import csv

import numpy  as np
import pandas as pd

The data we want to investigate is stored in the file `'exam.csv'`.  The first column of this file is an integer from the set $\{0,1\}$.  The number is $0$ if the corresponding student has failed the exam and $1$ otherwise.  The second column is a floating point number that gives the number of hours that the student has studied.

In [None]:
# Read 'exam.csv' into two parallel lists of floats:
# Pass holds the first column, Hours holds the second column.
with open('exam.csv', newline='') as file:  # newline='' is what the csv module expects
    reader = csv.reader(file, delimiter=',', skipinitialspace=True)
    next(reader, None)  # skip the header row; the default avoids StopIteration on an empty file
    Pass   = []
    Hours  = []
    for row in reader:
        if not row:  # csv.reader yields [] for blank lines; ignore them
            continue
        Pass .append(float(row[0]))
        Hours.append(float(row[1]))

To proceed, we will plot the data points.  To this end we transform the lists `Pass` and `Hours` into numpy arrays.

In [None]:
# Convert the two parsed columns into numpy arrays:
# x is built from Hours, y is built from Pass.
x, y = np.array(Hours), np.array(Pass)

In [None]:
import matplotlib.pyplot as plt
import seaborn           as sns

In [None]:
# Scatter plot of the raw data: x on the horizontal axis, the 0/1 values in y on the vertical axis.
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
plt.title('Pass/Fail vs. Hours of Study')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('Hours of Study')
plt.ylabel('Pass = 1, Fail = 0')
# np.arange excludes its stop value, so the stop must lie slightly above 1.0
# for the tick at 1.0 ("Pass") to be drawn.
plt.yticks(np.arange(0.0, 1.01, step=0.1))
plt.scatter(x, y, color='b');  # trailing ';' suppresses the PathCollection repr

The number of students is stored in the variable `n`.

In [None]:
# Number of entries in the label vector y, i.e. the number of students.
n = y.shape[0]
n

We have to turn the vector `x` into the feature matrix `X`.

In [None]:
# View the 1-D vector x as a matrix with n rows and a single column.
X = x.reshape(n, 1)
X

We prepend the number $1.0$ to every row of `X`.

In [None]:
# Feature matrix with a leading column of ones followed by the values of x.
# It is built from x rather than from the previous X so that the cell is idempotent:
# re-running it does not keep adding further columns of ones.
X = np.column_stack((np.ones(n), x))
X

Currently, the entries in the vector `y` are either $0$ or $1$.  These values need to be transformed to $-1$ and $+1$. 

In [None]:
# Map the labels from {0, 1} to {-1, +1}.
# The result is computed from Pass rather than from y itself so that the cell is
# idempotent: re-running it cannot turn -1 into -3.
y = 2 * np.array(Pass) - 1
y

As we have no real clue about the weights, we set them to $0$ initially.

In [None]:
import gradient_ascent

In [None]:
# NOTE(review): `ll` and `gradLL` were referenced here but not defined in any visible
# cell. The definitions below assume the usual log-likelihood of logistic regression
# for labels in {-1, +1} — confirm against the course material.
def ll(X, y, w):
    """Log-likelihood of the weight vector w.

    Computes sum_i log S(y[i] * (X[i] @ w)), where S(t) = 1 / (1 + exp(-t)).

    X is the feature matrix (one row per sample), y the vector of labels
    (assumed to be -1 or +1), w the weight vector.  Returns a scalar.
    """
    margins = y * (X @ w)
    # log S(t) = -log(1 + exp(-t)); np.logaddexp evaluates this without overflow.
    return -np.sum(np.logaddexp(0.0, -margins))


def gradLL(X, y, w):
    """Gradient of ll(X, y, w) with respect to w.

    The j-th component is sum_i y[i] * X[i, j] * S(-y[i] * (X[i] @ w)).
    Returns an array with one entry per column of X.
    """
    margins = y * (X @ w)
    # S(-t) = 1 / (1 + exp(t)) = exp(-log(1 + exp(t))), written in overflow-safe form.
    return X.T @ (y * np.exp(-np.logaddexp(0.0, margins)))


# Maximize the log-likelihood, starting from all-zero weights.
start   = np.zeros((2,))
eps     = 10 ** -8  # passed through to findMaximum; presumably its accuracy / stopping tolerance
f       = lambda w: ll(X, y, w)
gradF   = lambda w: gradLL(X, y, w)
# Only the first of the three returned values (the weight vector) is used.
w, _, _ = gradient_ascent.findMaximum(f, gradF, start, eps)
beta    = w[0]  # weight of the constant column of ones
gamma   = w[1]  # weight of the hours column
print(f'model: P(pass|hours) = S({beta} + {gamma} * hours)')

Let us plot this function together with the data.

In [None]:
# Plot the fitted probability curve S(beta + gamma * hours) together with the data
# and save the figure as 'exam-probability.pdf'.
plt.figure(figsize=(15, 9))
sns.set(style='darkgrid')
plt.title('Pass/Fail vs. Hours of Study')
H = np.arange(0.0, 6.0, 0.05)
# The sigmoid S(t) = 1 / (1 + exp(-t)) is written inline because no `sigmoid`
# helper is defined in any visible cell.
P = 1.0 / (1.0 + np.exp(-(beta + gamma * H)))
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments.
sns.lineplot(x=H, y=P, color='r')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('Hours of Study')
plt.ylabel('Probability of Passing the Exam')
plt.yticks(np.arange(-0.0, 1.01, step=0.1))
# y holds -1/+1 at this point; (y + 1) / 2 maps it back to 0/1 for plotting.
plt.scatter(x, (y + 1) / 2, color='b')
plt.savefig('exam-probability.pdf');  # trailing ';' keeps the cell output clean