In [None]:
# `HTML` and `display` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
# Inject a CSS rule so that the notebook container uses the full browser width.
display(HTML('<style>.container { width:100% !important; } </style>'))

# Polynomial Logistic Regression

In [None]:
import numpy  as np
import pandas as pd

The data we want to investigate is stored in the file `'fake-data.csv'`.   It is data that I have found somewhere.  I am not sure whether this data is real or fake.  Therefore, I won't discuss the attributes of the data.  The point of the data is that it is a classification problem that cannot be solved with 
ordinary logistic regression.  We will introduce <em style="color:blue;">polynomial logistic regression</em> to solve this problem.

In [None]:
# Read the data set from the CSV file into a data frame and show its first rows.
DF = pd.read_csv('fake-data.csv')
DF.head()

In [None]:
# Show summary statistics of the data frame.
DF.describe()

We extract the features from the data frame and convert them into a `NumPy` <em style="color:blue;">feature matrix</em>.

In [None]:
# Feature matrix: one row per example, column 0 is 'x' and column 1 is 'y'.
X = np.array(DF[['x','y']])

We extract the target column and convert it into a `NumPy` array.

In [None]:
# Target vector: the 'class' column, aligned row by row with X.
Y = np.array(DF['class'])

In order to plot the instances according to their class we divide the feature matrix $X$ into two parts. $\texttt{X_pass}$ contains those examples that have class $1$, while $\texttt{X_fail}$ contains those examples that have class $0$.

In [None]:
# Split the feature matrix X by class label.  Indexing X (instead of the whole
# data frame) guarantees that column 0 is 'x' and column 1 is 'y', independent
# of the column order in the CSV file.
X_pass = X[Y == 1.0]
X_fail = X[Y == 0.0]

Let us plot the data.

In [None]:
import matplotlib.pyplot as plt
import seaborn           as sns

In [None]:
# Canvas and seaborn theme.
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
# Coordinate axes as black lines through the origin.
plt.axvline(0.0, color='k')
plt.axhline(0.0, color='k')
# Title, axis labels and one tick every 0.1.
plt.title('A Classification Problem')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.1, 0.1))
plt.yticks(np.arange(-0.8, 1.2, 0.1))
# Examples of class 1 are blue, examples of class 0 are red.
plt.scatter(X_pass[:, 0], X_pass[:, 1], color='b')
plt.scatter(X_fail[:, 0], X_fail[:, 1], color='r')

We want to split the data into a <em style="color:blue;">training set</em> and a <em style="color:blue;">test set</em>.
The <em style="color:blue;">training set</em> will be used to compute the parameters of our model, while the
<em style="color:blue;">test set</em> is only used to check the *accuracy*.  SciKit-Learn has a predefined function
`train_test_split` in the module `sklearn.model_selection` that can be used to randomly split data into a training set and a test set.

In [None]:
from sklearn.model_selection import train_test_split

We will split the data at a ratio of $4:1$, i.e. $80\%$ of the data will be used for training, while the remaining $20\%$ is used to test the accuracy.

In [None]:
# test_size=0.2 keeps 20% of the examples for testing; the fixed random_state
# makes the random split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In order to build a <em style="color:blue;">logistic regression</em> classifier, we import the module `linear_model` from SciKit-Learn.

In [None]:
import sklearn.linear_model as lm

The function $\texttt{logistic_regression}(\texttt{X_train}, \texttt{Y_train}, \texttt{X_test}, \texttt{Y_test})$ takes a feature matrix $\texttt{X_train}$ and a corresponding vector $\texttt{Y_train}$ and computes a logistic regression model $M$ that best fits these data.  Then, the accuracy of the model is computed using the test data $\texttt{X_test}$ and $\texttt{Y_test}$.

In [None]:
def logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a logistic regression classifier and measure how well it does.

    Parameters:
        X_train, Y_train: feature matrix and labels used to fit the model.
        X_test,  Y_test:  feature matrix and labels used to check the model.

    Returns:
        A triple (M, train_score, accuracy): the fitted model, its score on
        the training data, and the fraction of test examples it predicts
        correctly.
    """
    # C=10000 is the value of scikit-learn's inverse regularization strength.
    classifier  = lm.LogisticRegression(solver='newton-cg', tol=1e-6, C=10000)
    M           = classifier.fit(X_train, Y_train)
    predictions = M.predict(X_test)
    return M, M.score(X_train, Y_train), np.mean(predictions == Y_test)

We use this function to build a model for our data.

In [None]:
# Fit on the raw features x and y, then display training score and test accuracy.
M, score, accuracy = logistic_regression(X_train, Y_train, X_test, Y_test)
score, accuracy

Given that there are only two classes, the accuracy of our first model is quite poor.  
Let us extract the coefficients so we can plot the <em style="color:blue;">decision boundary</em>.

In [None]:
# ϑ0 is the intercept; ϑ1 and ϑ2 are the weights of the features x and y.
ϑ0     = M.intercept_
ϑ1, ϑ2 = M.coef_[0]

In [None]:
# Canvas and seaborn theme.
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
# Coordinate axes as black lines through the origin.
plt.axvline(0.0, color='k')
plt.axhline(0.0, color='k')
# Title, axis labels and one tick every 0.1.
plt.title('A Classification Problem')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.1, 0.1))
plt.yticks(np.arange(-0.8, 1.2, 0.1))
# Examples of class 1 are blue, examples of class 0 are red.
plt.scatter(X_pass[:, 0], X_pass[:, 1], color='b')
plt.scatter(X_fail[:, 0], X_fail[:, 1], color='r')
# Linear decision boundary: solving ϑ0 + ϑ1*x + ϑ2*y = 0 for y gives
# y = -(ϑ0 + ϑ1*x)/ϑ2, which is evaluated for the x values in H.
H = np.arange(0.45, 0.9, 0.05)
P = -(ϑ0 + ϑ1 * H)/ϑ2
plt.plot(H, P, color='green')

Clearly, pure <em style="color:blue;">logistic regression</em> is not working for this example.  The reason is that a linear decision boundary is not 
able to separate the positive examples from the negative examples.  Let us add <em style="color:blue;">polynomial features</em>.  This enables us to create 
more complex decision boundaries.

The function $\texttt{extend}(X)$ takes a feature matrix $X$ that is supposed to contain two features $x$ and $y$.  It creates the new features $x^2$, $y^2$ and $x\cdot y$ and returns a new feature matrix that also contains these additional features.

In [None]:
def extend(X):
    """Append the quadratic features to a two-column feature matrix.

    X is a 2-dimensional array whose column 0 holds x and whose column 1
    holds y.  The result has one row per row of X and the five columns
    x, y, x*x, y*y, x*y (in this order).
    """
    x = X[:, 0]
    y = X[:, 1]
    # column_stack turns every 1-dimensional array into one column of the result.
    return np.column_stack((x, y, x * x, y * y, x * y))

In [None]:
# Add the features x*x, y*y and x*y to both the training and the test features.
X_train_quadratic = extend(X_train)
X_test_quadratic  = extend(X_test)

In [None]:
# Refit on the extended features; this overwrites M, score and accuracy.
M, score, accuracy = logistic_regression(X_train_quadratic, Y_train, X_test_quadratic, Y_test)
score, accuracy

This seems to work better.  Let us compute the decision boundary and plot it.

In [None]:
# The weights follow the column order produced by extend: x, y, x*x, y*y, x*y.
ϑ0                 = M.intercept_
ϑ1, ϑ2, ϑ3, ϑ4, ϑ5 = M.coef_[0]

The decision boundary is now given by the following equation:
$$ \vartheta_0 + \vartheta_1 \cdot x + \vartheta_2 \cdot y + \vartheta_3 \cdot x^2 + \vartheta_4 \cdot y^2 + \vartheta_5 \cdot x \cdot y = 0$$
This is the equation of a conic section, e.g. an ellipse.  Let us plot the <em style="color:blue;">decision boundary</em> with the data.

In [None]:
a = np.arange(-1.0, 1.0, 0.005)   # x values of the grid
b = np.arange(-1.0, 1.0, 0.005)   # y values of the grid
A, B = np.meshgrid(a, b)
# Left-hand side of the decision boundary equation, evaluated at every grid point.
Z = (
    ϑ0
    + ϑ1 * A + ϑ2 * B
    + ϑ3 * A * A + ϑ4 * B * B + ϑ5 * A * B
)

In [None]:
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
plt.title('A Classification Problem')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.1, step=0.1))
plt.yticks(np.arange(-0.8, 1.2, step=0.1))
# Examples of class 1 are blue, examples of class 0 are red.
plt.scatter(X_pass[:,0], X_pass[:,1], color='b')
plt.scatter(X_fail[:,0], X_fail[:,1], color='r')
# The decision boundary is the set of points where Z == 0.  The level has to be
# passed as the list [0]: a plain integer would be interpreted as the *number*
# of automatically chosen contour levels.
CS = plt.contour(A, B, Z, levels=[0], colors='green')

Let us try to add <em style="color:blue;">quartic features</em> next.  These are features like $x^4$, $x^2\cdot y^2$, etc.
Luckily, SciKit-Learn has a function that can automate this process.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
quartic = PolynomialFeatures(4, include_bias=False)
X_train_quartic = quartic.fit_transform(X_train)
# The transformer is fitted on the training data only; the test data is merely
# transformed with the already fitted transformer.
X_test_quartic  = quartic.transform(X_test)
# get_feature_names_out replaces get_feature_names, which has been removed
# from scikit-learn; tolist() turns the returned array into a plain list.
print(quartic.get_feature_names_out(['x', 'y']).tolist())

Let us fit the quartic model.

In [None]:
# Refit on the quartic features; this overwrites M, score and accuracy.
M, score, accuracy = logistic_regression(X_train_quartic, Y_train, X_test_quartic, Y_test)
score, accuracy

In [None]:
# The quartic model has 14 weights.  The grid computation that follows pairs
# them with x, y, x², x·y, y², x³, ... in this order.
# NOTE(review): presumably this is the order of the feature names printed
# above; verify against that output.
ϑ0 = M.intercept_
ϑ1, ϑ2, ϑ3, ϑ4, ϑ5, ϑ6, ϑ7, ϑ8, ϑ9, ϑ10, ϑ11, ϑ12, ϑ13, ϑ14 = M.coef_[0]

In [None]:
# Grid over [-1, 1) x [-1, 1) with step 0.005.  A holds the x coordinate and
# B the y coordinate of every grid point.
# Z is the degree-4 polynomial with the fitted coefficients evaluated at every
# grid point: one term per coefficient, grouped by total degree 1, 2, 3, 4.
a    = np.arange(-1.0, 1.0, 0.005)
b    = np.arange(-1.0, 1.0, 0.005)
A, B = np.meshgrid(a,b)
Z    = ϑ0 + ϑ1 * A + ϑ2 * B + ϑ3 * A**2 + ϑ4 * A * B + ϑ5 * B**2 + ϑ6 * A**3 + ϑ7 * A**2 * B + ϑ8 * A * B**2 + ϑ9 * B**3 + \
       ϑ10 * A**4 + ϑ11 * A**3 * B + ϑ12 * A**2 * B**2 + ϑ13 * A * B**3 + ϑ14 * B**4 

In [None]:
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
plt.title('A Classification Problem')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.1, step=0.1))
plt.yticks(np.arange(-0.8, 1.2, step=0.1))
# Examples of class 1 are blue, examples of class 0 are red.
plt.scatter(X_pass[:,0], X_pass[:,1], color='b')
plt.scatter(X_fail[:,0], X_fail[:,1], color='r')
# The decision boundary is the set of points where Z == 0.  The level has to be
# passed as the list [0]: a plain integer would be interpreted as the *number*
# of automatically chosen contour levels.
CS = plt.contour(A, B, Z, levels=[0], colors='green')

Let's get bold and try features up to the sixth power.

In [None]:
sextic         = PolynomialFeatures(6, include_bias=False)
X_train_sextic = sextic.fit_transform(X_train)
# The transformer is fitted on the training data only; the test data is merely
# transformed with the already fitted transformer.
X_test_sextic  = sextic.transform(X_test)
# get_feature_names_out replaces get_feature_names, which has been removed
# from scikit-learn; tolist() turns the returned array into a plain list.
print(sextic.get_feature_names_out(['x', 'y']).tolist())
len(sextic.get_feature_names_out(['x', 'y']))

In [None]:
# Refit on the sextic features; this overwrites M, score and accuracy.
M, score, accuracy = logistic_regression(X_train_sextic, Y_train, X_test_sextic, Y_test)
score, accuracy

In [None]:
# The sextic model has 27 weights, one per polynomial feature of degree 1 to 6.
ϑ0 = M.intercept_
ϑ1, ϑ2, ϑ3, ϑ4, ϑ5, ϑ6, ϑ7, ϑ8, ϑ9, ϑ10, ϑ11, ϑ12, ϑ13, ϑ14, ϑ15, ϑ16, ϑ17, ϑ18, ϑ19, ϑ20, ϑ21, ϑ22, ϑ23, ϑ24, ϑ25, ϑ26, ϑ27 = M.coef_[0]

In [None]:
# Grid with step 0.005.  Unlike the grid used for the quartic model, the
# y range now extends up to 1.3.
# Z is the degree-6 polynomial with the fitted coefficients evaluated at every
# grid point; each source line of the expression holds the terms of one total
# degree (0, 1, ..., 6).
a    = np.arange(-1.0, 1.0, 0.005)
b    = np.arange(-1.0, 1.3, 0.005)
A, B = np.meshgrid(a,b)
Z    = ϑ0 + \
       ϑ1  * A    + ϑ2  * B + \
       ϑ3  * A**2 + ϑ4  * A    * B + ϑ5  * B**2 + \
       ϑ6  * A**3 + ϑ7  * A**2 * B + ϑ8  * A    * B**2 + ϑ9  * B**3 + \
       ϑ10 * A**4 + ϑ11 * A**3 * B + ϑ12 * A**2 * B**2 + ϑ13 * A * B**3    + ϑ14 * B**4 + \
       ϑ15 * A**5 + ϑ16 * A**4 * B + ϑ17 * A**3 * B**2 + ϑ18 * A**2 * B**3 + ϑ19 * A * B**4    + ϑ20 * B**5 + \
       ϑ21 * A**6 + ϑ22 * A**5 * B + ϑ23 * A**4 * B**2 + ϑ24 * A**3 * B**3 + ϑ25 * A**2 * B**4 + ϑ26 * A * B**5 + ϑ27 * B**6

In [None]:
# Split the training features by class so that the training examples of each
# class can be plotted separately.
X_pass_train = X_train[Y_train == 1.0]
X_fail_train = X_train[Y_train == 0.0]

In [None]:
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
plt.title('A Classification Problem')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.11, step=0.1))
plt.yticks(np.arange(-0.8, 1.21, step=0.1))
# Only the training examples are shown: class 1 in blue, class 0 in red.
plt.scatter(X_pass_train[:,0], X_pass_train[:,1], color='b')
plt.scatter(X_fail_train[:,0], X_fail_train[:,1], color='r')
# The decision boundary is the set of points where Z == 0.  The level has to be
# passed as the list [0]: a plain integer would be interpreted as the *number*
# of automatically chosen contour levels.
CS = plt.contour(A, B, Z, levels=[0], colors='green')

In [None]:
octic         = PolynomialFeatures(8, include_bias=False)
X_train_octic = octic.fit_transform(X_train)
# The transformer is fitted on the training data only; the test data is merely
# transformed with the already fitted transformer.
X_test_octic  = octic.transform(X_test)

In [None]:
# Refit on the octic features; this overwrites M, score and accuracy.
M, score, accuracy = logistic_regression(X_train_octic, Y_train, X_test_octic, Y_test)
score, accuracy

In [None]:
# Collect the intercept and all weights in one list so that they can be
# addressed by position: Θ[0] is the intercept, Θ[1:] are the weights.
Θ = [M.intercept_] + list(M.coef_[0])

In [None]:
# Grid with step 0.005 over x in [-1, 1) and y in [-1, 1.3).
# Z is the degree-8 polynomial with the coefficients from Θ evaluated at every
# grid point.  Each source line holds the terms of one total degree; within a
# line the power of A decreases while the power of B increases.
a    = np.arange(-1.0, 1.0, 0.005)
b    = np.arange(-1.0, 1.3, 0.005)
A, B = np.meshgrid(a,b)
Z    = Θ[0] + \
       Θ[1]  * A    + Θ[2]  * B + \
       Θ[3]  * A**2 + Θ[4]  * A    * B + Θ[5]  *        B**2 + \
       Θ[6]  * A**3 + Θ[7]  * A**2 * B + Θ[8]  * A    * B**2 + Θ[9]  *        B**3 + \
       Θ[10] * A**4 + Θ[11] * A**3 * B + Θ[12] * A**2 * B**2 + Θ[13] * A    * B**3 + Θ[14] *        B**4 + \
       Θ[15] * A**5 + Θ[16] * A**4 * B + Θ[17] * A**3 * B**2 + Θ[18] * A**2 * B**3 + Θ[19] * A    * B**4 + Θ[20] *        B**5 + \
       Θ[21] * A**6 + Θ[22] * A**5 * B + Θ[23] * A**4 * B**2 + Θ[24] * A**3 * B**3 + Θ[25] * A**2 * B**4 + Θ[26] * A    * B**5 + Θ[27] *        B**6 + \
       Θ[28] * A**7 + Θ[29] * A**6 * B + Θ[30] * A**5 * B**2 + Θ[31] * A**4 * B**3 + Θ[32] * A**3 * B**4 + Θ[33] * A**2 * B**5 + Θ[34] * A    * B**6 + Θ[35] *     B**7 + \
       Θ[36] * A**8 + Θ[37] * A**7 * B + Θ[38] * A**6 * B**2 + Θ[39] * A**5 * B**3 + Θ[40] * A**4 * B**4 + Θ[41] * A**3 * B**5 + Θ[42] * A**2 * B**6 + Θ[43] * A * B**7 + \
       Θ[44] * B**8

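$\texttt{polynomial}(n)$ returns a string containing a Python expression for a polynomial in the variables `A` and `B`.  This polynomial consists of the constant term $\Theta[0]$ and all terms of the form $\Theta[k] \cdot A^i \cdot B^j$ where $1 \leq i+j \leq n$.  The function $\texttt{polynomial_grid}(n, M)$ evaluates this expression on a grid, using the intercept and the coefficients of the model $M$ as $\Theta$.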

In [None]:
Z = polynomial_grid(8, M)

In [None]:
plt.figure(figsize=(15, 10))
sns.set(style='darkgrid')
plt.title('A Classification Problem')
plt.axvline(x=0.0, c='k')
plt.axhline(y=0.0, c='k')
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.xticks(np.arange(-0.9, 1.11, step=0.1))
plt.yticks(np.arange(-0.8, 1.21, step=0.1))
plt.scatter(X_pass_train[:,0], X_pass_train[:,1], color='b') 
plt.scatter(X_fail_train[:,0], X_fail_train[:,1], color='r') 
CS = plt.contour(A, B, Z, 0, colors='green')

In [None]:
def polynomial(n):
    sum = 'Θ[0]' 
    cnt = 0
    for k in range(1, n+1):
        for i in range(0, k+1):
            cnt += 1
            sum += f' + Θ[{cnt}] * A**{k-i} * B**{i}'
    print('number of features:', cnt)
    return sum

In [None]:
def polynomial_grid(n, M):
    Θ    = [M.intercept_] + list(M.coef_[0])
    a    = np.arange(-1.0, 1.0, 0.005)
    b    = np.arange(-1.0, 1.3, 0.005)
    A, B = np.meshgrid(a,b)
    return eval(polynomial(n))

In [None]:
def plot_nth_degree_boundary(n):
    """Fit a logistic regression model on polynomial features of degree n and
    plot its decision boundary together with the training examples.

    The training accuracy and the test accuracy are printed.  The function
    reads the global variables X_train, Y_train, X_test and Y_test.
    """
    poly         = PolynomialFeatures(n, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    # The transformer is fitted on the training data only; the test data is
    # merely transformed with the already fitted transformer.
    X_test_poly  = poly.transform(X_test)
    M, score, accuracy = logistic_regression(X_train_poly, Y_train, X_test_poly, Y_test)
    print('The accuracy on the training set is:', score)
    print('The accuracy on the test     set is:', accuracy)
    Z = polynomial_grid(n, M)
    # polynomial_grid only returns the values Z.  The coordinates are rebuilt
    # here instead of relying on global variables A and B left over from an
    # earlier cell; the ranges have to match those used inside polynomial_grid.
    a    = np.arange(-1.0, 1.0, 0.005)
    b    = np.arange(-1.0, 1.3, 0.005)
    A, B = np.meshgrid(a, b)
    # Split the training examples by class for plotting.
    pass_train = X_train[Y_train == 1.0]
    fail_train = X_train[Y_train == 0.0]
    plt.figure(figsize=(15, 10))
    sns.set(style='darkgrid')
    plt.title('A Classification Problem')
    plt.axvline(x=0.0, c='k')
    plt.axhline(y=0.0, c='k')
    plt.xlabel('x axis')
    plt.ylabel('y axis')
    plt.xticks(np.arange(-0.9, 1.11, step=0.1))
    plt.yticks(np.arange(-0.8, 1.21, step=0.1))
    plt.scatter(pass_train[:,0], pass_train[:,1], color='b')
    plt.scatter(fail_train[:,0], fail_train[:,1], color='r')
    # The decision boundary is the set of points where Z == 0.  The level has
    # to be passed as the list [0]: a plain integer would be interpreted as the
    # *number* of automatically chosen contour levels.
    plt.contour(A, B, Z, levels=[0], colors='green')

In [None]:
# Degree 8, the same degree as the octic model.
plot_nth_degree_boundary(8)

In [None]:
# One degree higher than the octic model.
plot_nth_degree_boundary(9)

In [None]:
%%time
# The cell magic above reports how long fitting and plotting take for degree 42.
plot_nth_degree_boundary(42)