# Kernel Logistic Regression

In [11]:
from typing import Tuple

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns
import scipy

%matplotlib inline
# Seaborn defaults applied to every figure in this notebook: the "notebook"
# plotting context with fonts scaled by 1.1, and the "ticks" axes style.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")

## Classifying Cardio

In [12]:
# quick look at data correlation
#data = pd.read_csv('../resources/cardio_train.csv', sep=';', index_col='id')
#temp_df = pd.concat([data.age, data.weight, data.cardio], axis=1, join='inner')

#print(temp_df)

#patient_has_cardio = 1
#temp_df["label"] = temp_df.apply(lambda row: 1 if (row.cardio == patient_has_cardio) else -1, axis=1)

#sns.lmplot(x='weight', y='age', hue='cardio', data=df, fit_reg=False)
# sns.lmplot(x='age', y='gender', hue='cardio', data=data, fit_reg=False).set(title='Age / Gender')
# sns.lmplot(x='age', y='weight', hue='cardio', data=data, fit_reg=False).set(title='Age / Weight')
# sns.lmplot(x='age', y='height', hue='cardio', data=data, fit_reg=False).set(title='Age / Height')
# sns.lmplot(x='age', y='cholesterol', hue='cardio', data=data, fit_reg=False).set(title='Age / Cholesterol')
# sns.lmplot(x='age', y='gluc', hue='cardio', data=data, fit_reg=False).set(title='Age / Glucose')
# sns.lmplot(x='age', y='alco', hue='cardio', data=data, fit_reg=False).set(title='Age / Alcohol')
# sns.lmplot(x='age', y='active', hue='cardio', data=data, fit_reg=False).set(title='Age / Active')


TODO: add more plots and summary information about the data

### Quick Data Inquiry Result
> The data is noisy and, at first glance, not linearly separable, so a simple linear classifier is not sufficient.
>
> A linear regression does not make sense with the result being a binary "class".

### Kernel / Feature function theory

We proceed by mapping the data into a higher-dimensional space in which it becomes linearly separable. This is done with a feature function or, implicitly, with a kernel.

linear: $h(x) = w^T x$ or nonlinear: $h(x) = w^T \Phi(x)$

Feature Function $\Phi(x)$  based on Age = $A$ and Weight = $W$

$x = [A, W] ∈ R^2$

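$\Phi(x) = [1, A, W, A^2, W^2, AW] \in \mathbb{R}^6$ - optionally with factors of $\sqrt{2}$ on the linear and mixed terms, $[1, \sqrt{2}A, \sqrt{2}W, A^2, W^2, \sqrt{2}AW]$, so that $\Phi(x)^T\Phi(z)$ equals the polynomial kernel below exactly.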


 With
 $ K(x, z) = \Phi(x)^T\Phi(z) = (x^T*z+1)^d$ with degree $d = 2$ for a two-dimensional input $x ∈ R^2$.

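 $h(x) = w^T \Phi(x) = \sum_{j=1}^{m} \alpha_j K(x_j, x) = \sum_{j=1}^{m} \alpha_j (x_j^T x + 1)^d$, where the weight vector is expressed through the training points as $w = \sum_{j=1}^{m} \alpha_j \Phi(x_j)$.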

$\min_w J(w) = \frac{1}{m} \sum_{i=1}^{m} l(h(x_i), y_i) + \Omega(w)$

Kernel can be used not just on age and weight but on all other features.

### LOAD DATA

In [13]:
print('-'*30); print("IMPORTING DATA");print('-'*30)
# Keep only the first 3000 rows for testing purposes (memory issues).
# The CSV is ';'-separated and its 'id' column becomes the DataFrame index.
df = pd.read_csv('../resources/cardio_train.csv', sep=';', index_col='id')[:3000]
# Full dataset, without the row limit:
# df = pd.read_csv('../resources/cardio_train.csv', sep=';', index_col='id')

------------------------------
IMPORTING DATA
------------------------------


### Feature Scaling
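Standardize the continuous features (age, height, weight) to zero mean and unit standard deviation so that no single feature dominates the distances used by the kernel.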

In [14]:
import statistics
# TODO: remove outliers, define a valid range per feature, and explain the choices in the text

# The scaled data could be saved to an extra file so this cell does not need to run every time.
feature_scale_flag = True

if feature_scale_flag:
    # Alternative (disabled): min/max scaling onto [0, 1]: x_scaled = (x - min(x)) / (max(x) - min(x))
    # df['age_scaled'] = ((df['age'])-min(df['age']))/(max(df['age'])-min(df['age']))
    # df['height_scaled'] = ((df['height'])-min(df['height']))/(max(df['height'])-min(df['height']))
    # df['weight_scaled'] = ((df['weight'])-min(df['weight']))/(max(df['weight'])-min(df['weight']))

    # Standardization: x_standardized = (x - mean) / sigma.
    # The pandas reductions are vectorized, whereas the stdlib `statistics`
    # functions walk over every element in Python. Series.std() defaults to
    # ddof=1, i.e. the same sample standard deviation as statistics.stdev.
    # The columns are appended in the order age, height, weight.
    for column in ('age', 'height', 'weight'):
        centered = df[column] - df[column].mean()
        df[f'{column}_standardized'] = centered / df[column].std()

    # df['bmi'] = (df['weight'] / ((df['height'] / 100) ** 2)).round(decimals=2)
    # df['bmi_high'] = (df['bmi'] >= 30).astype(int)

    # Re-encode the target as +1 (cardio == 1) / -1 (any other value).
    df['cardio'] = np.where(df['cardio'] == 1, 1, -1)

    # Eliminate corrupted data (disabled).
    # TODO: the original attempt stored arrays instead of values because the
    # comprehension yielded the whole column (df['ap_lo']) rather than the
    # element x. A corrected version would be:
    # df['ap_lo_fixed'] = [x if 50 < x < 150 else -1 for x in df['ap_lo']]
    # df['ap_hi_fixed'] = [x if 100 < x < 190 else -1 for x in df['ap_hi']]
    # NOTE(review): enabling these appends further columns after the
    # standardized ones - check every place that selects columns by position.
    # df.to_csv('../resources/feature_scaled_data3.csv', index=True)

## Implement functions

### squared exponential kernel $k(x,z)$
$k(x,z) = \exp\left(-\frac{x^Tx - 2x^Tz + z^Tz}{2\sigma^2}\right) = \exp\left(-\frac{\mathrm{sqdist}(x,z)}{2\sigma^2}\right)$

### hypothesis function $h(x)$
$h_\alpha(x) = \alpha K = \sum_{j=1}^{m} \alpha_j k(x_j,x)$

### loss function $l(h(x),y)$

logistic loss:
$l_{logistic}(h_\alpha(x), y) = \log(1 + e^{-y \cdot h_\alpha(x)}) = \log(1 + \exp(-y \cdot h_\alpha(x)))$

### $l_2$ regularizer

$r = \lambda l_2 = \lambda\alpha^{\intercal}K\alpha$


### objective function J

Kernelized logistic regression

resulting in the regularized kernelized logistic regression objective:
$
J(\alpha) = \frac{1}{m}\sum_{i=1}^m  \log \big(1 + \exp\big(-y_i \cdot \sum_{j=1}^{m} \alpha_j k(x_j,x_i)\big) \big) + \lambda \alpha^{\intercal}K\alpha
$

In [15]:
def sqdist(X, Z):
    """Pairwise squared Euclidean distances between the rows of X and the rows of Z.

    Uses the expansion ||x - z||^2 = x.x - 2 x.z + z.z, so no explicit loop
    over pairs of rows is needed.

    :param X: 2-D array, one sample per row
    :param Z: 2-D array, one sample per row, same number of columns as X
    :return: matrix whose entry [i, j] is the squared distance between X[i] and Z[j]
    """
    # Row-wise squared norms; X's are turned into a column so they broadcast
    # against Z's row of squared norms.
    x_sq_norms = np.sum(np.square(X), axis=1)[:, np.newaxis]
    z_sq_norms = np.sum(np.square(Z), axis=1)
    cross_terms = np.dot(X, Z.T)
    return x_sq_norms + z_sq_norms - 2 * cross_terms

def sq_exp(X, Z, sigma):
    """Squared exponential kernel matrix between the rows of X and the rows of Z.

    :param X: 2-D array, one sample per row
    :param Z: 2-D array, one sample per row
    :param sigma: kernel width
    :return: matrix with entries exp(-sqdist(x, z) / (2 * sigma^2))
    """
    squared_distances = sqdist(X, Z)
    scale = 2 * sigma**2
    return np.exp(-squared_distances / scale)


def J(α, X, y, sigma, lam):
    """Regularized kernelized logistic regression objective.

    J(α) = 1/m * sum_i log(1 + exp(-y_i * (Kα)_i)) + lam * αᵀ K α,
    where K is the squared exponential kernel matrix of X with itself and m
    is the number of rows of X.

    :param α: coefficient vector with one entry per row of X
    :param X: training features, one sample per row
    :param y: array of labels, one per row of X (the notebook encodes them as -1 / +1)
    :param sigma: width of the squared exponential kernel
    :param lam: weight of the regularization term
    :return: scalar value of the objective
    """
    K = sq_exp(X, X, sigma)
    m = X.shape[0]

    # All m predictions in one matrix-vector product:
    # predictions[i] = sum_j α[j] * K[i, j].
    predictions = K @ α

    # log(1 + exp(-t)) evaluated as logaddexp(0, -t): same value, but the
    # exponential cannot overflow for strongly misclassified samples.
    losses = np.logaddexp(0, -y * predictions)

    # αᵀ K α, reusing the product K α computed above.
    regularization = lam * (α @ predictions)
    return losses.sum() / m + regularization

Implement the gradient of the regularized kernelized logistic regression objective.

In [16]:
def dJ(α, X, y, sigma, lam):
    """Gradient of the regularized kernelized logistic regression objective J.

    dJ/dα = 1/m * sum_i (-y_i * K[i]) / (1 + exp(y_i * (Kα)_i)) + 2 * lam * K α,
    where K is the squared exponential kernel matrix of X with itself and m
    is the number of rows of X.

    :param α: coefficient vector with one entry per row of X
    :param X: training features, one sample per row
    :param y: array of labels, one per row of X (the notebook encodes them as -1 / +1)
    :param sigma: width of the squared exponential kernel
    :param lam: weight of the regularization term
    :return: gradient vector with the same length as α
    """
    K = sq_exp(X, X, sigma)
    m = X.shape[0]

    # predictions[i] = sum_j α[j] * K[i, j], for all samples at once.
    predictions = K @ α

    # 1 / (1 + exp(t)) evaluated as exp(-log(1 + exp(t))) via logaddexp, so a
    # large positive margin t cannot overflow the exponential.
    sample_weights = np.exp(-np.logaddexp(0, y * predictions))

    # sum_i (-y_i * w_i) * K[i] written as a vector-matrix product.
    loss_gradient = (-y * sample_weights) @ K

    # Gradient of lam * αᵀ K α is 2 * lam * K α; K α is already available.
    regularization = 2 * lam * predictions
    return loss_gradient / m + regularization


## Train model

In [17]:
from scipy.optimize import minimize

def kernel_lr(X, y, sigma, lam):
    """Fit kernel logistic regression and return the learned hypothesis function.

    The coefficients start at zero and are obtained by minimizing J with
    scipy's conjugate gradient method ('CG'), using dJ as the gradient.

    :param X: training features, one sample per row
    :param y: training labels, one per row of X
    :param sigma: width of the squared exponential kernel
    :param lam: weight of the regularization term
    :return: function h(Z) returning one raw score per row of Z
    """
    initial_coefficients = np.zeros(X.shape[0],)
    result = minimize(J, initial_coefficients, args=(X, y, sigma, lam), jac=dJ, method='CG')
    coefficients = result.x

    def h(Z):
        # Weighted sum of the kernel values between the training rows and Z.
        return np.dot(coefficients, sq_exp(X, Z, sigma))

    return h

## Split data in Train / Validation / Test

We'll first split our data into a Train set (70%) and Test set (30%).  
The training set will be further processed using k-fold cross-validation (10 folds by default; the cell below currently uses `k=2`).

In [18]:
def train_test_split(data, train_proportion=0.7, shuffle=False, seed=None):
    """Split the rows of a 2-D array into a training part and a test part.

    :param data: 2-D array, one sample per row
    :param train_proportion: fraction of the rows (between 0 and 1) placed in
        the training part; the row count is truncated to an integer
    :param shuffle: if True the rows are randomly permuted before splitting,
        otherwise the original row order is kept
    :param seed: optional seed that makes the shuffle reproducible; when None
        the global NumPy random state is used
    :return: (training rows, test rows)
    :raises ValueError: if train_proportion lies outside [0, 1]
    """
    # Values outside [0, 1] would produce a negative or oversized split index
    # and therefore a silently wrong split.
    if not 0 <= train_proportion <= 1:
        raise ValueError(
            "train_proportion must be between 0 and 1, got {}".format(train_proportion)
        )

    n_rows = data.shape[0]
    if not shuffle:
        indices = np.arange(n_rows)
    elif seed is None:
        indices = np.random.permutation(n_rows)
    else:
        # A dedicated generator gives a repeatable permutation without
        # touching the global random state.
        indices = np.random.RandomState(seed).permutation(n_rows)

    split_index = int(train_proportion * n_rows)
    training_idx = indices[:split_index]
    test_idx = indices[split_index:]

    return data[training_idx, :], data[test_idx, :]


def cross_val(data, k=10):
    """Build the k (train, validation) pairs for k-fold cross-validation.

    The rows are cut into k equally sized consecutive chunks. Each chunk
    serves once as the validation set while the remaining chunks, joined in
    their original order, form the training set. If the number of rows is not
    divisible by k, a warning is printed and the surplus rows at the end are
    dropped.

    :param data: array, one sample per row
    :param k: number of folds, at least 2
    :return: list of k tuples (train_data, val_data)
    """
    assert k >= 2
    n_rows = data.shape[0]

    if n_rows % k != 0:
        print("warning: this dataset contains {} entries and cannot be equally divided into {} chunks for cross-validation.".format(n_rows, k))
        print("Prutruding rows will be dropped.")
        data = data[ : (n_rows // k) * k]

    chunks = np.split(data, k)
    return [
        (np.concatenate(chunks[:fold] + chunks[fold + 1:]), chunks[fold])
        for fold in range(k)
    ]

def get_labels_and_features(dataset:np.ndarray)->Tuple[np.ndarray, np.ndarray]:
    """Split a dataset matrix column-wise into its label vector and its model features.

    Column layout assumed by the split points:
      [0] .. [10]  raw input columns (age .. active), not returned
      [11]         label column (cardio)
      [12] ..      scaled columns, returned as the features

    :return: labels (flattened to 1-D), features
    """
    label_start, feature_start = 11, 12
    _raw_columns, label_column, feature_columns = np.hsplit(dataset, [label_start, feature_start])
    return label_column.flatten(), feature_columns


In [19]:
# Convert the DataFrame to a plain NumPy matrix; get_labels_and_features
# selects labels and features from it by column position.
cardio_data = df.to_numpy()
# Unshuffled split using train_test_split's default train_proportion of 0.7.
train_data, test_data = train_test_split(cardio_data, shuffle=False)
# NOTE(review): the text above announces 10-fold cross-validation, but only
# k=2 folds are built here - confirm which one is intended.
datasets = cross_val(train_data, k=2)

# only use a single dataset for now
#train_set, val_set = datasets[0]
#y, X = get_labels_and_features(train_set)

## Evaluate minimized hypothesis function $h$

In [None]:
def score(h, X, y):
    """Classification accuracy of the hypothesis h on the samples X.

    The raw output of h is thresholded at zero: h(x) >= 0 predicts class +1,
    anything else predicts class -1.

    :param h: callable returning one raw score per row of X
    :param X: feature matrix, passed unchanged to h
    :param y: true labels, expected to be encoded as -1 / +1
    :return: fraction of samples whose predicted class equals its label
    """
    predictions = h(X)

    # Decide on a class before comparing. Testing `predictions * y >= 0`
    # would count an output of exactly 0 as correct for both labels.
    predicted_labels = np.where(predictions >= 0, 1, -1)
    return np.mean(predicted_labels == y)

def train_n_score(datasets, sigma, lam):
    """Train one model per (train, validation) pair and report the mean accuracy.

    For every pair a kernel logistic regression model is fitted on the
    training part and scored on both parts. The averages over all pairs are
    printed and also returned, so that different hyperparameter settings can
    be compared programmatically.

    :param datasets: iterable of (train_set, val_set) tuples
    :param sigma: width of the squared exponential kernel
    :param lam: weight of the regularization term
    :return: (mean training accuracy, mean validation accuracy)
    """
    train_accuracy = []
    val_accuracy = []
    for train_set, val_set in datasets:
        y_train, X_train = get_labels_and_features(train_set)
        y_val, X_val = get_labels_and_features(val_set)

        h = kernel_lr(X_train, y_train, sigma=sigma, lam=lam)

        train_accuracy.append(score(h, X_train, y_train))
        val_accuracy.append(score(h, X_val, y_val))

    mean_train_accuracy = sum(train_accuracy)/len(train_accuracy)
    mean_val_accuracy = sum(val_accuracy)/len(val_accuracy)

    print(f'Average model accuracy for sigma={sigma}, lambda={lam}')
    print(f'train: {mean_train_accuracy}')
    print(f'val: {mean_val_accuracy}\n')

    return mean_train_accuracy, mean_val_accuracy


# Hyperparameter grid: kernel widths (sigma) and regularization weights (lambda).
sigmas=[0.5, 1.]
lambdas=[0.1, 1.]

# Train and score the model once per (sigma, lambda) combination; every call
# prints its average train and validation accuracy.
for sigma in sigmas:
    for lam in lambdas:
        train_n_score(datasets, sigma, lam)


Average model accuracy for sigma=0.5, lambda=0.1
train: 0.7619047619047619
val: 0.5752380952380953

Average model accuracy for sigma=0.5, lambda=1.0
train: 0.7614285714285713
val: 0.5761904761904761



TODO: measures against over and underfitting

TODO: interpretation

TODO: Outlook -> e.g. other learning algorithms, ensemble methods such as AdaBoost, ...

TODO: Metrics ->Accuracy, F1 score