In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline
import math
import timeit

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [2]:
df = pd.read_csv('kyphosis.csv')

In [3]:
df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kyphosis  81 non-null     object
 1   Age       81 non-null     int64 
 2   Number    81 non-null     int64 
 3   Start     81 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.7+ KB


In [5]:
df.describe()

Unnamed: 0,Age,Number,Start
count,81.0,81.0,81.0
mean,83.654321,4.049383,11.493827
std,58.104251,1.619423,4.883962
min,1.0,2.0,1.0
25%,26.0,3.0,9.0
50%,87.0,4.0,13.0
75%,130.0,5.0,16.0
max,206.0,10.0,18.0


In [6]:
def convert(x):
    if x == 'absent':
        return 0
    else:
        return 1

In [7]:
df['Kyphosis'] = df['Kyphosis'].apply(convert)

In [8]:
df = df.rename(columns = {'Kyphosis':'Class'})

In [9]:
df_0 = df[df['Class'] == 0]

In [10]:
df_1 = df[df['Class'] == 1]

In [11]:
len(df_1)/len(df)

0.20987654320987653

## Stochastic Proximal AUC Maximization

In [12]:
class SPAUC:
    
    def __init__(self, feat,lamb, C,theta):
        self.n_p = 0
        self.n_n = 0
        self.feat = feat
        self.s_p = np.zeros(self.feat)
        self.s_n = np.zeros(self.feat)
        self.w = np.zeros(self.feat)
        self.t = 0
        self.lamb = lamb
        self.C = C
        self.theta = theta

    def grad(self, x, y, p, u, v):
        if y == 1:   
            return(2*(1-p)*sum((x-u)**2)*self.w + 2*p*(1-p)*(v-u) +2*p*(1-p)*sum((u-v)**2)*self.w)
        else:
            return(2*p*sum((x-v)**2)*self.w + 2*p*(1-p)*(v-u) +2*p*(1-p)*sum((u-v)**2)*self.w)
    
    def step(self):
        return 1/(2*self.C*(self.t+1)**self.theta)
    
    def new_w(self, gradient):
        return((self.w-self.step()*gradient)/(2*self.lamb*self.step()+1))
    
    def f(self,x):
        return sum(self.w*x)
    
    def optimize(self, feat_x, y):
        for j in range(len(feat_x)):
            if y[j] == 1:
                self.n_p += 1
                self.s_p += feat_x[j,]
            else:
                self.n_n += 1
                self.s_n += feat_x[j,]
        
            if self.n_p == 0:
                u = np.zeros(self.feat)
            else:
                u = self.s_p/self.n_p
            if self.n_n == 0:
                v = np.zeros(self.feat)
            else:
                v = self.s_n/self.n_n
            
            ind = random.choices(range(len(feat_x)), k = 1)
            g = self.grad(feat_x[ind,], y[ind], self.n_p/(self.t+1), u, v)
            self.w = self.new_w(g)
            self.t += 1
        return(self.w)
    
    def AUC(self,X_test,y_test):
        pred = []
        for i in range(len(X_test)):
            pred.append(self.f(X_test[i,]))
        return roc_auc_score(y_test, pred)
            

In [13]:
def C(df, lamb):
    scaler.fit(df)
    df = scaler.transform(df)
    k_sq = max(1, 16*max(np.square(df).sum(axis=1)))
    return max(4*lamb, k_sq)

**Scaling data**

In [16]:
scaler.fit(df.drop('Class', axis = 1))
data_0 = scaler.transform(df_0.drop('Class', axis = 1))
data_1 = scaler.transform(df_1.drop('Class', axis = 1))

**Cross-Validation on lambda and theta**

In [19]:
lamb = [0,.0001,.001,.01,.1,1,10,100,1000,10000]
theta = [0.5,0.75,1]
auc = np.zeros([len(lamb),len(theta)])
x = 50
y = 3
for k in range(x):
    
    X_train0, X_test0, y_train0, y_test0 = train_test_split(data_0, np.zeros(len(data_0)), test_size = .3)
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data_1, np.ones(len(data_1)), test_size = .3)
    X_train = np.concatenate([X_train0, X_train1])
    y_train = np.concatenate([y_train0, y_train1])
    X_test = np.concatenate([X_test0, X_test1])
    y_test = np.concatenate([y_test0, y_test1])
    
    #randomize
    ind = random.choices(range(len(X_train)), k = len(X_train))
    X_train = X_train[ind]
    y_train = y_train[ind]
    
    for i in range(len(lamb)):
        C1 = C(df.drop('Class', axis = 1), lamb[i])
        for j in range(len(theta)):
            s = SPAUC(y, lamb[i], C1, theta[j])
            a = [0]
            while True:
                s.optimize(X_train,y_train)
                a.append(s.AUC(X_train,y_train))
                if abs(a[-1] - a[-2]) <.001:
                    break
            auc[i,j] += s.AUC(X_test, y_test)
auc = auc/x

In [20]:
auc

array([[0.80466667, 0.8005    , 0.79466667],
       [0.80466667, 0.8005    , 0.79466667],
       [0.80466667, 0.8005    , 0.79483333],
       [0.80466667, 0.80066667, 0.79466667],
       [0.80483333, 0.80033333, 0.79466667],
       [0.80583333, 0.8005    , 0.79483333],
       [0.80533333, 0.8035    , 0.796     ],
       [0.80533333, 0.806     , 0.80033333],
       [0.80533333, 0.80516667, 0.8005    ],
       [0.80533333, 0.80516667, 0.8005    ]])

**Best Parameters: lambda = 100 and theta = .75**

In [21]:
lamb = 100
theta = .75
C1 = C(df.drop('Class', axis = 1), lamb)
x = 150
y = 3
auc = np.zeros(150)
run_time = np.zeros(150)
for k in range(150):
    
    X_train0, X_test0, y_train0, y_test0 = train_test_split(data_0, np.zeros(len(data_0)), test_size = .3)
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data_1, np.ones(len(data_1)), test_size = .3)
    X_train = np.concatenate([X_train0, X_train1])
    y_train = np.concatenate([y_train0, y_train1])
    X_test = np.concatenate([X_test0, X_test1])
    y_test = np.concatenate([y_test0, y_test1])
    
    #randomize
    ind = random.choices(range(len(X_train)), k = len(X_train))
    X_train = X_train[ind,]
    y_train = y_train[ind]
    
    
    s = SPAUC(y, lamb, C1, theta)
    a = [0]
    
    start = timeit.default_timer()
    while True:
        s.optimize(X_train,y_train)
        a.append(s.AUC(X_train,y_train))
        if abs(a[-1] - a[-2]) <.001:
            break
    stop = timeit.default_timer()
    auc[k] = s.AUC(X_test,y_test)
    run_time[k] = stop-start

In [22]:
np.mean(auc)

0.8285555555555555

In [23]:
np.std(auc)

0.07538387357427137

In [24]:
auc

array([0.775     , 0.83333333, 0.78333333, 0.80833333, 0.725     ,
       0.9       , 0.86666667, 0.68333333, 0.80833333, 0.89166667,
       0.75      , 0.81666667, 0.875     , 0.75833333, 0.825     ,
       0.79166667, 0.93333333, 0.80833333, 0.85      , 0.86666667,
       0.98333333, 0.70833333, 0.775     , 0.86666667, 0.71666667,
       0.90833333, 0.93333333, 0.78333333, 0.9       , 0.675     ,
       0.74166667, 0.86666667, 0.85      , 0.94166667, 0.76666667,
       0.775     , 0.85833333, 0.80833333, 0.83333333, 0.94166667,
       0.9       , 0.85833333, 0.8       , 0.91666667, 0.81666667,
       0.75      , 0.75833333, 0.81666667, 0.74166667, 0.71666667,
       0.75833333, 0.825     , 0.83333333, 1.        , 0.80833333,
       0.79166667, 0.9       , 0.80833333, 0.76666667, 0.85833333,
       0.90833333, 0.73333333, 0.84166667, 0.71666667, 0.78333333,
       0.83333333, 0.83333333, 0.80833333, 0.875     , 0.89166667,
       0.90833333, 0.83333333, 0.76666667, 0.79166667, 0.89166

In [25]:
run_time.mean()

0.009854876073335768

In [26]:
run_time.std()

0.004887744370943725

In [27]:
run_time

array([0.03536418, 0.0083234 , 0.01010707, 0.00564105, 0.00519891,
       0.01239758, 0.00479792, 0.0070981 , 0.00980665, 0.00589774,
       0.00736239, 0.01578507, 0.0094723 , 0.00492519, 0.00936945,
       0.00945886, 0.00683794, 0.00939702, 0.00695938, 0.00682509,
       0.01088944, 0.00647618, 0.01147874, 0.01744123, 0.01397517,
       0.03027803, 0.01191231, 0.00931882, 0.0076292 , 0.00456315,
       0.00467158, 0.00733297, 0.00911906, 0.00470662, 0.01401508,
       0.00943772, 0.01160545, 0.00459009, 0.00648535, 0.00466425,
       0.00903148, 0.01122421, 0.01420216, 0.01170379, 0.00698661,
       0.0117935 , 0.00467078, 0.00935622, 0.00457091, 0.00678957,
       0.00449806, 0.00940995, 0.01554726, 0.00940127, 0.01187473,
       0.00465018, 0.01526667, 0.01520556, 0.00999949, 0.00467438,
       0.0095511 , 0.00706802, 0.00519338, 0.01209202, 0.00728825,
       0.00914923, 0.00906854, 0.00721458, 0.00922575, 0.00665601,
       0.00658817, 0.00468456, 0.01193229, 0.00682203, 0.01203