In [57]:
import scipy.io
# Load the MATLAB file; the cells below read its "Day<i>" entries from `data`.
data = scipy.io.loadmat('../../data/url.mat')

**Create one list for the data (`X`) and one for the labels (`Y`); the entry at index `i` holds the data for day `i`.**

In [58]:
import numpy as np

# Number of per-day entries ("Day0", "Day1", ...) read from the loaded file.
num_of_days = 120

# X[d] / Y[d] receive the first / second field of the record stored for day d.
X = []
Y = []
for day_index in range(num_of_days):
    day_record = data["Day{}".format(day_index)][0][0]
    X.append(day_record[0])
    Y.append(day_record[1])

In [59]:
# Flatten every day's label array to one dimension.
for day_index, day_labels in enumerate(Y):
    Y[day_index] = day_labels.ravel()

**Continuously (incrementally) trained classifier**

In [60]:
def learn_incremental(clf, batch_size = 1000, skip_days = (45,)):
    """Train ``clf`` incrementally, day by day, and return per-day error rates.

    Each day of the global ``X`` / ``Y`` lists is cut into consecutive batches
    of ``batch_size`` rows (the last batch of a day may be shorter). Every
    batch except the first one of a day is predicted with the current model
    and the mistakes are counted; afterwards the batch is passed to
    ``clf.partial_fit``.

    Data and labels are sliced with the same row range, so they stay aligned
    even when the number of rows is not a multiple of ``batch_size``.

    Parameters
    ----------
    clf : classifier
        Must provide ``partial_fit(X, y, classes=...)`` and ``predict(X)``.
    batch_size : int
        Number of rows per batch; must be at least 1.
    skip_days : collection of int
        Day indexes that are neither trained on nor scored. They get no entry
        in the result. The default keeps skipping day 45.

    Returns
    -------
    list of float
        One value per processed day: mistakes counted on that day divided by
        the total number of rows of that day.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    error_rates = []

    for curr_day in range(len(X)): # looping through days
        if curr_day in skip_days:
            continue

        X_curr_day = X[curr_day]
        Y_curr_day = np.asarray(Y[curr_day]).ravel()
        rows_in_day = X_curr_day.shape[0]
        err = 0

        for batch_start in range(0, rows_in_day, batch_size): # looping through url batches
            batch_stop = min(batch_start + batch_size, rows_in_day)
            X_curr_url_batch = X_curr_day[batch_start:batch_stop]
            Y_curr_url_batch = Y_curr_day[batch_start:batch_stop]

            # NOTE(review): the first batch of every day is never scored, even
            # when the model was already trained on earlier days, while its rows
            # still count in the denominator below. Kept as in the original.
            if batch_start > 0:
                Y_preds = clf.predict(X_curr_url_batch)
                err += int(np.count_nonzero(Y_preds != Y_curr_url_batch))

            # Continuous fitting of urls and labels
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=[0, 1])

        # A day without rows cannot produce mistakes; avoid dividing by zero.
        rate = err / rows_in_day if rows_in_day else 0.0
        print("Log: Day {}: {}".format(curr_day, rate))
        error_rates.append(rate)

    return error_rates

In [61]:
from scipy.sparse import vstack
from scipy.sparse import csr_matrix

def train_and_evaluate(start, to, clf, batch_size = 1):
    """Refit ``clf`` batch by batch on day ``to`` and return that day's error rate.

    The training set always starts with the complete data of the days
    ``start`` .. ``to - 1`` (taken from the global ``X`` / ``Y`` lists). Day
    ``to`` is cut into consecutive batches of ``batch_size`` rows (the last
    batch may be shorter). For every batch the classifier is refitted from
    scratch on the previous days plus all batches of day ``to`` that precede
    the current one, and the current batch is then predicted.

    If there is nothing to train on yet (no previous days and first batch),
    the batch is neither fitted on nor scored.

    Parameters
    ----------
    start : int
        First day of the training range (inclusive).
    to : int
        Day that is evaluated; also the exclusive end of the training range.
    clf : classifier
        Must provide ``fit(X, y)`` and ``predict(X)``.
    batch_size : int
        Number of rows of day ``to`` per fit/predict step; must be at least 1.

    Returns
    -------
    float
        Number of mispredicted rows divided by the number of rows of day
        ``to`` (0.0 if that day has no rows).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Row format makes the row-wise slicing below cheap.
    curr_day_x = X[to].tocsr()
    curr_day_y = np.asarray(Y[to]).ravel()
    rows_in_day = curr_day_x.shape[0]
    if rows_in_day == 0:
        return 0.0

    # Stack all previous days once; they are part of every training set.
    prev_days = range(start, to)
    if len(prev_days) > 0:
        prev_x = vstack([X[day] for day in prev_days], format="csr")
        prev_y = np.concatenate([np.asarray(Y[day]).ravel() for day in prev_days])
    else:
        prev_x, prev_y = None, None

    err = 0
    for batch_start in range(0, rows_in_day, batch_size): # looping through url batches
        batch_stop = min(batch_start + batch_size, rows_in_day)

        # Rows of the current day that were already seen (all earlier batches).
        seen_x = curr_day_x[:batch_start]
        seen_y = curr_day_y[:batch_start]

        if prev_x is None:
            train_x, train_y = seen_x, seen_y
        elif batch_start == 0:
            train_x, train_y = prev_x, prev_y
        else:
            train_x = vstack((prev_x, seen_x), format="csr")
            train_y = np.concatenate((prev_y, seen_y))

        # Nothing to learn from yet: skip fitting and scoring this batch.
        if train_x.shape[0] == 0:
            continue

        # Train on the cumulated data excluding the current batch, then score it.
        clf.fit(train_x, train_y)
        Y_preds = clf.predict(curr_day_x[batch_start:batch_stop])
        err += int(np.count_nonzero(Y_preds != curr_day_y[batch_start:batch_stop]))

    return err / rows_in_day # cumulative error rate of day "to"

### Batch-size and other global variables and imports:

In [62]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Number of URLs handled per batch by the experiment cells below.
batch_size = 4000
# Number of days of training data used for SVM-multi and SVM-multi-once.
train_set_size = 17

# Error rates of the different classifiers; None until the matching cell has run.
error_rates_pa = error_rates_svm_once = error_rates_svm_daily = None
error_rates_svm_multi_once = error_rates_svm_multi = None

In [1]:
# SVM-multi-once: score each of the first `train_set_size` days, training on
# all days before it (plus the already seen batches of the day itself).
clf = LinearSVC(C=0.01)
error_rates_svm_multi_once = []

for day in range(train_set_size):
    day_rate = train_and_evaluate(0, day, clf, batch_size)
    error_rates_svm_multi_once.append(day_rate)
    print("Error-rate Day {}   : {}".format(day, day_rate))

NameError: name 'LinearSVC' is not defined

### SVM-once
**Evaluate for all days**

In [None]:
# SVM-once: fit on day 0 only and use that single model for every later day.
# A dedicated classifier is created here so the cell does not depend on
# whichever `clf` a previously executed cell left behind.
clf = LinearSVC(C=0.01)

error_rates_svm_once = []
rate = train_and_evaluate(0, 0, clf, batch_size)
print("Error-rate Day {}   : {}".format(0, rate))
error_rates_svm_once.append(rate)

# train_and_evaluate() fits before scoring each batch, so its last fit does not
# include the final batch of day 0. Refit on the complete day before moving on.
clf.fit(X[0], Y[0])

for i in range(1, num_of_days):
    Y_preds = clf.predict(X[i])
    rate = 1 - accuracy_score(Y[i], Y_preds)
    error_rates_svm_once.append(rate)
    print("Error-rate Day {}   : {}".format(i,rate))

### SVM-daily
**Train on the data of the previous day and predict the following day**

In [None]:
clf = LinearSVC(C=0.01)

# Day 0 has no previous day, so it is scored batch-wise on its own data.
error_rates_svm_daily = []
error_rates_svm_daily.append(train_and_evaluate(0, 0, clf, batch_size))
print("Error-rate Day {}   : {}".format(0, error_rates_svm_daily[0]))

# Start at day 0 so that day 1 (model trained on day 0) is scored as well;
# starting at 1 left day 1 without an entry and the list one element short.
for i in range(num_of_days - 1):
    # i being the current day.
    clf.fit(X[i], Y[i])

    # i + 1 being the next day on which the model is being tested on.
    Y_preds = clf.predict(X[i + 1])
    rate = 1 - accuracy_score(Y[i + 1], Y_preds)
    error_rates_svm_daily.append(rate)
    print("Error-rate Day {}   : {}".format(i + 1, rate))

### SVM-multi-once

**Train once on the data of days 0-16 (`train_set_size` days) and evaluate on those days**

**Evaluate for the remaining days**

In [None]:
# Fit a dedicated model once on the first `train_set_size` days. Using the
# shared `clf` here would score whatever model the previously executed cell
# left behind instead of the SVM-multi-once model.
clf_multi_once = LinearSVC(C=0.01)
clf_multi_once.fit(
    vstack(X[:train_set_size], format="csr"),
    np.concatenate([labels.ravel() for labels in Y[:train_set_size]]),
)

# Keep only the rates of the training days so re-running this cell does not
# append the remaining days a second time.
error_rates_svm_multi_once = error_rates_svm_multi_once[:train_set_size]

for i in range(train_set_size, num_of_days):
    Y_preds = clf_multi_once.predict(X[i])
    rate = 1 - accuracy_score(Y[i], Y_preds)
    error_rates_svm_multi_once.append(rate)
    print("Error-rate Day {}   : {}".format(i,rate))

### SVM-multi
**Train on the data of the previous `train_set_size` days (fewer for the first days) and predict the following day**

In [None]:
# SVM-multi: for every day, train on a sliding window of the preceding days
# and score the day itself batch by batch.
clf = LinearSVC(C=0.01)
error_rates_svm_multi = []

for curr_day in range(num_of_days):
    # The window covers the `train_set_size` days before curr_day (fewer for
    # the first days); train_and_evaluate() treats curr_day as exclusive end.
    lower_bound = max(0, curr_day - train_set_size)
    rate = train_and_evaluate(lower_bound, curr_day, clf, batch_size)
    error_rates_svm_multi.append(rate)
    print("Error-rate Day {}   : {}".format(curr_day, rate))

### Passive Aggressive

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(C=0.001, random_state = 123)

error_rates_pa = learn_incremental(clf, batch_size)

# learn_incremental() skips day 45, so the returned list has no entry for it.
# Label each rate with the day it belongs to instead of its list position.
evaluated_days = [day for day in range(num_of_days) if day != 45]
for day, rate in zip(evaluated_days, error_rates_pa):
    print("Error-rate Day {}   : {}".format(day, rate))

In [None]:
def _calc_line(x, y):
    
    # create polynomial equation and calculate line
    theta = np.polyfit(x, y, 5)
    return theta[5] + theta[4] * pow(x, 1) + theta[3] * pow(x, 2) + theta[2] * pow(x, 3) + theta[1] * pow(x, 4) + theta[0] * pow(x, 5)

In [None]:
def _plot(y, color, scatter):
    """Draw the fitted trend line of one error-rate series on the current figure.

    Parameters
    ----------
    y : array-like
        Error rates as fractions, one per day. Either 120 values, or 119
        values when the series has no entry for day 45.
    color : str
        Matplotlib colour used for the line and the optional points.
    scatter : bool
        If true, the individual data points are drawn in addition to the line.

    Raises
    ------
    ValueError
        If ``y`` has neither 120 nor 119 entries.
    """
    day_indexes = np.arange(0, 120)
    y = np.asarray(y, dtype=float)

    if y.shape[0] == day_indexes.shape[0] - 1:
        # One entry short: assumed to be a series without day 45, the day
        # learn_incremental() skips. Remove that index so that every rate
        # stays on its own day instead of padding the end with a dummy point.
        day_indexes = np.delete(day_indexes, 45)
    elif y.shape[0] != day_indexes.shape[0]:
        raise ValueError(
            "expected {} or {} error rates, got {}".format(
                day_indexes.shape[0], day_indexes.shape[0] - 1, y.shape[0]))

    y_percent = y * 100
    plt.plot(day_indexes, _calc_line(day_indexes, y_percent), color)

    if (scatter):
        # Scatter the measured rates (not the fitted values); the colour has
        # to be passed by keyword because the third positional argument of
        # plt.scatter is the marker size.
        plt.scatter(day_indexes, y_percent, color=color)

In [None]:
import matplotlib.pyplot as plt

def plot5degree(error_rates_pa, error_rates_svm_once, error_rates_svm_daily,
                error_rates_svm_multi_once, error_rates_svm_multi,
                batch_size, scatter = False):
    """Plot the trend lines of all available classifiers in one figure.

    Every series that is not ``None`` is drawn by ``_plot`` in its own colour:
    passive aggressive red, SVM-once black, SVM-daily magenta,
    SVM-multi-once blue and SVM-multi green.

    Parameters
    ----------
    error_rates_pa, error_rates_svm_once, error_rates_svm_daily,
    error_rates_svm_multi_once, error_rates_svm_multi : sequence of float or None
        Per-day error rates of the respective classifier; ``None`` skips it.
    batch_size : int
        Only used for the figure title.
    scatter : bool
        Forwarded to ``_plot``; also draw the individual points.
    """
    series_and_colors = (
        (error_rates_pa, 'r'),
        (error_rates_svm_once, 'k'),
        (error_rates_svm_daily, 'm'),
        (error_rates_svm_multi_once, 'b'),
        (error_rates_svm_multi, 'g'),
    )

    for rates, color in series_and_colors:
        if rates is not None:
            _plot(np.array(rates), color, scatter)

    plt.title('Experiment 1 with batch size {}'.format(batch_size))
    plt.xlabel('Days')
    plt.ylabel('Cumulative error rate')
    plt.ylim([0,4])
    plt.show()

In [None]:
# Pass every result list explicitly; series that were not computed stay None
# and are skipped by plot5degree().
plot5degree(error_rates_pa, error_rates_svm_once, error_rates_svm_daily,
            error_rates_svm_multi_once, error_rates_svm_multi,
            batch_size, True)

In [None]:
import matplotlib.pyplot as plt

def plot8degree(error_rates, batch_size, scatter = False):
    """Plot one error-rate series together with a fitted degree-8 polynomial.

    Parameters
    ----------
    error_rates : sequence of float
        Per-day error rates as fractions; they are shown multiplied by 100.
    batch_size : int
        Only used for the figure title.
    scatter : bool
        If true, the individual data points are drawn as well.
    """
    x = np.arange(0, 120)
    y = np.array(error_rates, dtype=float) * 100

    if (y.shape[0] < x.shape[0]):
        # A shorter series is assumed to lack day 45, the day
        # learn_incremental() skips, so index 45 is the one to remove.
        x = np.delete(x, 45)

    # Fit and evaluate in floating point; integer powers of the day indexes
    # can overflow where the default integer type is 32 bits wide.
    x_float = x.astype(float)
    theta = np.polyfit(x_float, y, 8)
    y_line = np.polyval(theta, x_float)

    if (scatter):
        plt.scatter(x, y)

    plt.plot(x, y_line, 'r')
    plt.title('Experiment 1 with batch size {}'.format(batch_size))
    plt.xlabel('Days')
    plt.ylabel('Cumulative error rate')
    plt.ylim([0,4])
    plt.show()

In [None]:
# `error_rates` is not defined anywhere in this notebook. The passive-aggressive
# series is plotted here on the assumption that it is the one intended - confirm.
plot8degree(error_rates_pa, batch_size, True)