In [1]:
import scipy.io
# Read the MATLAB file; the per-day variables ("Day0", "Day1", ...) are looked up by name below.
url_mat_path = '../../data/url.mat'
data = scipy.io.loadmat(url_mat_path)

**Create one list for the data (`X`) and one for the labels (`Y`); entry `i` of each list holds the values for day `i`.**

In [2]:
import numpy as np

num_of_days = 120

# Every "Day<i>" variable is unwrapped with [0][0]; its element 0 is collected in X
# and its element 1 in Y (used as features and labels further below).
X = [data["Day" + str(day)][0][0][0] for day in range(num_of_days)]
Y = [data["Day" + str(day)][0][0][1] for day in range(num_of_days)]

In [3]:
# Flatten each day's label array to 1-D, replacing the entries of Y in place.
Y[:] = [day_labels.ravel() for day_labels in Y]

**Continuously learn the classifier**

In [4]:
def learn_incremental(clf, batch_size = 1000):
    """Train `clf` online on mini-batches and record one error rate per day.

    Walks through the module-level per-day lists `X` (feature matrices) and
    `Y` (labels) in day order. Each mini-batch is first predicted with the
    current model and then handed to `clf.partial_fit` (test-then-train).
    Only the very first batch overall is not scored, because the classifier
    has not been fitted at that point. Day 45 is skipped entirely (the reason
    is not stated in this function).

    Parameters
    ----------
    clf : estimator
        Must provide `predict(X)` and `partial_fit(X, y, classes=...)`.
    batch_size : int, default 1000
        Rows per mini-batch. The last batch of a day is shorter when the
        day's row count is not a multiple of `batch_size`.

    Returns
    -------
    list of float
        One value per processed day: misclassified rows of that day divided
        by the day's total row count. The counter restarts every day, so the
        values are per-day rates, not cumulative ones.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    error_rates = []
    num_of_days = len(X)
    skipped_day = 45
    fitted = False  # predict() is only valid after the first partial_fit()

    for curr_day in range(num_of_days): # looping through days
        if curr_day == skipped_day:
            continue

        X_curr_day = X[curr_day]
        # Flatten so that column-vector labels compare element-wise with predictions.
        Y_curr_day = np.asarray(Y[curr_day]).ravel()
        n_rows = X_curr_day.shape[0]
        err = 0

        # Slice features and labels with the SAME bounds so rows and labels
        # can never drift apart, and so trailing rows are not dropped.
        for start in range(0, n_rows, batch_size):
            stop = min(start + batch_size, n_rows)
            X_curr_url_batch = X_curr_day[start:stop, :]
            Y_curr_url_batch = Y_curr_day[start:stop]

            if fitted:
                Y_preds = np.asarray(clf.predict(X_curr_url_batch)).ravel()
                err += int(np.count_nonzero(Y_preds != Y_curr_url_batch))

            # Continuous fitting of urls and labels
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2)))
            fitted = True

        print("Log: Day {}: {}".format(curr_day ,err / n_rows))
        error_rates.append(err / n_rows)
    return error_rates

In [5]:
def learn_interval(clf):
    """Train `clf` one whole day at a time and record one error rate per day.

    Uses the module-level per-day lists `X` and `Y`. From day 1 on, the
    complete day is predicted with the model trained so far before the day is
    passed to `clf.partial_fit`. Day 0 is only trained on, so its recorded
    rate is 0. Day 45 is skipped.

    `clf` must provide `predict(X)` and `partial_fit(X, y, classes=...)`.
    Returns a list with one entry per processed day: misclassified rows
    divided by the day's row count (per-day values, not cumulative).
    """
    daily_error_rates = []
    total_days = 120

    for day in range(total_days):
        if day == 45:
            continue

        day_features = X[day]
        day_labels = Y[day]

        mistakes = 0
        if day > 0:
            predictions = clf.predict(day_features)
            mistakes = sum(
                1
                for idx in range(len(predictions))
                if predictions[idx] != day_labels[idx]
            )

        # Continuous fitting: the model keeps what it learned on earlier days.
        clf.partial_fit(day_features, day_labels.ravel(), classes=[0, 1])

        daily_error_rates.append(mistakes / day_features.shape[0])

    return daily_error_rates

### Batch size, other global variables, and imports

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
import matplotlib.pyplot as plt

batch_size = 4

# Result holders for the four experiments. Each stays None until its experiment
# cell has run; plot8degree() leaves out every series that is still None.
error_rates_pa = error_rates_percep = None
error_rates_pa_interval = error_rates_percep_interval = None

### Perceptron

In [None]:
# Perceptron trained batch by batch with learn_incremental().
clf = Perceptron(random_state = 123)

error_rates_percep = learn_incremental(clf, batch_size)

# learn_incremental() skips day 45, so from list position 45 onwards the entry
# belongs to the following day. Pair each rate with its real day number
# instead of its position in the list.
evaluated_days = [day for day in range(120) if day != 45]
for day, rate in zip(evaluated_days, error_rates_percep):
    print("Error-rate Day {}   : {}".format(day, rate))

### Passive Aggressive

In [None]:
# Passive-Aggressive classifier trained batch by batch with learn_incremental().
clf = PassiveAggressiveClassifier(C=0.001, random_state = 123)

error_rates_pa = learn_incremental(clf, batch_size)

# learn_incremental() skips day 45, so from list position 45 onwards the entry
# belongs to the following day. Pair each rate with its real day number
# instead of its position in the list.
evaluated_days = [day for day in range(120) if day != 45]
for day, rate in zip(evaluated_days, error_rates_pa):
    print("Error-rate Day {}   : {}".format(day, rate))

### Perceptron Interval

In [None]:
# Perceptron trained one whole day at a time with learn_interval().
clf = Perceptron(random_state = 123)

error_rates_percep_interval = learn_interval(clf)

# learn_interval() skips day 45, so from list position 45 onwards the entry
# belongs to the following day. Pair each rate with its real day number
# instead of its position in the list.
evaluated_days = [day for day in range(120) if day != 45]
for day, rate in zip(evaluated_days, error_rates_percep_interval):
    print("Error-rate Day {}   : {}".format(day, rate))

### Passive Aggressive Interval

In [None]:
# Passive-Aggressive classifier trained one whole day at a time with learn_interval().
clf = PassiveAggressiveClassifier(C=0.001, random_state = 123)

error_rates_pa_interval = learn_interval(clf)

# learn_interval() skips day 45, so from list position 45 onwards the entry
# belongs to the following day. Pair each rate with its real day number
# instead of its position in the list.
evaluated_days = [day for day in range(120) if day != 45]
for day, rate in zip(evaluated_days, error_rates_pa_interval):
    print("Error-rate Day {}   : {}".format(day, rate))

In [None]:
def cleanup(x, y, lower=0.5, upper=5):
    """Remove outlier points from a pair of equally long 1-D sequences.

    A point is an outlier when its `y` value is greater than `upper` or less
    than `lower`. The outlier is removed from `y` together with the `x` value
    at the same position, so the remaining (x, y) pairs stay aligned.

    Parameters
    ----------
    x, y : array-like, 1-D, same length
        Coordinates of the points.
    lower, upper : float, defaults 0.5 and 5
        Inclusive bounds of the accepted `y` range.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The filtered `x` and `y`.

    Raises
    ------
    ValueError
        If `x` and `y` differ in length.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same length")

    # Negated outlier test (rather than lower <= y <= upper) so that NaN
    # values, which compare False either way, are kept as before.
    keep = ~((y > upper) | (y < lower))
    return x[keep], y[keep]

In [None]:
def _calc_line(x, y):
    
    # create polynomial equation and calculate line
    theta = np.polyfit(x, y, 8)
    return theta[8] + theta[7] * pow(x, 1) + theta[6] * pow(x, 2) + theta[5] * pow(x, 3) + theta[4] * pow(x, 4) + theta[3] * pow(x, 5) + theta[2] * pow(x, 6) + theta[1] * pow(x, 7) + theta[0] * pow(x, 8)

In [None]:
def _plot(y, color, marker, linestyle, scatter, label):
    
    # Set up the day_indexes with the missing 45th day in mind
    x = np.arange(0, 120) 
    
    # attend to data size imbalances (day 45 outages)
    if (y.shape[0] < x.shape[0]):
        x = x[:-1]
        
    y = y * 100
    x, y = cleanup(x, y)
    
    if (scatter):
        plt.scatter(x, y, c=color)
    
    y = _calc_line(x, y)
    plt.plot(x, y, "{}{}{}".format(marker, color, linestyle), markevery=5, label=label)

In [None]:
def plot8degree(error_rates_pa, error_rates_percep,
                error_rates_pa_interval, error_rates_percep_interval,
                batch_size, scatter = False):
    """Plot every available error-rate series and save the figure as SVG.

    Each `error_rates_*` argument is either a sequence of per-day error rates
    or None; None series are left out. `batch_size` only appears in the
    title, which is also used as the output file name. `scatter` is passed on
    to `_plot` for every series.
    """
    # (series, colour, marker, line style, legend label) in drawing order
    curves = (
        (error_rates_pa, 'r', 'v', '-', "PA"),
        (error_rates_percep, 'g', 'o', '-', "Perceptron"),
        (error_rates_percep_interval, 'b', 'x', '--', "Percep. (int.)"),
        (error_rates_pa_interval, 'm', '^', '--', "PA (int.)"),
    )
    for rates, color, marker, linestyle, label in curves:
        if rates is None:
            continue
        _plot(np.array(rates), color, marker, linestyle, scatter, label)

    figure_title = 'Experiment 3 (interval) with batch size {}'.format(batch_size)
    plt.title(figure_title)
    plt.xlabel('Days')
    plt.ylabel('Cumulative error rate (%)')
    plt.ylim([1,4])
    plt.xlim([0,100])
    plt.legend()

    output_file = "{}.svg".format(figure_title)
    plt.savefig(output_file, format='svg', dpi=1200)

In [None]:
# Draw every series that has been computed so far (fitted curves only, no scatter points).
plot8degree(
    error_rates_pa,
    error_rates_percep,
    error_rates_pa_interval,
    error_rates_percep_interval,
    batch_size,
    scatter=False,
)

In [None]:
## Does benefit from cleanup