In [1]:
import scipy.io
# Load the MATLAB file; later cells read one entry per day from `data`
# under the keys "Day0", "Day1", ...
data = scipy.io.loadmat('../../url.mat')

**Create one list for the features (`X`) and one for the labels (`Y`); the entry at index *i* holds the data for day *i*.**

In [2]:
import numpy as np

num_of_days = 120

# Each "Day<i>" entry wraps a single record at position [0][0]; the record's
# first field is collected into X and its second field into Y, so X[i] and
# Y[i] both belong to day i.
day_records = [data["Day" + str(i)][0][0] for i in range(num_of_days)]
X = [record[0] for record in day_records]
Y = [record[1] for record in day_records]

**Flatten each day's labels in `Y` so that every entry is a flat list instead of a nested array.**

In [3]:
# Before: every Y[i] is a nested array (one inner array per label).
# After:  every Y[i] is a flat list of labels.
#
# np.ravel accepts both the nested and the already-flat form, so re-running
# this cell is harmless; iterating the nested levels by hand would fail on a
# second run because the scalars of a flat list are not iterable.
for i in range(len(Y)):
    Y[i] = list(np.ravel(Y[i]))

### Train once on data for day 0

In [4]:
from sklearn.svm import LinearSVC

# Linear SVM created with the constructor's default arguments.
clf = LinearSVC()

# Fit on the data of day 0 only; as the last expression of the cell, the
# fitted estimator is also what the cell displays.
clf.fit(X[0], Y[0])



LinearSVC()

In [5]:
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
import numpy as np

# takes a range of days (start until to)
# fits the model with the data from the range until "to" itself
# for day "to" for each url_batch the data is refitted (prevoius days data from range + all batches up to current)
# and predicticed for the succesive batch of urls
# returns the cumulative error rate for day "to"
# batch_size determines the size of url_batches for which data is predicted and fitted on the "to" day
def train_and_evaluate(start, to, clf, batch_size = 1):
    
    prev_x = X[0][0,:] #random row for initialization purposes, spliced off later before classifying
    prev_y = []
    for prev_day in range(start, to):
        prev_x = vstack((prev_x, X[prev_day])) # stack up all matrices to previous day
        prev_y = np.concatenate((prev_y, Y[prev_day])) # stack up all labels to previous day
        
        
    # immediately splice off the first initial url used to initiate the matrix outside of the loop
    url_indexes_without_initial = np.arange(1, prev_x.shape[0])
    prev_x = prev_x.tocsr()[url_indexes_without_initial,:]
    
    # change X to row format for faster slicing row-wise.
    curr_day_x = X[to].tocsr()
    
    # split the data in slices of batch_size
    batches_amount = int(curr_day_x.shape[0] / batch_size)
    curr_day_y = np.array_split(Y[to], batches_amount)
    
    err = 0
    x_batches = X[0][0,:] #random row for initialization purposes, spliced off later before classifying
    y_batches = []
    for j in range(batches_amount): # looping through individual url-batches
       
        # Combine previous days data and all batches up until current
        print(type(prev_x))
        x_combined = vstack(prev_x, x_batches)
        y_combined = prev_y.extend(y_batches.ravel())
        
        if (j == 0):
            # immediately splice off the first initial url used to initiate the matrix outside of the loop
            x_combined = x_combined[np.arange(1, x_combined.shape[0]),:]
            
        # Train for cumulated data excluding current batch
        clf.fit(x_combined, y_combined)
    
        # splice current batch off
        select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        curr_x_batch, curr_y_batch = curr_day_x[select_ind,:], curr_day_y[j] 
        
        # Add current batch to cumulated list of batches
        x_batches = vstack(x_batches, curr_x_batch)
        y_batches.extend(curr_y_batch)
        
        # Predict for current batch
        Y_preds = clf.predict(curr_x_batch)
        
        # if(j > 0):    
        # Collect errors
        # todo replace with accuracy score
        for k in range(batch_size):
            if(Y_preds[k] != curr_y_batch[k]):
                err = err + 1
        
    
        #clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label

        #print("Error-rate Day {}   : {}".format(i,err / X_curr_day.shape[0]))
    return err / curr_day_x.shape[0]


### Evaluate for all days

In [6]:
from sklearn.metrics import accuracy_score

#print("Error-rate Day {}   : {}".format(0, train_and_evaluate(0, 1, clf, 1000)))

# Score the already fitted classifier on every day after day 0 and report
# each day's error rate (1 - accuracy); the model is not refitted here.
for i in range(1, num_of_days):
    day_error_rate = 1 - accuracy_score(Y[i], clf.predict(X[i]))
    print("Error-rate Day {}   : {}".format(i, day_error_rate))


# This ufunc error is the same one that already bothered me last night.
# Ignoring it for now, I guess.

Error-rate Day 1   : 0.02554999999999996
Error-rate Day 2   : 0.019000000000000017
Error-rate Day 3   : 0.021649999999999947
Error-rate Day 4   : 0.026050000000000018
Error-rate Day 5   : 0.023050000000000015
Error-rate Day 6   : 0.02510000000000001
Error-rate Day 7   : 0.030750000000000055
Error-rate Day 8   : 0.028449999999999975
Error-rate Day 9   : 0.03354999999999997
Error-rate Day 10   : 0.03090000000000004
Error-rate Day 11   : 0.02980000000000005
Error-rate Day 12   : 0.023850000000000038
Error-rate Day 13   : 0.019449999999999967
Error-rate Day 14   : 0.026800000000000046
Error-rate Day 15   : 0.0242
Error-rate Day 16   : 0.027599999999999958
Error-rate Day 17   : 0.022750000000000048
Error-rate Day 18   : 0.023599999999999954
Error-rate Day 19   : 0.021299999999999986
Error-rate Day 20   : 0.026549999999999963
Error-rate Day 21   : 0.03474999999999995
Error-rate Day 22   : 0.03325
Error-rate Day 23   : 0.03334999999999999
Error-rate Day 24   : 0.03554999999999997
Error-rate D