In [5]:
import scipy.io
data = scipy.io.loadmat('../../data/url.mat')

**create list for labels and data, where one entry is the data for the day with this index**

In [6]:
import numpy as np

num_of_days = 120
X, Y = [], []

for i in range(num_of_days):
    day_data = data["Day" + str(i)]
    X.append(day_data[0][0][0])
    Y.append(day_data[0][0][1])

**flatten y to not be a nested array**

In [19]:
# y-data are lists of numpy-arrays

for i in range(len(Y)):
    Y[i] = [element for sublist in Y[i] for element in sublist]
   
# y-data becomes a list of lists

In [20]:
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
import numpy as np

# takes a range of days (start until to)
# fits the model with the data from the range until "to" itself
# for day "to" for each url_batch the data is refitted (prevoius days data from range + all batches up to current)
# and predicticed for the succesive batch of urls
# returns the cumulative error rate for day "to"
# batch_size determines the size of url_batches for which data is predicted and fitted on the "to" day
def train_and_evaluate(start, to, clf, batch_size = 1):
    
    prev_x = X[0][0,:] #random row for initialization purposes, spliced off later before classifying
    
    prev_y = []
    for prev_day in range(start, to):
        
        print("prev_x:")
        print(prev_x.shape)
        print(X[prev_day].shape)
        prev_x = vstack((prev_x, X[prev_day])) # stack up all matrices to previous day
        prev_y = np.concatenate((prev_y, Y[prev_day])) # stack up all labels to previous day
        print(prev_x.shape)
        
    # immediately splice off the first initial url used to initiate the matrix outside of the loop
    url_indexes_without_initial = np.arange(1, prev_x.shape[0])
    prev_x = prev_x.tocsr()[url_indexes_without_initial,:]
    print(prev_x.shape)
    
    # change X to row format for faster slicing row-wise.
    curr_day_x = X[to].tocsr()
    
    # split the data in slices of batch_size
    batches_amount = int(curr_day_x.shape[0] / batch_size)
    curr_day_y = np.array_split(Y[to], batches_amount)
    
    err = 0
    x_batches = X[0][0,:] #random row for initialization purposes, spliced off later before classifying
    y_batches = []
    for j in range(batches_amount): # looping through individual url-batches
       
        # Combine previous days data and all batches up until current
        print("parts:")
        print("current batches: {}".format(x_batches.shape))
        print("previous: {}".format(prev_x.shape))
        x_combined = vstack((prev_x, x_batches))
        #y_combined = prev_y.extend(y_batches.ravel())
        y_combined = np.append(prev_y, y_batches)
        print("unsliced comb:  {}".format(x_combined.shape))
        
        if (j == 0):
            # immediately splice off the trailing url used to initiate the matrix outside of the loop
            url_indexes_without_trailing = np.arange(0, prev_x.shape[0])
            x_combined = x_combined.tocsr()[url_indexes_without_trailing,:]
            
            
        print("sliced comb:  {}".format(x_combined.shape))
        print("y-sliced comb:  {}".format(len(y_combined)))
        
        
        # Train for cumulated data excluding current batch
        if (x_combined.shape[0] != 0):
            clf.fit(x_combined, y_combined)
    
        # splice current batch off
        select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        curr_x_batch, curr_y_batch = curr_day_x[select_ind,:], curr_day_y[j] 
        
        # Add current batch to cumulated list of batches
        x_batches = vstack((x_batches, curr_x_batch))
        if (j == 0):
            # immediately splice off the first initial url used to initiate the matrix outside of the loop
            url_indexes_without_initial = np.arange(1, x_batches.shape[0])
            x_batches = x_batches.tocsr()[url_indexes_without_initial,:]
            
        y_batches.extend(curr_y_batch)
        
        # Predict for current batch
        if (x_combined.shape[0] != 0):
            Y_preds = clf.predict(curr_x_batch)
           
            # Collect errors
            # todo replace with accuracy score
            for k in range(batch_size):
                if(Y_preds[k] != curr_y_batch[k]):
                    err = err + 1
        
    return err / curr_day_x.shape[0] # Return cumulative error rate

### Train on data of previous day and predict of successive day

In [21]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

clf = LinearSVC(C=0.01)

print("Error-rate Day {}   : {}".format(0, train_and_evaluate(0, 0, clf, 1000)))

for i in range(1, num_of_days - 1):
    # i being the current day.
    clf.fit(X[i], Y[i])
    
    # i + 1 being the next day on which the model is being tested on. 
    Y_preds = clf.predict(X[i + 1])
    print("Error-rate Day {}   : {}".format(i + 1,1 - accuracy_score(Y[i + 1], Y_preds)))

(0, 3231961)
parts:
current batches: (1, 3231961)
previous: (0, 3231961)
unsliced comb:  (1, 3231961)
sliced comb:  (0, 3231961)
y-sliced comb:  0
parts:
current batches: (1000, 3231961)
previous: (0, 3231961)
unsliced comb:  (1000, 3231961)
sliced comb:  (1000, 3231961)
y-sliced comb:  1000
parts:
current batches: (2000, 3231961)
previous: (0, 3231961)
unsliced comb:  (2000, 3231961)
sliced comb:  (2000, 3231961)
y-sliced comb:  2000




parts:
current batches: (3000, 3231961)
previous: (0, 3231961)
unsliced comb:  (3000, 3231961)
sliced comb:  (3000, 3231961)
y-sliced comb:  3000
parts:
current batches: (4000, 3231961)
previous: (0, 3231961)
unsliced comb:  (4000, 3231961)
sliced comb:  (4000, 3231961)
y-sliced comb:  4000




parts:
current batches: (5000, 3231961)
previous: (0, 3231961)
unsliced comb:  (5000, 3231961)
sliced comb:  (5000, 3231961)
y-sliced comb:  5000




parts:
current batches: (6000, 3231961)
previous: (0, 3231961)
unsliced comb:  (6000, 3231961)
sliced comb:  (6000, 3231961)
y-sliced comb:  6000




parts:
current batches: (7000, 3231961)
previous: (0, 3231961)
unsliced comb:  (7000, 3231961)
sliced comb:  (7000, 3231961)
y-sliced comb:  7000




parts:
current batches: (8000, 3231961)
previous: (0, 3231961)
unsliced comb:  (8000, 3231961)
sliced comb:  (8000, 3231961)
y-sliced comb:  8000




parts:
current batches: (9000, 3231961)
previous: (0, 3231961)
unsliced comb:  (9000, 3231961)
sliced comb:  (9000, 3231961)
y-sliced comb:  9000




parts:
current batches: (10000, 3231961)
previous: (0, 3231961)
unsliced comb:  (10000, 3231961)
sliced comb:  (10000, 3231961)
y-sliced comb:  10000




parts:
current batches: (11000, 3231961)
previous: (0, 3231961)
unsliced comb:  (11000, 3231961)
sliced comb:  (11000, 3231961)
y-sliced comb:  11000




parts:
current batches: (12000, 3231961)
previous: (0, 3231961)
unsliced comb:  (12000, 3231961)
sliced comb:  (12000, 3231961)
y-sliced comb:  12000




parts:
current batches: (13000, 3231961)
previous: (0, 3231961)
unsliced comb:  (13000, 3231961)
sliced comb:  (13000, 3231961)
y-sliced comb:  13000




parts:
current batches: (14000, 3231961)
previous: (0, 3231961)
unsliced comb:  (14000, 3231961)
sliced comb:  (14000, 3231961)
y-sliced comb:  14000




parts:
current batches: (15000, 3231961)
previous: (0, 3231961)
unsliced comb:  (15000, 3231961)
sliced comb:  (15000, 3231961)
y-sliced comb:  15000




Error-rate Day 0   : 0.0248125




Error-rate Day 2   : 0.02024999999999999




Error-rate Day 3   : 0.022800000000000042




Error-rate Day 4   : 0.02354999999999996




Error-rate Day 5   : 0.020850000000000035




Error-rate Day 6   : 0.01685000000000003




Error-rate Day 7   : 0.02375000000000005




Error-rate Day 8   : 0.021399999999999975




Error-rate Day 9   : 0.027549999999999963




Error-rate Day 10   : 0.027449999999999974




Error-rate Day 11   : 0.020399999999999974




Error-rate Day 12   : 0.022649999999999948




Error-rate Day 13   : 0.016000000000000014




Error-rate Day 14   : 0.02485000000000004




Error-rate Day 15   : 0.018750000000000044




Error-rate Day 16   : 0.021399999999999975




Error-rate Day 17   : 0.01770000000000005




Error-rate Day 18   : 0.019850000000000034




Error-rate Day 19   : 0.017249999999999988




Error-rate Day 20   : 0.02124999999999999




Error-rate Day 21   : 0.031000000000000028




Error-rate Day 22   : 0.01880000000000004




Error-rate Day 23   : 0.02564999999999995




Error-rate Day 24   : 0.03420000000000001




Error-rate Day 25   : 0.029000000000000026




Error-rate Day 26   : 0.03380000000000005




Error-rate Day 27   : 0.038799999999999946




Error-rate Day 28   : 0.024349999999999983




Error-rate Day 29   : 0.026599999999999957




Error-rate Day 30   : 0.03215000000000001




Error-rate Day 31   : 0.030000000000000027




Error-rate Day 32   : 0.022750000000000048




Error-rate Day 33   : 0.023249999999999993




Error-rate Day 34   : 0.02510000000000001




Error-rate Day 35   : 0.25175000000000003




Error-rate Day 36   : 0.01990000000000003




Error-rate Day 37   : 0.0




Error-rate Day 38   : 0.35645000000000004




Error-rate Day 39   : 0.03500000000000003




Error-rate Day 40   : 0.02795000000000003




Error-rate Day 41   : 0.024700000000000055




Error-rate Day 42   : 0.032950000000000035




Error-rate Day 43   : 0.026150000000000007




Error-rate Day 44   : 0.01685000000000003




Error-rate Day 45   : 0.023076923076923106
Error-rate Day 46   : 0.07640000000000002




Error-rate Day 47   : 0.024950000000000028




Error-rate Day 48   : 0.03444999999999998




Error-rate Day 49   : 0.030100000000000016




Error-rate Day 50   : 0.027649999999999952




Error-rate Day 51   : 0.02344999999999997




Error-rate Day 52   : 0.03005000000000002




Error-rate Day 53   : 0.03480000000000005




Error-rate Day 54   : 0.024499999999999966




Error-rate Day 55   : 0.02749999999999997




Error-rate Day 56   : 0.02585000000000004




Error-rate Day 57   : 0.03154999999999997




Error-rate Day 58   : 0.020499999999999963




Error-rate Day 59   : 0.021700000000000053




Error-rate Day 60   : 0.02354999999999996




Error-rate Day 61   : 0.01915




Error-rate Day 62   : 0.025800000000000045




Error-rate Day 63   : 0.02839999999999998




Error-rate Day 64   : 0.019100000000000006




Error-rate Day 65   : 0.02785000000000004




Error-rate Day 66   : 0.03059999999999996




Error-rate Day 67   : 0.02529999999999999




Error-rate Day 68   : 0.02080000000000004




Error-rate Day 69   : 0.021750000000000047




Error-rate Day 70   : 0.023499999999999965




Error-rate Day 71   : 0.02705000000000002




Error-rate Day 72   : 0.018000000000000016




Error-rate Day 73   : 0.019549999999999956




Error-rate Day 74   : 0.01759999999999995




Error-rate Day 75   : 0.029750000000000054




Error-rate Day 76   : 0.02729999999999999




Error-rate Day 77   : 0.024399999999999977




Error-rate Day 78   : 0.025050000000000017




Error-rate Day 79   : 0.029449999999999976




Error-rate Day 80   : 0.02375000000000005




Error-rate Day 81   : 0.02554999999999996




Error-rate Day 82   : 0.01649999999999996




Error-rate Day 83   : 0.018399999999999972




Error-rate Day 84   : 0.02234999999999998




Error-rate Day 85   : 0.022800000000000042




Error-rate Day 86   : 0.018399999999999972




Error-rate Day 87   : 0.01715




Error-rate Day 88   : 0.01849999999999996




Error-rate Day 89   : 0.013900000000000023




Error-rate Day 90   : 0.011399999999999966




Error-rate Day 91   : 0.02070000000000005




Error-rate Day 92   : 0.020100000000000007




Error-rate Day 93   : 0.018850000000000033




Error-rate Day 94   : 0.021700000000000053




Error-rate Day 95   : 0.009800000000000031




Error-rate Day 96   : 0.017850000000000033




Error-rate Day 97   : 0.017549999999999955




Error-rate Day 98   : 0.023499999999999965




Error-rate Day 99   : 0.020100000000000007




Error-rate Day 100   : 0.021750000000000047




Error-rate Day 101   : 0.023299999999999987




Error-rate Day 102   : 0.024449999999999972




Error-rate Day 103   : 0.019100000000000006




Error-rate Day 104   : 0.019399999999999973




Error-rate Day 105   : 0.02585000000000004




Error-rate Day 106   : 0.021499999999999964




Error-rate Day 107   : 0.018449999999999966




Error-rate Day 108   : 0.029249999999999998




Error-rate Day 109   : 0.023950000000000027




Error-rate Day 110   : 0.018100000000000005




Error-rate Day 111   : 0.01859999999999995




Error-rate Day 112   : 0.021050000000000013




Error-rate Day 113   : 0.01980000000000004




Error-rate Day 114   : 0.01859999999999995




Error-rate Day 115   : 0.022050000000000014




Error-rate Day 116   : 0.03359999999999996




Error-rate Day 117   : 0.023900000000000032




Error-rate Day 118   : 0.02429999999999999
Error-rate Day 119   : 0.022599999999999953




In [7]:
X_curr_day = X[0]
X_curr_day.trace()
# 44.16392850597378

44.16392850597378

In [8]:
X_curr_day = X[1]
X_curr_day.trace()
#48.57159821360516

48.57159821360516