In [2]:
import scipy.io
data = scipy.io.loadmat('../../data/url.mat')

**create list for labels and data, where one entry is the data for the day with this index**

In [3]:
import numpy as np

num_of_days = 120
X, Y = [], []

for i in range(num_of_days):
    day_data = data["Day" + str(i)]
    X.append(day_data[0][0][0])
    Y.append(day_data[0][0][1])

**continous learn classifier**

### single urls

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(random_state=123)

err = 0
for i in range(1): # looping through days
    
    # change X to row format for faster slicing row-wise.
    X_curr_day, Y_curr_day = X[i].tocsr(), Y[i]
    
    for j in range(X_curr_day.shape[0]): # looping through individual urls
        X_curr_url, Y_curr_url = X_curr_day[j,:], Y_curr_day[j] 
        
        if (j > 0):
            if (clf.predict(X_curr_url) != Y_curr_url):
                err = err + 1
        
        clf.partial_fit(X_curr_url, Y_curr_url, classes=list(range(2))) # Continous fitting of urls and label
print(err)

In [4]:
### url batches (n=100)
from datetime import datetime
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(random_state=123)
batch_size = 1000
err = 0

start = datetime.now().time() # time object
print("start =", start)

for i in range(num_of_days): # looping through days
    
   if (i != 45):
        # change X to row format for faster slicing row-wise.
        X_curr_day = X[i].tocsr()
    
        # split the data in slices of batch_size
        batches_amount = int(X_curr_day.shape[0] / batch_size)
        Y_curr_day = np.array_split(Y[i], batches_amount)
    
        for j in range(batches_amount): # looping through individual urls
            select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        
            X_curr_url_batch, Y_curr_url_batch = X_curr_day[select_ind,:], Y_curr_day[j] 
        
            # flatten y to 1d
            Y_curr_url_batch = Y_curr_url_batch.ravel()
        
            if (j > 0):
                Y_preds = clf.predict(X_curr_url_batch)
            
                for k in range(batch_size):
                    if(Y_preds[k] != Y_curr_url_batch[k]):
                        err = err + 1
        
    
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label
        
        print("Error-rate Day {}   : {}".format(i,err / X_curr_day.shape[0]))
        #print(i)
        #print(err / X_curr_day.shape[0])
        err = 0
end = datetime.now().time() # time object
print("end =", end)
    # batch size 100, first 15 days
    # 0.033125 -> 0.021 (lowest day14 0.0162)
    
    # batch size 100 all days (exc45) -> 6.5min
    # 0.033125 -> 0.01335 (0.0064 96)

start = 16:51:59.129908
Error-rate Day 0   : 0.039625
Error-rate Day 1   : 0.0307
Error-rate Day 2   : 0.02155
Error-rate Day 3   : 0.02515
Error-rate Day 4   : 0.02945
Error-rate Day 5   : 0.02165
Error-rate Day 6   : 0.01945
Error-rate Day 7   : 0.0309
Error-rate Day 8   : 0.0322
Error-rate Day 9   : 0.0329
Error-rate Day 10   : 0.0268
Error-rate Day 11   : 0.01985
Error-rate Day 12   : 0.0221
Error-rate Day 13   : 0.01675
Error-rate Day 14   : 0.02635
Error-rate Day 15   : 0.01985
Error-rate Day 16   : 0.02045
Error-rate Day 17   : 0.01595
Error-rate Day 18   : 0.0194
Error-rate Day 19   : 0.01875
Error-rate Day 20   : 0.0195
Error-rate Day 21   : 0.0209
Error-rate Day 22   : 0.0199
Error-rate Day 23   : 0.02135
Error-rate Day 24   : 0.02635
Error-rate Day 25   : 0.0327
Error-rate Day 26   : 0.0192
Error-rate Day 27   : 0.01715
Error-rate Day 28   : 0.0275
Error-rate Day 29   : 0.0219
Error-rate Day 30   : 0.03205
Error-rate Day 31   : 0.02915
Error-rate Day 32   : 0.02815
Error-rat

**evaluate accuracy**

from sklearn.metrics import accuracy_score

Y_test_preds = []
for j in range(len(X_test)): ## Looping through test batches for making predictions
    Y_preds = clf.predict(X_test[j])
    Y_test_preds.extend(Y_preds.tolist())

# flatten y_test
Y_test = [element for sublist in Y_test for element in sublist]
print("Test Accuracy      : {}".format(accuracy_score(Y_test, Y_test_preds)))