In [2]:
import scipy.io
data = scipy.io.loadmat('data/url.mat')

**create list for labels and data, where one entry is the data for the day with this index**

In [3]:
import numpy as np

num_of_days = 120
X, Y = [], []

for i in range(num_of_days):
    day_data = data["Day" + str(i)]
    X.append(day_data[0][0][0])
    Y.append(day_data[0][0][1])

**continous learn classifier**

### single urls

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(random_state=123)

err = 0
for i in range(1): # looping through days
    
    # change X to row format for faster slicing row-wise.
    X_curr_day, Y_curr_day = X[i].tocsr(), Y[i]
    
    for j in range(X_curr_day.shape[0]): # looping through individual urls
        X_curr_url, Y_curr_url = X_curr_day[j,:], Y_curr_day[j] 
        
        if (j > 0):
            if (clf.predict(X_curr_url) != Y_curr_url):
                err = err + 1
        
        clf.partial_fit(X_curr_url, Y_curr_url, classes=list(range(2))) # Continous fitting of urls and label
print(err)

In [10]:
### url batches (n=100)
from datetime import datetime
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(random_state=123)
batch_size = 100
err = 0

start = datetime.now().time() # time object
print("start =", start)

for i in range(num_of_days): # looping through days
    
   if (i != 45):
        # change X to row format for faster slicing row-wise.
        X_curr_day = X[i].tocsr()
    
        # split the data in slices of batch_size
        batches_amount = int(X_curr_day.shape[0] / batch_size)
        Y_curr_day = np.array_split(Y[i], batches_amount)
    
        for j in range(batches_amount): # looping through individual urls
            select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        
            X_curr_url_batch, Y_curr_url_batch = X_curr_day[select_ind,:], Y_curr_day[j] 
        
            # flatten y to 1d
            Y_curr_url_batch = Y_curr_url_batch.ravel()
        
            if (j > 0):
                Y_preds = clf.predict(X_curr_url_batch)
            
                for k in range(batch_size):
                    if(Y_preds[k] != Y_curr_url_batch[k]):
                        err = err + 1
        
    
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label
        print(i)
        print(err / X_curr_day.shape[0])
        err = 0
end = datetime.now().time() # time object
print("end =", end)
    # batch size 100, first 15 days
    # 0.033125 -> 0.021 (lowest day14 0.0162)
    
    # batch size 100 all days (exc45) -> 6.5min
    # 0.033125 -> 0.01335 (0.0064 96)

start = 17:04:31.220078
0
0.033125
1
0.0301
2
0.02805
3
0.02535
4
0.02705
5
0.02425
6
0.0198
7
0.02775
8
0.02675
9
0.03105
10
0.02655
11
0.0218
12
0.02445
13
0.0162
14
0.021
15
0.01595
16
0.02005
17
0.0173
18
0.0192
19
0.0162
20
0.02055
21
0.02235
22
0.0217
23
0.02195
24
0.02595
25
0.02795
26
0.0161
27
0.01945
28
0.02275
29
0.02275
30
0.02845
31
0.025
32
0.02195
33
0.0224
34
0.00015
35
0.02955
36
0.0014
37
0.00035
38
0.03415
39
0.0354
40
0.02505
41
0.0193
42
0.027
43
0.02075
44
0.01345
46
0.01915
47
0.01925
48
0.02545
49
0.0267
50
0.02675
51
0.01975
52
0.0231
53
0.02345
54
0.01855
55
0.01835
56
0.02
57
0.0162
58
0.0163
59
0.0146
60
0.016
61
0.0173
62
0.01835
63
0.0211
64
0.014
65
0.02005
66
0.0173
67
0.0163
68
0.0161
69
0.01295
70
0.018
71
0.02235
72
0.0182
73
0.01955
74
0.0163
75
0.0191
76
0.0202
77
0.01495
78
0.0178
79
0.0236
80
0.02065
81
0.01845
82
0.01
83
0.01045
84
0.0144
85
0.01725
86
0.01305
87
0.0112
88
0.013
89
0.0095
90
0.0084
91
0.0111
92
0.0145
93
0.01435
94
0.0139
95
0.00

**evaluate accuracy**

In [None]:
from sklearn.metrics import accuracy_score

Y_test_preds = []
for j in range(len(X_test)): ## Looping through test batches for making predictions
    Y_preds = clf.predict(X_test[j])
    Y_test_preds.extend(Y_preds.tolist())

# flatten y_test
Y_test = [element for sublist in Y_test for element in sublist]
print("Test Accuracy      : {}".format(accuracy_score(Y_test, Y_test_preds)))