In [1]:
import scipy.io
data = scipy.io.loadmat('data/url.mat')

**create list for labels and data, where one entry is the data for the day with this index**

In [11]:
import numpy as np

num_of_days = 120
X, Y = [], []

for i in range(num_of_days):
    day_data = data["Day" + str(i)]
    X.append(day_data[0][0][0])
    Y.append(day_data[0][0][1])

**continous learn classifier**

### single urls

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(random_state=123)

err = 0
for i in range(1): # looping through days
    
    # change X to row format for faster slicing row-wise.
    X_curr_day, Y_curr_day = X[i].tocsr(), Y[i]
    
    for j in range(X_curr_day.shape[0]): # looping through individual urls
        X_curr_url, Y_curr_url = X_curr_day[j,:], Y_curr_day[j] 
        
        if (j > 0):
            if (clf.predict(X_curr_url) != Y_curr_url):
                err = err + 1
        
        clf.partial_fit(X_curr_url, Y_curr_url, classes=list(range(2))) # Continous fitting of urls and label
print(err)

In [18]:
### url batches (n=100)

from sklearn.linear_model import PassiveAggressiveClassifier
from datetime import datetime

clf = PassiveAggressiveClassifier(random_state=123)
batch_size = 100
err = 0

start = datetime.now().time() # time object
print("start =", start)

for i in range(num_of_days): # looping through days
    
    if (i != 45):
        # change X to row format for faster slicing row-wise.
        fixed_features_amount = 150000 #150k features for training
        select_ind = np.arange(0, fixed_features_amount)
        X_curr_day = X[i][:,select_ind].tocsr()
    
        # split the data in slices of batch_size
        batches_amount = int(X_curr_day.shape[0] / batch_size)
        Y_curr_day = np.array_split(Y[i], batches_amount)
    
        for j in range(batches_amount): # looping through individual urls
            select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        
            X_curr_url_batch, Y_curr_url_batch = X_curr_day[select_ind,:], Y_curr_day[j] 
        
            # flatten y to 1d
            Y_curr_url_batch = Y_curr_url_batch.ravel()
        
            if (j > 0):
                Y_preds = clf.predict(X_curr_url_batch)
            
                for k in range(batch_size):
                    if(Y_preds[k] != Y_curr_url_batch[k]):
                        err = err + 1
        
    
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label
        print(i)
        print(err / X_curr_day.shape[0])
        err = 0

end = datetime.now().time() # time object
print("end =", end)
    # batch size 100, first 15 days
    # 0.035 -> 0.02155 (lowest day 14: 0.0155)
    # batch size 100, all days (except 45:broken)
    # 0.035 -> 0.0124 (0.0119 day110)

0
0.035
1
0.02915
2
0.0266
3
0.0256
4
0.0279
5
0.02225
6
0.01915
7
0.0278
8
0.02735
9
0.03105
10
0.02725
11
0.02215
12
0.02345
13
0.0155
14
0.02155
15
0.01745
16
0.019
17
0.0174
18
0.0196
19
0.01575
20
0.02015
21
0.0237
22
0.02185
23
0.02335
24
0.02615
25
0.0294
26
0.01795
27
0.02035
28
0.02255
29
0.0228
30
0.02955
31
0.0239
32
0.02285
33
0.02325
34
0.00025
35
0.02905
36
0.00105
37
0.0004
38
0.0348
39
0.0348
40
0.02635
41
0.01975
42
0.0267
43
0.0219
44
0.0138
46
0.023
47
0.01985
48
0.02555
49
0.02805
50
0.02685
51
0.02075
52
0.02415
53
0.02615
54
0.01995
55
0.0198
56
0.02175
57
0.01895
58
0.01685
59
0.01595
60
0.0183
61
0.0173
62
0.02035
63
0.0213
64
0.01655
65
0.0222
66
0.01835
67
0.01795
68
0.01835
69
0.01445
70
0.01915
71
0.02425
72
0.01935
73
0.02135
74
0.017
75
0.0197
76
0.02165
77
0.0171
78
0.01865
79
0.02575
80
0.02185
81
0.0198
82
0.012
83
0.01205
84
0.0159
85
0.01785
86
0.01335
87
0.01265
88
0.013
89
0.0113
90
0.0082
91
0.012
92
0.0158
93
0.0164
94
0.0148
95
0.00755
96
0.01115