In [2]:
import scipy.io
data = scipy.io.loadmat('../../data/url.mat')

**create list for labels and data, where one entry is the data for the day with this index**

In [3]:
import numpy as np

num_of_days = 120
X, Y = [], []

for i in range(num_of_days):
    day_data = data["Day" + str(i)]
    X.append(day_data[0][0][0])
    Y.append(day_data[0][0][1])

**continous learn classifier**

### single urls

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(random_state=123)

err = 0
for i in range(1): # looping through days
    
    # change X to row format for faster slicing row-wise.
    X_curr_day, Y_curr_day = X[i].tocsr(), Y[i]
    
    for j in range(X_curr_day.shape[0]): # looping through individual urls
        X_curr_url, Y_curr_url = X_curr_day[j,:], Y_curr_day[j] 
        
        if (j > 0):
            if (clf.predict(X_curr_url) != Y_curr_url):
                err = err + 1
        
        clf.partial_fit(X_curr_url, Y_curr_url, classes=list(range(2))) # Continous fitting of urls and label
print(err)

In [6]:
### url batches (n=100)

from sklearn.linear_model import Perceptron
from datetime import datetime

clf = Perceptron(random_state=123)
batch_size = 100
err = 0

start = datetime.now().time() # time object
print("start =", start)

for i in range(num_of_days): # looping through days
    
    if (i != 45):
        # change X to row format for faster slicing row-wise.
        fixed_features_amount = 150000 #150k features for training
        select_ind = np.arange(0, fixed_features_amount)
        X_curr_day = X[i][:,select_ind].tocsr()
    
        # split the data in slices of batch_size
        batches_amount = int(X_curr_day.shape[0] / batch_size)
        Y_curr_day = np.array_split(Y[i], batches_amount)
    
        for j in range(batches_amount): # looping through individual urls
            select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        
            X_curr_url_batch, Y_curr_url_batch = X_curr_day[select_ind,:], Y_curr_day[j] 
        
            # flatten y to 1d
            Y_curr_url_batch = Y_curr_url_batch.ravel()
        
            if (j > 0):
                Y_preds = clf.predict(X_curr_url_batch)
            
                for k in range(batch_size):
                    if(Y_preds[k] != Y_curr_url_batch[k]):
                        err = err + 1
        
    
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label
        print(i)
        print(err / X_curr_day.shape[0])
        err = 0

end = datetime.now().time() # time object
print("end =", end)
    # batch size 100, first 15 days
    # 0.0486875 -> 0.0241 (0.02105 day14)
    
    # batch size 100, all days (except 45:broken)
    # 0.0486875 -> 0.0154 ()

start = 17:03:51.841272
0
0.0486875
1
0.0353
2
0.0306
3
0.0289
4
0.0323
5
0.02635
6
0.02515
7
0.03175
8
0.02925
9
0.03725
10
0.03095
11
0.0251
12
0.0274
13
0.02105
14
0.0241
15
0.0207
16
0.02285
17
0.0202
18
0.0229
19
0.01685
20
0.02295
21
0.02875
22
0.02455
23
0.02655
24
0.03065
25
0.0352
26
0.02105
27
0.02385
28
0.02665
29
0.0245
30
0.0311
31
0.0289
32
0.0261
33
0.0274
34
0.0004
35
0.0327
36
0.00175
37
0.0009
38
0.0393
39
0.0376
40
0.02995
41
0.02395
42
0.0284
43
0.02635
44
0.0147
46
0.02495
47
0.022
48
0.02805
49
0.0317
50
0.03185
51
0.02325
52
0.02715
53
0.0301
54
0.02085
55
0.0227
56
0.02315
57
0.01995
58
0.02005
59
0.01775
60
0.0212
61
0.01825
62
0.0219
63
0.0246
64
0.01895
65
0.0258
66
0.0201
67
0.02045
68
0.0209
69
0.0165
70
0.02055
71
0.026
72
0.02075
73
0.0239
74
0.0197
75
0.02265
76
0.0225
77
0.0175
78
0.02075
79
0.0269
80
0.02595
81
0.02275
82
0.0129
83
0.0142
84
0.0179
85
0.0199
86
0.0148
87
0.0147
88
0.01525
89
0.0116
90
0.00955
91
0.014
92
0.01935
93
0.01765
94
0.01615
9

IndexError: list index out of range

**evaluate accuracy**

In [5]:
from sklearn.metrics import accuracy_score

Y_test_preds = []
for j in range(len(X_test)): ## Looping through test batches for making predictions
    Y_preds = clf.predict(X_test[j])
    Y_test_preds.extend(Y_preds.tolist())

# flatten y_test
Y_test = [element for sublist in Y_test for element in sublist]
print("Test Accuracy      : {}".format(accuracy_score(Y_test, Y_test_preds)))

NameError: name 'X_test' is not defined

In [4]:
# incremental learns classifier (must have partial_fit() function)
# returns an array of cumulative error rates for each day
def learn_incremental(clf, batch_size = 1000):
    
    error_rates = []
    num_of_days = 120
    err = 0
    
    for curr_day in range(num_of_days): # looping through days
    
        # change X to row format for faster slicing row-wise.
        X_curr_day = X[i].tocsr()
    
        # split the data in slices of batch_size
        batches_amount = int(X_curr_day.shape[0] / batch_size)
        Y_curr_day = np.array_split(Y[i], batches_amount)
    
        for j in range(batches_amount): # looping through individual urls
            select_ind = np.arange(j * batch_size, (j+1) * batch_size)
        
            X_curr_url_batch, Y_curr_url_batch = X_curr_day[select_ind,:], Y_curr_day[j] 
        
    
            # flatten y to 1d
            Y_curr_url_batch = Y_curr_url_batch.ravel()
        
            if (j > 0):
                Y_preds = clf.predict(X_curr_url_batch)
            
                for k in range(batch_size):
                    if(Y_preds[k] != Y_curr_url_batch[k]):
                        err = err + 1
        
    
            clf.partial_fit(X_curr_url_batch, Y_curr_url_batch, classes=list(range(2))) # Continous fitting of urls and label
        
        print("Error-rate Day {}   : {}".format(curr_day, err / X_curr_day.shape[0]))
        error_rates.append(err / X_curr_day.shape[0])
        err = 0
    return error_rates

In [5]:
from sklearn.linear_model import Perceptron

clf = Perceptron(random_state = 123)

error_rates = learn_incremental(clf, 100)

Error-rate Day 0   : 0.0426
Error-rate Day 1   : 0.0254
Error-rate Day 2   : 0.02015
Error-rate Day 3   : 0.0162
Error-rate Day 4   : 0.01475
Error-rate Day 5   : 0.0121
Error-rate Day 6   : 0.0126
Error-rate Day 7   : 0.01075
Error-rate Day 8   : 0.01045
Error-rate Day 9   : 0.0087
Error-rate Day 10   : 0.008
Error-rate Day 11   : 0.01
Error-rate Day 12   : 0.00805
Error-rate Day 13   : 0.0053
Error-rate Day 14   : 0.0064
Error-rate Day 15   : 0.0053
Error-rate Day 16   : 0.00435
Error-rate Day 17   : 0.0058
Error-rate Day 18   : 0.0049
Error-rate Day 19   : 0.00525
Error-rate Day 20   : 0.004
Error-rate Day 21   : 0.00385
Error-rate Day 22   : 0.0027
Error-rate Day 23   : 0.00365
Error-rate Day 24   : 0.00435
Error-rate Day 25   : 0.00315
Error-rate Day 26   : 0.0046
Error-rate Day 27   : 0.00405
Error-rate Day 28   : 0.00295
Error-rate Day 29   : 0.00305
Error-rate Day 30   : 0.00205
Error-rate Day 31   : 0.0041
Error-rate Day 32   : 0.00435
Error-rate Day 33   : 0.00315
Error-rate 