In [1]:
import os, time, csv
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix

In [6]:
def explore(target_file, separator=',', fieldnames=None, binary_features=list(), numeric_features=list(), max_rows=20000):
    """
    Create MinMaxScaler and DictVectorizer from online stream
    
    Parameters:
    target file: stream file
    separator: delimiter
    fieldnames: field label (optional)
    binary_features: list of qualitative features
    numeric_features: list of numeric features
    max_rows: a number of row from stream file (None is possible)
    """
    features = dict()
    min_max = dict()
    vectorizer = DictVectorizer(sparse=False)
    scaler = MinMaxScaler()
    with open(target_file, 'r') as R:
        iterator = csv.DictReader(R, fieldnames, delimiter=separator)
        for n, row in enumerate(iterator):
            #data exploration
            for k,v in row.items():
                if k in binary_features:
                    if k+'_'+v not in features:
                        features[k+'_'+v] = 0
                elif k in numeric_features:
                    v = float(v)
                    if k not in features:
                        features[k] = 0
                        min_max[k] = [v,v]
                    else:
                        if v < min_max[k][0]:
                            min_max[k][0] = v
                        elif v > min_max[k][1]:
                            min_max[k][1] = v
                else:
                    pass
            if max_rows and n > max_rows:
                break

    vectorizer.fit([features])
    A = vectorizer.transform([{f:0 if f not in min_max else min_max[f][0] for f in vectorizer.feature_names_},\
                            {f:1 if f not in min_max else min_max[f][1] for f in vectorizer.feature_names_}])
    scaler.fit(A)
    return vectorizer, scaler

    

In [17]:
def pull_examples(target_file, vectorizer, binary_features, numeric_features, target, min_max=None, separator=',', \
                 fieldnames=None, sparse=True):
    """
    return generator from online stream
    """
    with open(target_file, 'r') as R:
        iterator = csv.DictReader(R, fieldnames, delimiter=separator)
        for n, row in enumerate(iterator):
            
            #data processing
            stream_row = {}
            response = np.array([float(row[target])])
            for k,v in row.items():
                if k in binary_features:
                    stream_row[k+'_'+v]=1.0
                else:
                    if k in numeric_features:
                        stream_row[k] = float(v)
            if min_max:
                features = min_max.transform(vectorizer.transform([stream_row]))
            else:
                features = vectorizer.transform([stream_row])
            if sparse:
                yield(csr_matrix(features),response,n)
            else:
                yield(features, response, n)

In [18]:
source = '/datasets/bikesharing/hour.csv'
local_path = os.getcwd()
b_vars = ['holiday', 'hr', 'mnth', 'season', 'weathersit', 'weekday', 'workingday', 'yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
std_row, min_max = explore(target_file=local_path+'/'+source, binary_features=b_vars, numeric_features=n_vars)
print('Feature: ')
for f,mv,mx in zip(std_row.feature_names_, min_max.data_min_, min_max.data_max_):
    print('%s:[%0.2f,%0.2f] ' %(f,mv,mx))

Feature: 
atemp:[0.00,1.00] 
holiday_0:[0.00,1.00] 
holiday_1:[0.00,1.00] 
hr_0:[0.00,1.00] 
hr_1:[0.00,1.00] 
hr_10:[0.00,1.00] 
hr_11:[0.00,1.00] 
hr_12:[0.00,1.00] 
hr_13:[0.00,1.00] 
hr_14:[0.00,1.00] 
hr_15:[0.00,1.00] 
hr_16:[0.00,1.00] 
hr_17:[0.00,1.00] 
hr_18:[0.00,1.00] 
hr_19:[0.00,1.00] 
hr_2:[0.00,1.00] 
hr_20:[0.00,1.00] 
hr_21:[0.00,1.00] 
hr_22:[0.00,1.00] 
hr_23:[0.00,1.00] 
hr_3:[0.00,1.00] 
hr_4:[0.00,1.00] 
hr_5:[0.00,1.00] 
hr_6:[0.00,1.00] 
hr_7:[0.00,1.00] 
hr_8:[0.00,1.00] 
hr_9:[0.00,1.00] 
hum:[0.00,1.00] 
mnth_1:[0.00,1.00] 
mnth_10:[0.00,1.00] 
mnth_11:[0.00,1.00] 
mnth_12:[0.00,1.00] 
mnth_2:[0.00,1.00] 
mnth_3:[0.00,1.00] 
mnth_4:[0.00,1.00] 
mnth_5:[0.00,1.00] 
mnth_6:[0.00,1.00] 
mnth_7:[0.00,1.00] 
mnth_8:[0.00,1.00] 
mnth_9:[0.00,1.00] 
season_1:[0.00,1.00] 
season_2:[0.00,1.00] 
season_3:[0.00,1.00] 
season_4:[0.00,1.00] 
temp:[0.02,1.00] 
weathersit_1:[0.00,1.00] 
weathersit_2:[0.00,1.00] 
weathersit_3:[0.00,1.00] 
weathersit_4:[0.00,1.00] 
weekday_0

In [24]:
from sklearn.linear_model import SGDRegressor
SGD = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001, penalty=None, random_state=1, average=True)
val_rmse = 0
val_rmsle = 0
predictions_start = 16000

def apply_log(x): return np.log(x + 1.0)
def apply_exp(x): return np.exp(x) - 1.0

for x,y,n in pull_examples(target_file=local_path+'/'+source, vectorizer=std_row, \
                          min_max=min_max, binary_features=b_vars, numeric_features=n_vars, target='cnt'):
    y_log = apply_log(y)
    #machine learning
    if (n+1) >= predictions_start:
        #holdout after N phase
        predicted = SGD.predict(x)
        val_rmse += (apply_exp(predicted)-y)**2
        val_rmsle += (predicted - y_log)**2
        if(n-predictions_start+1)% 250 == 0 and (n+1) > predictions_start:
            print(n, end='')
            print('%s holdout RMSE: %0.3f' % (time.strftime('%X'), (val_rmse/float(n-predictions_start+1))**0.5), end='')
            print('holdout RMSLE: %0.3f'%((val_rmsle/float(n-predictions_start+1))**0.5))
    else:
        #learning phase
        SGD.partial_fit(x,y_log)
print('%s FINAL holdout RMSE: %0.3f' % (time.strftime('X'), (val_rmse/float(n-predictions_start+1))**0.5))
print('%s FINAL holdout RMSLE: %0.3f' % (time.strftime('X'), (val_rmsle/float(n-predictions_start+1))**0.5))



1624914:57:04 holdout RMSE: 276.604holdout RMSLE: 1.796
1649914:57:04 holdout RMSE: 250.419holdout RMSLE: 1.706
1674914:57:04 holdout RMSE: 250.639holdout RMSLE: 1.694
1699914:57:04 holdout RMSE: 249.561holdout RMSLE: 1.702
1724914:57:05 holdout RMSE: 234.840holdout RMSLE: 1.640
X FINAL holdout RMSE: 224.404
X FINAL holdout RMSLE: 1.594


In [28]:
source = 'datasets/shuffled_covtype.data'
local_path = os.getcwd()
n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]
std_row, min_max = explore(target_file=local_path+'/'+source, binary_features=list(), \
                           fieldnames=n_vars+['covertype'], numeric_features=n_vars, \
                          max_rows=50000)
print('Feature: ')
for f,mv,mx in zip(std_row.feature_names_, min_max.data_min_, min_max.data_max_):
    print('%s:[%0.2f,%0.2f]' % (f,mv,mx))

Feature: 
var_00:[1860.00,3845.00]
var_01:[0.00,360.00]
var_02:[0.00,54.00]
var_03:[0.00,1348.00]
var_04:[-166.00,589.00]
var_05:[0.00,7097.00]
var_06:[0.00,254.00]
var_07:[87.00,254.00]
var_08:[0.00,253.00]
var_09:[0.00,7110.00]
var_10:[0.00,1.00]
var_11:[0.00,1.00]
var_12:[0.00,1.00]
var_13:[0.00,1.00]
var_14:[0.00,1.00]
var_15:[0.00,1.00]
var_16:[0.00,1.00]
var_17:[0.00,1.00]
var_18:[0.00,1.00]
var_19:[0.00,1.00]
var_20:[0.00,1.00]
var_21:[0.00,1.00]
var_22:[0.00,1.00]
var_23:[0.00,1.00]
var_24:[0.00,1.00]
var_25:[0.00,1.00]
var_26:[0.00,1.00]
var_27:[0.00,1.00]
var_28:[0.00,0.00]
var_29:[0.00,1.00]
var_30:[0.00,1.00]
var_31:[0.00,1.00]
var_32:[0.00,1.00]
var_33:[0.00,1.00]
var_34:[0.00,1.00]
var_35:[0.00,1.00]
var_36:[0.00,1.00]
var_37:[0.00,1.00]
var_38:[0.00,1.00]
var_39:[0.00,1.00]
var_40:[0.00,1.00]
var_41:[0.00,1.00]
var_42:[0.00,1.00]
var_43:[0.00,1.00]
var_44:[0.00,1.00]
var_45:[0.00,1.00]
var_46:[0.00,1.00]
var_47:[0.00,1.00]
var_48:[0.00,1.00]
var_49:[0.00,1.00]
var_50:[0.

In [33]:
from sklearn.linear_model import SGDClassifier
SGD = SGDClassifier(loss='hinge', penalty=None, random_state=1, average=True)
accuracy = 0
accuracy_record = list()
predictions_start = 50
sample = 5000
early_stop = 50000

for x,y,n in pull_examples(target_file=local_path+'/'+source, vectorizer=std_row, min_max=min_max, \
                          binary_features = list(), numeric_features=n_vars, 
                           fieldnames=n_vars+['covertype'], target='covertype'):
    #learning phase
    if n>predictions_start:
        accuracy += int(int(SGD.predict(x))==y[0])
        if n % sample == 0:
            accuracy_record.append(accuracy/float(sample))
            print('%s Progressive accuracy at example %i: %0.3f' % (time.strftime('%X'), n, np.mean(accuracy_record[-sample:])))
            accuracy = 0
    if early_stop and n >= early_stop:
        break
    SGD.partial_fit(x,y,classes=range(1,8))
            



15:20:12 Progressive accuracy at example 5000: 0.656
15:20:32 Progressive accuracy at example 10000: 0.675
15:20:51 Progressive accuracy at example 15000: 0.685
15:21:10 Progressive accuracy at example 20000: 0.691
15:21:28 Progressive accuracy at example 25000: 0.697
15:21:47 Progressive accuracy at example 30000: 0.698
15:22:06 Progressive accuracy at example 35000: 0.699
15:22:24 Progressive accuracy at example 40000: 0.701
15:22:44 Progressive accuracy at example 45000: 0.702
15:23:03 Progressive accuracy at example 50000: 0.703


In [34]:
!ipython nbconvert --to=python SGD2SVM.ipynb

[NbConvertApp] Converting notebook SGD2SVM.ipynb to python
[NbConvertApp] Writing 6356 bytes to SGD2SVM.py
