In [55]:
import pandas as pd
import numpy as np
import operator
import time
from sklearn.metrics import roc_auc_score

In [56]:
# Load only the first `limit` rows of the Avazu training CSV and preview them.
# NOTE(review): the reason for the exact sample size is not stated; presumably the
# cardinality threshold used for feature selection below was tuned on it — confirm.
limit = 1000 * 1000 # it is important to be 1000 * 1000
f = pd.read_csv('./avazu/train.csv', nrows=limit)
f.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [57]:
# Fixed seed so the train/validation assignment is the same on every run.
np.random.seed(123)
# One 0/1 flag per input line number: 1 (probability 0.2) marks a validation line,
# 0 (probability 0.8) a training line. The functions below look the flag up with
# validation_indexes[line_number].
# NOTE(review): 10**8 entries is a large allocation — confirm this size is needed.
validation_indexes = np.random.choice([0, 1], p = [0.8, 0.2], size=10**8)

In [58]:
target_name = 'click'
target_index = 1

# Candidate features start at column 3 of the sample frame; a column is kept only
# when it has fewer than 3000 distinct values in the sample.
features_name = f.columns[3:]
important_features = []
important_features_index = []
for column_position, col in enumerate(features_name, start=3):
    if len(set(f[col])) >= 3000:
        continue
    important_features.append(col)
    # Position of the column in the full row, not in `features_name`.
    important_features_index.append(column_position)

# "<name>_" markers, one per selected feature.
important_features_postfix = [feature_name + '_' for feature_name in important_features]

In [59]:
def GetFeatures(features, important_features_postfix):
    """Build the space-separated feature string for one row.

    Each value in `features` is concatenated behind the marker at the same
    position in `important_features_postfix`; the resulting tokens are joined
    with single spaces. Both sequences are expected to have the same length.
    """
    tokens = [marker + value for marker, value in zip(important_features_postfix, features)]
    return ' '.join(tokens)

In [60]:
def CreateVWFile(inputh_path, ouput_path, ouput_path_validation,
                 cat_features_names, cat_features_index,
                 num_features_names, num_features_index,
                 target_index, validation_indexes, row_limit=100 * 1000):
    """Split a comma-separated file with a header line into VW-format train/validation files.

    Every data line is written as ``<label> |a <cat tokens> <num tokens>``:
    the label is the target column mapped from 0/1 to -1/1, categorical tokens
    are ``<name>_<value>`` and numeric tokens are ``<name>:<value>``.

    Args:
        inputh_path: path of the input file; line 0 is skipped as the header.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names, num_features_index: names and column positions of
            the numeric features, in matching order (may be empty).
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (elapsed seconds and
            the line number are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    cat_prefixes = [name + '_' for name in cat_features_names]
    num_prefixes = [name + ':' for name in num_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if not index:
                continue
            if row_limit is not None and index >= row_limit:
                print(time.time() - start, index)
                break
            updated_row = np.array(row.rstrip().split(','))
            cat_tokens = [prefix + value for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            num_tokens = [prefix + value for prefix, value
                          in zip(num_prefixes, updated_row[num_features_index])]
            label = int(updated_row[target_index]) * 2 - 1
            # Numeric tokens stay on the example's own line; writing them after
            # the newline would attach them to the following example.
            line = ' '.join([str(label), '|a'] + cat_tokens + num_tokens) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [41]:
# Write the Avazu VW train/validation files from the selected categorical columns
# (no numeric features are passed); a1/a2 receive the train/validation labels.
a1, a2 = CreateVWFile('./avazu/train.csv', './avazu/train_100k.vw', './avazu/validation_100k.vw', 
                      important_features, important_features_index, 
                      [], [],
                      target_index, validation_indexes)

(3.527111053466797, 100000)


In [61]:
def CreateYasliFile(inputh_path, ouput_path, ouput_path_validation,
                    cat_features_names, cat_features_index,
                    num_features_names, num_features_index,
                    target_index, validation_indexes, row_limit=100 * 1000):
    """Split a comma-separated file with a header line into ".yasli" train/validation files.

    Every data line is written as ``<label> <cat tokens> <num values>``: the
    label is the target column mapped from 0/1 to -1/1, categorical tokens are
    ``<name>_<value>`` and numeric values are written as they appear in the input.

    Args:
        inputh_path: path of the input file; line 0 is skipped as the header.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names: accepted for signature parity with the other
            writers; not used by this format.
        num_features_index: column positions of the numeric features (may be empty).
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (elapsed seconds and
            the line number are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    cat_prefixes = [name + '_' for name in cat_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if not index:
                continue
            if row_limit is not None and index >= row_limit:
                print(time.time() - start, index)
                break
            updated_row = np.array(row.rstrip().split(','))
            cat_tokens = [prefix + value for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            num_values = list(updated_row[num_features_index])
            label = int(updated_row[target_index]) * 2 - 1
            # Numeric values stay on the example's own line; writing them after
            # the newline would attach them to the following example.
            line = ' '.join([str(label)] + cat_tokens + num_values) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [62]:
# Write the Avazu ".yasli" train/validation files from the selected categorical
# columns (no numeric features are passed); b1/b2 receive the labels.
b1, b2 = CreateYasliFile('./avazu/train.csv', './avazu/train_100k.yasli', './avazu/validation_100k.yasli', 
                         important_features, important_features_index,
                         [], [],
                         target_index, validation_indexes)

(3.5091230869293213, 100000)


In [9]:
def CreateLibfmFile(inputh_path, ouput_path, ouput_path_validation,
                    cat_features_names, cat_features_index,
                    num_features_names, num_features_index,
                    target_index, validation_indexes, row_limit=10 * 1000):
    """Split a comma-separated file with a header line into libfm-style train/validation files.

    Every data line is written as ``<label> <cat tokens> <num tokens>``:
      * the label is the target column mapped from 0/1 to -1/1;
      * each categorical token ``<name>_<value>`` is hashed into ``[0, 2**18)``
        and written as ``<bucket>:1``;
      * the i-th numeric feature is written as ``<2**18 + i>:<value>`` so its
        index cannot collide with a hashed bucket.

    Args:
        inputh_path: path of the input file; line 0 is skipped as the header.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names: accepted for signature parity; not used.
        num_features_index: column positions of the numeric features (may be empty).
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (elapsed seconds and
            the line number are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    import zlib  # function-scope import: not part of the notebook's import cell

    limit = 2**18

    def _stable_bucket(token):
        # The built-in hash() of a str is salted per process on Python 3, which
        # would make feature indices differ between runs; CRC32 is stable.
        raw = token if isinstance(token, bytes) else token.encode('utf-8')
        return str((zlib.crc32(raw) & 0xffffffff) % limit)

    cat_prefixes = [name + '_' for name in cat_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if not index:
                continue
            if row_limit is not None and index >= row_limit:
                print(time.time() - start, index)
                break
            updated_row = np.array(row.rstrip().split(','))
            cat_tokens = [_stable_bucket(prefix + value) + ':1' for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            num_tokens = [str(limit + position) + ':' + str(value) for position, value
                          in enumerate(updated_row[num_features_index])]
            label = int(updated_row[target_index]) * 2 - 1
            # Everything for one example goes on one line, in index:value form.
            line = ' '.join([str(label)] + cat_tokens + num_tokens) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [472]:
# Write the Avazu libfm-style train/validation files from the selected categorical
# columns (no numeric features are passed); c1/c2 receive the labels.
# NOTE(review): the progress log recorded under this cell runs to tens of millions
# of rows, which does not match the 10k-row stop in the definition above —
# presumably the output comes from an earlier version of the function; confirm.
c1, c2 = CreateLibfmFile('./avazu/train.csv', './avazu/train_10k.libfm', './avazu/validation_10k.libfm', 
                         important_features, important_features_index,
                         [], [],
                         target_index, validation_indexes)

(45.14754104614258, 1000000)
(51.23129606246948, 2000000)
(44.64572095870972, 3000000)
(42.833678007125854, 4000000)
(41.52256798744202, 5000000)
(42.37977480888367, 6000000)
(42.06505608558655, 7000000)
(41.93250799179077, 8000000)
(41.75618314743042, 9000000)
(41.01055383682251, 10000000)
(40.61489391326904, 11000000)
(45.574105978012085, 12000000)
(41.29886603355408, 13000000)
(40.716058015823364, 14000000)
(40.44871282577515, 15000000)
(40.70690298080444, 16000000)
(40.574328899383545, 17000000)
(40.490395069122314, 18000000)
(40.71908688545227, 19000000)
(41.17920994758606, 20000000)
(40.70496201515198, 21000000)
(41.19611406326294, 22000000)
(40.51869487762451, 23000000)
(40.38752102851868, 24000000)
(40.67609000205994, 25000000)
(40.467522859573364, 26000000)
(41.64821696281433, 27000000)
(40.78989601135254, 28000000)
(41.13175106048584, 29000000)
(45.99660110473633, 30000000)
(54.921775102615356, 31000000)
(59.35376000404358, 32000000)
(46.272695779800415, 33000000)
(46.1648759

In [10]:
def GetTargets(inputh_path, target_index, validation_indexes):
    """Collect the -1/1 labels of a comma-separated file with a header line.

    Args:
        inputh_path: path of the input file; line 0 is skipped as the header.
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry assigns that line's label to the validation list.

    Returns:
        ``(targets, targets_validation)``: labels (0/1 mapped to -1/1) of the
        training and validation lines, in file order.

    Prints the elapsed seconds and the line number every 1,000,000 lines.
    """
    targets = []
    targets_validation = []
    start = time.time()
    # The context manager closes the input even if a line fails to parse.
    with open(inputh_path, 'r') as source:
        for index, row in enumerate(source):
            if not index:
                continue
            if index % (1000 * 1000) == 0:
                print(time.time() - start, index)
                start = time.time()
            # Only one field is needed, so index the split list directly.
            label = int(row.rstrip().split(',')[target_index]) * 2 - 1
            if validation_indexes[index]:
                targets_validation.append(label)
            else:
                targets.append(label)
    return targets, targets_validation

In [11]:
# Keep only the validation labels. Indexing the result instead of unpacking into
# `_` avoids overwriting IPython's last-output variable.
avazu_target = GetTargets('./avazu/train.csv', 1, validation_indexes)[1]

(6.9391930103302, 1000000)
(6.936805963516235, 2000000)
(8.058295011520386, 3000000)
(7.5221991539001465, 4000000)
(7.1364428997039795, 5000000)
(7.6302409172058105, 6000000)
(7.079992055892944, 7000000)
(7.079240083694458, 8000000)
(6.854964971542358, 9000000)
(6.878206968307495, 10000000)
(8.122447967529297, 11000000)
(8.343829154968262, 12000000)
(8.461267948150635, 13000000)
(7.418223142623901, 14000000)
(7.10387396812439, 15000000)
(7.688402891159058, 16000000)
(7.1128089427948, 17000000)
(7.045474052429199, 18000000)
(7.100030183792114, 19000000)
(7.255903005599976, 20000000)
(7.013818979263306, 21000000)
(6.893393039703369, 22000000)
(7.449028015136719, 23000000)
(8.817224979400635, 24000000)
(7.2869789600372314, 25000000)
(6.917140007019043, 26000000)
(6.89143180847168, 27000000)
(6.990386009216309, 28000000)
(7.251456022262573, 29000000)
(6.871216058731079, 30000000)
(6.958004951477051, 31000000)
(6.921903848648071, 32000000)
(7.968425035476685, 33000000)
(8.55829906463623, 34

In [63]:
import pandas as pd
import numpy as np

In [64]:
# Load the first `limit` rows of the tab-separated Criteo file (read without a
# header row) and preview them.
limit = 20 * 1000 # NOTE(review): this comment used to read "it is important to be 1000 * 1000", which contradicts the value — confirm the intended sample size
# limit = 10
f = pd.read_csv('./criteo/train.txt', nrows=limit, header=None, sep='\t')
f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [65]:
# NOTE(review): this handle is neither read nor closed by any cell visible in this
# part of the notebook — confirm it is still needed.
ff = open('./criteo/train.txt')

In [66]:
def GetFeatures(features, important_features_postfix):
    """Return one space-separated string of ``marker + value`` tokens.

    The marker at position i of `important_features_postfix` is placed in front
    of the value at position i of `features`. Both sequences are expected to
    have the same length.
    """
    pairs = zip(important_features_postfix, features)
    return ' '.join(marker + value for marker, value in pairs)

In [67]:
# Re-seed and redraw the split mask with the same seed, probabilities and size as
# the Avazu section: one 0/1 flag per input line number, 1 (probability 0.2)
# marking a validation line.
np.random.seed(123)
validation_indexes = np.random.choice([0, 1], p = [0.8, 0.2], size=10**8)

In [68]:
# Column 0 holds the target; the next 13 columns are treated as numeric features
# and every remaining column as a categorical feature. Feature names are the
# column labels as strings, feature indexes the same labels as integers.
target_index = 0
num_features = [str(label) for label in f.columns[1:1 + 13]]
num_features_index = [int(name) for name in num_features]

cat_features = [str(label) for label in f.columns[1 + 13:]]
cat_features_index = [int(name) for name in cat_features]

In [69]:
def CreateVWFile(inputh_path, ouput_path, ouput_path_validation,
                 cat_features_names, cat_features_index,
                 num_features_names, num_features_index,
                 target_index, validation_indexes, row_limit=100 * 1000):
    """Split a tab-separated file without a header into VW-format train/validation files.

    Every input line is written as ``<label> |a <cat tokens> <num tokens>``:
    the label is the target column mapped from 0/1 to -1/1, categorical tokens
    are ``<name>_<value>`` and numeric tokens are ``<name>:<value>``. Empty
    fields are replaced by the string ``'0'`` before tokens are built.

    Args:
        inputh_path: path of the input file; every line, including line 0, is data.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names, num_features_index: names and column positions of
            the numeric features, in matching order.
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (elapsed seconds and
            the line number are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    cat_prefixes = [name + '_' for name in cat_features_names]
    num_prefixes = [name + ':' for name in num_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if row_limit is not None and index >= row_limit:
                print(time.time() - start, index)
                break
            # Strip only the newline so trailing empty fields are kept.
            fields = [field or '0' for field in row.rstrip('\n').split('\t')]
            updated_row = np.array(fields)
            cat_tokens = [prefix + value for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            num_tokens = [prefix + value for prefix, value
                          in zip(num_prefixes, updated_row[num_features_index])]
            label = int(updated_row[target_index]) * 2 - 1
            line = ' '.join([str(label), '|a'] + cat_tokens + num_tokens) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [70]:
# Write the Criteo VW train/validation files using the categorical and numeric
# column lists defined above; a1/a2 receive the train/validation labels.
a1, a2 = CreateVWFile('./criteo/train.txt', './criteo/train_100k.vw', './criteo/validation_100k.vw', 
                      cat_features, cat_features_index, 
                      num_features, num_features_index,
                      0, validation_indexes)

(5.668568134307861, 100000)


In [71]:
def CreateYasliFile(inputh_path, ouput_path, ouput_path_validation,
                    cat_features_names, cat_features_index,
                    num_features_names, num_features_index,
                    target_index, validation_indexes, row_limit=100 * 1000):
    """Split a tab-separated file without a header into ".yasli" train/validation files.

    Every input line is written as ``<label> <cat tokens> <num values>``: the
    label is the target column mapped from 0/1 to -1/1, categorical tokens are
    ``<name>_<value>`` and numeric values are written as-is. Empty fields are
    replaced by the string ``'0'`` first.

    Args:
        inputh_path: path of the input file; every line, including line 0, is data.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names: accepted for signature parity with the other
            writers; not used by this format.
        num_features_index: column positions of the numeric features.
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (elapsed seconds and
            the line number are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    cat_prefixes = [name + '_' for name in cat_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if row_limit is not None and index >= row_limit:
                print(time.time() - start, index)
                break
            # Strip only the newline so trailing empty fields are kept.
            fields = [field or '0' for field in row.rstrip('\n').split('\t')]
            updated_row = np.array(fields)
            cat_tokens = [prefix + value for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            num_values = list(updated_row[num_features_index])
            label = int(updated_row[target_index]) * 2 - 1
            line = ' '.join([str(label)] + cat_tokens + num_values) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [72]:
# Write the Criteo ".yasli" train/validation files using the column lists defined
# above; b1/b2 receive the train/validation labels.
b1, b2 = CreateYasliFile('./criteo/train.txt', './criteo/train_100k.yasli', './criteo/validation_100k.yasli', 
                          cat_features, cat_features_index, 
                          num_features, num_features_index,
                          target_index, validation_indexes)

(5.103104114532471, 100000)


In [473]:
def CreateLibfmFile(inputh_path, ouput_path, ouput_path_validation,
                    cat_features_names, cat_features_index,
                    num_features_names, num_features_index,
                    target_index, validation_indexes, row_limit=10 * 1000):
    """Split a tab-separated file without a header into libfm-style train/validation files.

    Every input line is written as ``<label> <num tokens> <cat tokens>``:
      * the label is the target column mapped from 0/1 to -1/1;
      * the i-th numeric feature is written as ``<2**18 + i>:<value>``;
      * each categorical token ``<name>_<value>`` is hashed into ``[0, 2**18)``
        and written as ``<bucket>:1``.
    Empty fields are replaced by the string ``'0'`` first.

    Args:
        inputh_path: path of the input file; every line, including line 0, is data.
        ouput_path: path of the training file to write.
        ouput_path_validation: path of the validation file to write.
        cat_features_names, cat_features_index: names and column positions of
            the categorical features, in matching order.
        num_features_names: accepted for signature parity; not used.
        num_features_index: column positions of the numeric features.
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry sends that line to the validation file.
        row_limit: line number at which processing stops (the elapsed seconds
            are printed first); ``None`` processes the whole file.

    Returns:
        ``(targets, targets_validation)``: the -1/1 labels written to the
        training and validation files, in file order.
    """
    import zlib  # function-scope import: not part of the notebook's import cell

    limit = 2**18

    def _stable_bucket(token):
        # The built-in hash() of a str is salted per process on Python 3, which
        # would make feature indices differ between runs; CRC32 is stable.
        raw = token if isinstance(token, bytes) else token.encode('utf-8')
        return str((zlib.crc32(raw) & 0xffffffff) % limit)

    cat_prefixes = [name + '_' for name in cat_features_names]
    targets = []
    targets_validation = []
    start = time.time()
    # Context managers close all three files even if a row fails to parse.
    with open(inputh_path, 'r') as source, \
            open(ouput_path, 'w') as train, \
            open(ouput_path_validation, 'w') as validation:
        for index, row in enumerate(source):
            if row_limit is not None and index >= row_limit:
                print(time.time() - start)
                break
            # Strip only the newline so trailing empty fields are kept.
            fields = [field or '0' for field in row.rstrip('\n').split('\t')]
            updated_row = np.array(fields)
            cat_tokens = [_stable_bucket(prefix + value) + ':1' for prefix, value
                          in zip(cat_prefixes, updated_row[cat_features_index])]
            # Numeric indexes start at `limit`, above every hashed bucket.
            num_tokens = [str(limit + position) + ':' + str(value) for position, value
                          in enumerate(updated_row[num_features_index])]
            label = int(updated_row[target_index]) * 2 - 1
            line = ' '.join([str(label)] + num_tokens + cat_tokens) + '\n'
            if validation_indexes[index]:
                validation.write(line)
                targets_validation.append(label)
            else:
                train.write(line)
                targets.append(label)
    return targets, targets_validation

In [474]:
# Write the Criteo libfm-style train/validation files using the column lists
# defined above; c1/c2 receive the train/validation labels.
# NOTE(review): the output recorded under this cell has many timing lines, which
# does not match the single print-and-stop in the definition above — presumably
# it comes from an earlier version of the function; confirm.
c1, c2 = CreateLibfmFile('./criteo/train.txt', './criteo/train_10k.libfm', './criteo/validation_10k.libfm', 
                         cat_features, cat_features_index, 
                         num_features, num_features_index,
                         0, validation_indexes)

74.411482811
73.5838170052
75.862760067
73.6947197914
74.1937818527
73.6952140331
82.8361589909
73.435710907
71.8721718788
72.2882580757
71.6774320602
71.9500849247
73.3443911076
73.3162648678
73.8380961418
71.9752931595
72.8174698353
84.7770400047
74.77684021
78.3871929646
77.832064867
77.1546089649
73.712043047
80.7305109501
71.7662730217
71.9426569939
72.1166028976
72.7862548828
72.9972419739
73.116549015
72.444683075
73.0444719791
73.5479888916
73.0207049847
72.171528101
72.7151391506
73.2076890469
73.5280168056
81.8605220318
75.9052507877
73.8122358322
81.3851690292
72.1847820282
78.5300550461
80.4794108868


In [13]:
def GetTargets(inputh_path, target_index, validation_indexes, skip_header=False):
    """Collect the -1/1 labels of a tab-separated file.

    Args:
        inputh_path: path of the input file.
        target_index: column position of the 0/1 target.
        validation_indexes: sequence indexed by input line number; a truthy
            entry assigns that line's label to the validation list.
        skip_header: when true, line 0 is ignored. The default keeps line 0,
            matching the tab-separated file writers in this notebook, which
            treat line 0 as data; skipping it here would drop one label and
            break alignment with the files they produce.

    Returns:
        ``(targets, targets_validation)``: labels (0/1 mapped to -1/1) of the
        training and validation lines, in file order.

    Prints the elapsed seconds and the line number every 1,000,000 lines.
    """
    targets = []
    targets_validation = []
    start = time.time()
    # The context manager closes the input even if a line fails to parse.
    with open(inputh_path, 'r') as source:
        for index, row in enumerate(source):
            if skip_header and not index:
                continue
            if index and index % (1000 * 1000) == 0:
                print(time.time() - start, index)
                start = time.time()
            # Only one field is needed, so index the split list directly.
            label = int(row.rstrip('\n').split('\t')[target_index]) * 2 - 1
            if validation_indexes[index]:
                targets_validation.append(label)
            else:
                targets.append(label)
    return targets, targets_validation

In [14]:
# Keep only the validation labels. Indexing the result instead of unpacking into
# `_` avoids overwriting IPython's last-output variable.
criteo_target = GetTargets('./criteo/train.txt', 0, validation_indexes)[1]

(10.130358219146729, 1000000)
(9.894282817840576, 2000000)
(9.678725004196167, 3000000)
(9.673277854919434, 4000000)
(9.737694025039673, 5000000)
(9.691864967346191, 6000000)
(9.695143938064575, 7000000)
(10.31916880607605, 8000000)
(10.389625072479248, 9000000)
(10.25936484336853, 10000000)
(9.932084083557129, 11000000)
(9.817650079727173, 12000000)
(10.743032932281494, 13000000)
(10.650124788284302, 14000000)
(9.920145988464355, 15000000)
(9.377609968185425, 16000000)
(9.304805994033813, 17000000)
(9.309877157211304, 18000000)
(9.354204893112183, 19000000)
(9.288901090621948, 20000000)
(9.61383318901062, 21000000)
(9.512415885925293, 22000000)
(9.704297065734863, 23000000)
(9.400493144989014, 24000000)
(9.35575819015503, 25000000)
(9.323678970336914, 26000000)
(9.653927087783813, 27000000)
(10.011415958404541, 28000000)
(9.401466846466064, 29000000)
(9.313781023025513, 30000000)
(9.497884035110474, 31000000)
(9.369596004486084, 32000000)
(9.342489004135132, 33000000)
(9.7478268146514

In [237]:
def CreateLibfmFile(input_file, output_file):
    """Rewrite a ``label index:value ...`` file with 1-based, ascending feature indexes.

    Despite its name, this function does not build a libfm file from raw data:
    it reads an existing ``index:value`` file and, for every line, adds 1 to
    each feature index, sorts the features by index and writes the line to
    `output_file` (the cells below use it to produce ``.liblinear`` files).

    Args:
        input_file: path of the file to read; each non-blank line is a label
            followed by whitespace-separated ``index:value`` tokens.
        output_file: path of the file to write.

    Blank lines are skipped. Prints the elapsed seconds every 1,000,000 lines.
    """
    start_time = time.time()
    # Context managers guarantee the output is flushed and both files are closed.
    with open(input_file, 'r') as source, open(output_file, 'w') as destination:
        for index, row in enumerate(source):
            tokens = row.split()
            if tokens:
                target = tokens[0]
                pairs = [token.split(":") for token in tokens[1:]]
                # Shift to 1-based indexes, then order by (index, value).
                shifted = sorted((int(pair[0]) + 1, pair[1]) for pair in pairs)
                features = [str(position) + ":" + value for position, value in shifted]
                destination.write(target + ' ' + ' '.join(features) + '\n')
            if index and index % (1000 * 1000) == 0:
                print(time.time() - start_time)
                start_time = time.time()

In [238]:
# Convert the Avazu libfm files into ".liblinear" files (indexes shifted by one and
# sorted per line).
# NOTE(review): no cell visible here writes the *_all.libfm inputs — presumably
# they come from a run without the row limit; confirm.
CreateLibfmFile('./avazu/train_all.libfm', './avazu/train_all.liblinear')
CreateLibfmFile('./avazu/validation_all.libfm', './avazu/validation_all.liblinear')

57.2302539349
57.6894600391
56.8557348251
66.3206441402
59.7540838718
55.541424036
66.1293489933
57.6853411198
57.1294419765
62.2992448807
63.842523098
54.6064128876
56.3760969639
72.5067880154
54.1892199516
60.1930711269
58.3733570576
57.5445251465
60.1520941257
56.5791921616
59.0657479763
56.8568570614
60.2402667999
57.686948061
58.60927701
58.231400013
60.1232590675
57.4989650249
58.865336895
56.7204310894
56.0915489197
62.1256330013
62.4172229767
76.7445371151
78.4676039219
66.9702320099
57.0698301792
58.3657681942
54.6927268505
54.9816498756


In [239]:
# Convert the Criteo libfm files into ".liblinear" files (indexes shifted by one
# and sorted per line).
# NOTE(review): no cell visible here writes the *_all.libfm inputs — presumably
# they come from a run without the row limit; confirm.
CreateLibfmFile('./criteo/train_all.libfm', './criteo/train_all.liblinear')
CreateLibfmFile('./criteo/validation_all.libfm', './criteo/validation_all.liblinear')

111.47302103
109.090086937
108.871104956
121.915944815
118.548385859
106.793258905
107.190623045
108.111669064
122.399429083
109.63465786
107.289566994
106.829483986
108.095549107
110.806906939
107.25059104
107.282069921
106.772759914
107.103138924
107.409143925
107.933059931
107.418606997
108.130233049
107.054144859
106.94917202
109.422789097
111.133711815
110.548547983
108.262475014
107.794695854
107.175505877
108.331108093
112.017153978
121.570290089
148.145704031
129.187634945
121.011219978
125.186598063
116.468322039
117.356703997
122.388060093
118.299185991
109.595268011
107.003051996
107.523552179
107.560447931


#######################################################################################################################
#######################################################################################################################

In [18]:
# Train VW on the Avazu training file: one pass, logistic loss, no optimizer flag;
# the model is saved to model_all.vw (the same path is reused by later training cells).
!time vw ./avazu/train_all.vw --passes 1 --loss_function=logistic -f model_all.vw

final_regressor = model_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./avazu/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       19
0.482606 0.272064            2            2.0  -1.0000  -1.1626       19
0.296443 0.110280            4            4.0  -1.0000  -2.3663       19
0.580312 0.864181            8            8.0   1.0000  -2.9779       19
0.365815 0.151319           16           16.0  -1.0000  -2.5014       19
0.480023 0.594230           32           32.0  -1.0000  -1.2970       19
0.516819 0.553615           64           64.0  -1.0000  -0.5657       19
0.557669 0.598519          128          128.0   1.0000  -1.5457       19
0.489216 0.420764          256          256.0   1.0000  -1.4401       19
0.438013 0.386810         

In [20]:
# Score the Avazu validation file in test-only mode with the model saved above and
# write the predictions to ./avazu/prediction_all.vw (overwritten by each experiment).
!time vw -d ./avazu/validation_all.vw -t -i model_all.vw -p ./avazu/prediction_all.vw

only testing
predictions = ./avazu/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./avazu/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.090941 0.090941            1            1.0  -1.0000  -1.3016       19
1.121703 2.152466            2            2.0  -1.0000  -2.4671       19
3.467798 5.813893            4            4.0   1.0000  -1.7403       19
4.087231 4.706663            8            8.0  -1.0000  -1.8840       19
2.744582 1.401933           16           16.0  -1.0000  -2.4671       19
3.832865 4.921148           32           32.0  -1.0000  -2.1490       19
3.351291 2.869717           64           64.0  -1.0000  -1.0262       19
4.041445 4.731599          128          128.0  -1.0000  -2.2820       19
3.930331 3.819218          256          256.0  -1.0000  -1.8123       19

In [21]:
# Read one VW score per line; the context manager closes the predictions file
# instead of leaving the handle to be dropped when the name is rebound.
with open('./avazu/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

# ROC AUC of the scores against the Avazu validation labels.
roc_auc_score(avazu_target, vw_predictions)

0.73566328012470628

#######################################################################################################################

In [73]:
# Same Avazu training run with the --sgd flag added; overwrites model_all.vw.
!time vw ./avazu/train_all.vw --passes 1 --loss_function=logistic --sgd -f model_all.vw

final_regressor = model_all.vw
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
using no cache
Reading datafile = ./avazu/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       19
0.346574 0.000000            2            2.0  -1.0000 -50.0000       19
0.173287 0.000000            4            4.0  -1.0000 -50.0000       19
6.336643 12.500000            8            8.0   1.0000 -50.0000       19
3.182418 0.028192           16           16.0  -1.0000  -7.1761       19
4.081414 4.980411           32           32.0  -1.0000   3.9367       19
4.716325 5.351235           64           64.0  -1.0000 -11.8947       19
3.643035 2.569746          128          128.0   1.0000  -5.1183       19
2.439280 1.235526          256          256.0   1.0000   1.6826       19
1.718801 0.998322         

In [74]:
# Score the Avazu validation file with the --sgd model; overwrites ./avazu/prediction_all.vw.
!time vw -d ./avazu/validation_all.vw -t -i model_all.vw -p ./avazu/prediction_all.vw

only testing
predictions = ./avazu/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./avazu/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000077 0.000077            1            1.0  -1.0000  -1.0088       19
0.884768 1.769459            2            2.0  -1.0000  -2.3302       19
2.900619 4.916470            4            4.0   1.0000  -1.4877       19
3.875589 4.850560            8            8.0  -1.0000  -1.7541       19
2.486917 1.098245           16           16.0  -1.0000  -2.3302       19
3.012725 3.538533           32           32.0  -1.0000  -2.3902       19
2.607004 2.201284           64           64.0  -1.0000  -0.9752       19
2.658476 2.709947          128          128.0  -1.0000  -2.4419       19
2.763942 2.869408          256          256.0  -1.0000  -1.5757       19

In [76]:
# Read one VW score per line; the context manager closes the predictions file
# instead of leaving the handle to be dropped when the name is rebound.
with open('./avazu/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

# ROC AUC of the --sgd model's scores against the Avazu validation labels.
roc_auc_score(avazu_target, vw_predictions)

0.72791907687211399

#######################################################################################################################

In [326]:
# Same Avazu training run with the --ftrl flag added; overwrites model_all.vw.
!time vw ./avazu/train_all.vw --passes 1 --loss_function=logistic --ftrl -f model_all.vw

final_regressor = model_all.vw
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./avazu/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       19
0.676758 0.660369            2            2.0  -1.0000  -0.0667       19
0.648728 0.620697            4            4.0  -1.0000  -0.1737       19
0.660813 0.672898            8            8.0   1.0000  -0.2580       19
0.623971 0.587129           16           16.0  -1.0000  -0.2759       19
0.613744 0.603516           32           32.0  -1.0000  -0.4606       19
0.603649 0.593555           64           64.0  -1.0000  -0.5615       19
0.581349 0.559050          128          128.0   1.0000  -0.7407       19
0

In [327]:
# Score the Avazu validation file with the --ftrl model; overwrites ./avazu/prediction_all.vw.
!time vw -d ./avazu/validation_all.vw -t -i model_all.vw -p ./avazu/prediction_all.vw

only testing
predictions = ./avazu/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./avazu/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.088831 0.088831            1            1.0  -1.0000  -1.2980       19
1.189688 2.290544            2            2.0  -1.0000  -2.5135       19
4.216285 7.242882            4            4.0   1.0000  -2.0185       19
4.434470 4.652655            8            8.0  -1.0000  -1.5929       19
2.994535 1.554600           16           16.0  -1.0000  -2.5135       19
3.203413 3.412290           32           32.0  -1.0000  -1.7730       19
2.801711 2.400009           64           64.0  -1.0000  -1.2936       19
2.560207 2.318703          128          128.0  -1.0000  -2.3237       19
2.401008 2.241809          256          256.0  -1.0000  -1.7721       19

In [328]:
# Read one VW score per line; the context manager closes the predictions file
# instead of leaving the handle to be dropped when the name is rebound.
with open('./avazu/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

# ROC AUC of the --ftrl model's scores against the Avazu validation labels.
roc_auc_score(avazu_target, vw_predictions)

0.73139280438921361

#######################################################################################################################

In [25]:
# Train VW on the Criteo training file: one pass, logistic loss, no optimizer flag.
# The model goes to model_all.vw, the same path the Avazu experiments use.
!time vw ./criteo/train_all.vw --passes 1 --loss_function=logistic -f model_all.vw

final_regressor = model_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./criteo/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       38
0.485199 0.277251            2            2.0  -1.0000  -1.1410       38
0.253377 0.021554            4            4.0  -1.0000 -40.7937       29
0.421125 0.588874            8            8.0  -1.0000  -1.8924       37
0.597045 0.772964           16           16.0   1.0000  -2.4956       38
0.634793 0.672541           32           32.0  -1.0000  -0.7664       38
0.612975 0.591158           64           64.0  -1.0000  -0.8907       37
0.620177 0.627378          128          128.0  -1.0000  -0.7236       33
0.547853 0.475529          256          256.0  -1.0000  -2.8076       37
0.515949 0.484045        

In [26]:
# Score the Criteo validation file in test-only mode with the model saved above and
# write the predictions to ./criteo/prediction_all.vw.
!time vw -d ./criteo/validation_all.vw -t -i model_all.vw -p ./criteo/prediction_all.vw

only testing
predictions = ./criteo/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./criteo/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.661167 0.661167            1            1.0  -1.0000  -1.8131       32
2.314956 3.968744            2            2.0  -1.0000  -2.9922       36
1.564023 0.813090            4            4.0  -1.0000  -0.7377       36
2.388949 3.213874            8            8.0  -1.0000  -2.7251       33
2.321873 2.254797           16           16.0  -1.0000  -1.7287       38
2.469008 2.616143           32           32.0  -1.0000  -2.7197       37
2.157523 1.846038           64           64.0   1.0000   0.0731       36
2.093406 2.029290          128          128.0  -1.0000  -1.3315       33
2.437381 2.781356          256          256.0   1.0000  -0.8869       

In [27]:
# Read VW's predictions for the Criteo validation set (one float per line) and
# report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./criteo/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, vw_predictions)

0.77459475177782688

#######################################################################################################################

In [28]:
# Same Criteo training run as the default one, but with --sgd.
# NOTE(review): the recorded log reports "learning rate = 10" (vs 0.5 without
# --sgd) and every sampled prediction after the first example is clamped at
# -50, so this run appears to have diverged; consider passing an explicit
# learning rate before comparing its AUC with the other optimizers.
!time vw ./criteo/train_all.vw --passes 1 --loss_function=logistic --sgd -f model_all.vw

final_regressor = model_all.vw
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
using no cache
Reading datafile = ./criteo/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       38
0.346574 0.000000            2            2.0  -1.0000 -50.0000       38
0.173287 0.000000            4            4.0  -1.0000 -50.0000       29
6.336643 12.500000            8            8.0  -1.0000 -50.0000       37
18.793322 31.250000           16           16.0   1.0000 -50.0000       38
17.209161 15.625000           32           32.0  -1.0000 -50.0000       38
14.854580 12.500000           64           64.0  -1.0000 -50.0000       37
15.630415 16.406250          128          128.0  -1.0000 -50.0000       33
12.698020 9.765625          256          256.0  -1.0000 -50.0000       37
12.403698 12.109

In [29]:
# Score the Criteo validation file with whatever model_all.vw currently holds
# (test-only mode, -t) and write predictions to ./criteo/prediction_all.vw.
# NOTE(review): the loss column of the recorded run is squared error
# (2401 = (-50 - (-1))^2); every sampled prediction is -50.
!time vw -d ./criteo/validation_all.vw -t -i model_all.vw -p ./criteo/prediction_all.vw

only testing
predictions = ./criteo/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./criteo/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
2401.000000 2401.000000            1            1.0  -1.0000 -50.0000       32
2401.000000 2401.000000            2            2.0  -1.0000 -50.0000       36
2401.000000 2401.000000            4            4.0  -1.0000 -50.0000       36
2426.000000 2451.000000            8            8.0  -1.0000 -50.0000       33
2426.000000 2426.000000           16           16.0  -1.0000 -50.0000       38
2419.750000 2413.500000           32           32.0  -1.0000 -50.0000       37
2441.625000 2463.500000           64           64.0   1.0000 -50.0000       36
2438.500000 2435.375000          128          128.0  -1.0000 -50.0000       33
2449.437500 2460.37500

In [30]:
# Read VW's predictions for the Criteo validation set (one float per line) and
# report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./criteo/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, vw_predictions)

0.56402661819477928

#######################################################################################################################

In [329]:
# Train VW on Criteo with --ftrl (one pass, logistic loss). The recorded log
# reports "Proximal-FTRL" with ftrl_alpha = 0.005 and ftrl_beta = 0.1.
# The model overwrites model_all.vw.
!time vw ./criteo/train_all.vw --passes 1 --loss_function=logistic --ftrl -f model_all.vw

final_regressor = model_all.vw
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./criteo/train_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       38
0.521968 0.350788            2            2.0  -1.0000  -0.8671       38
0.261348 0.000729            4            4.0  -1.0000 -26.7171       29
0.306548 0.351748            8            8.0  -1.0000 -50.0000       37
4.942126 9.577705           16           16.0   1.0000  -1.2749       38
3.544500 2.146873           32           32.0  -1.0000  -0.7166       38
3.104287 2.664074           64           64.0  -1.0000  -2.6841       37
3.546009 3.987732          128          128.0  -1.0000  -0.2571       33


In [330]:
# Score the Criteo validation file with the current model_all.vw in test-only
# mode (-t); predictions go to ./criteo/prediction_all.vw.
!time vw -d ./criteo/validation_all.vw -t -i model_all.vw -p ./criteo/prediction_all.vw

only testing
predictions = ./criteo/prediction_all.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./criteo/validation_all.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.498393 0.498393            1            1.0  -1.0000  -1.7060       32
0.782872 1.067352            2            2.0  -1.0000  -2.0331       36
0.495007 0.207141            4            4.0  -1.0000  -0.6076       36
2.304135 4.113264            8            8.0  -1.0000  -4.1476       33
2.227381 2.150626           16           16.0  -1.0000  -1.7573       38
3.351526 4.475671           32           32.0  -1.0000  -4.1398       37
2.658678 1.965830           64           64.0   1.0000  -0.1183       36
2.217053 1.775428          128          128.0  -1.0000  -0.9406       33
2.470796 2.724539          256          256.0   1.0000  -0.6382       

In [331]:
# Read VW's predictions for the Criteo validation set (one float per line) and
# report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./criteo/prediction_all.vw', 'r') as predictions_file:
    vw_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, vw_predictions)

0.76800912536482679

#######################################################################################################################
#######################################################################################################################

In [137]:
# Try LIBLINEAR's `train` on an Avazu file written in libFM format.
# NOTE(review): the recorded run aborts with "Wrong input format at line 1",
# so no model is produced by this cell; the file needs converting to the
# format LIBLINEAR expects before this comparison can be made.
!time ./liblinear-2.20/train ./avazu/train_10k.libfm

Wrong input format at line 1

real	0m0.018s
user	0m0.009s
sys	0m0.003s


In [27]:
# Read libFM's predictions for the Avazu validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./avazu/prediction_all.libfm', 'r') as predictions_file:
    libfm_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(avazu_target, libfm_predictions)

0.72397744280551368

#######################################################################################################################

In [28]:
# libFM on Avazu: classification task (-task c), one SGD iteration with
# learning rate 0.5. Predictions for the -test file are written to -out.
# -dim '1, 1, 0' presumably requests bias + linear terms with no pairwise
# factors (i.e. a plain linear model) -- confirm against the libFM manual.
!time ./libfm/bin/libFM -task c -train ./avazu/train_all.libfm -test ./avazu/validation_all.libfm -out ./avazu/prediction_all.libfm -iter 1 -learn_rate 0.5  -dim '1, 1, 0' -method sgd

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 1
has xt = 0
num_rows=32346994	num_values=582245892	num_features=262133	min_target=-1	max_target=1
Loading test... 	
has x = 1
has xt = 0
num_rows=8081973	num_values=145475514	num_features=262133	min_target=-1	max_target=1
#relations: 0
Loading meta data...	
learnrate=0.5
learnrates=0.5,0.5,0.5
#iterations=1
SGD: DON'T FORGET TO SHUFFLE THE ROWS IN TRAINING DATA TO GET THE BEST RESULTS.
#Iter=  0	Train=0.766883	Test=0.766745
Final	Train=0.766883	Test=0.766745

real	12m2.031s
user	10m36.826s
sys	1m15.330s


In [29]:
# Read libFM's predictions for the Avazu validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./avazu/prediction_all.libfm', 'r') as predictions_file:
    libfm_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(avazu_target, libfm_predictions)

0.65110191602370115

#######################################################################################################################

In [30]:
# libFM on Criteo, classification task, one iteration.
# NOTE(review): unlike the other libFM invocations in this notebook, no
# `-method sgd` is passed, so libFM falls back to its default learning method
# (the log prints Test(ll) and "has xt = 1" instead of the SGD banner), and
# -learn_rate is presumably ignored -- confirm against the libFM manual.
!time ./libfm/bin/libFM -task c -train ./criteo/train_all.libfm -test ./criteo/validation_all.libfm -out ./criteo/prediction_all.libfm -iter 1 -learn_rate 0.5 -dim '1, 1, 0'

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=36676650	num_values=1430389350	num_features=262157	min_target=-1	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=9163967	num_values=357394713	num_features=262157	min_target=-1	max_target=1
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.698204	Test=0.698198	Test(ll)=0.606448

real	79m58.181s
user	29m39.111s
sys	15m41.254s


In [31]:
# Read libFM's predictions for the Criteo validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('./criteo/prediction_all.libfm', 'r') as predictions_file:
    libfm_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, libfm_predictions)

0.49514811293607919

#######################################################################################################################

In [32]:
# libFM on Criteo with explicit SGD (-method sgd), one iteration, learning
# rate 0.5; predictions for the -test file are written to -out.
# NOTE(review): the recorded run was interrupted (^C) while still loading the
# data, so this cell has not produced a fresh prediction file.
!time ./libfm/bin/libFM -task c -train ./criteo/train_all.libfm -test ./criteo/validation_all.libfm -out ./criteo/prediction_all.libfm -iter 1 -learn_rate 0.5  -dim '1, 1, 0' -method sgd

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 1
has xt = 0
num_rows=36676650	num_values=1430389350	num_features=262157	min_target=-1	max_target=1
^C


In [33]:
# Read libFM's predictions for the Criteo validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed even if parsing is interrupted.
# NOTE(review): the libFM run just above was stopped with ^C, so the file read
# here presumably still holds predictions from an earlier run -- confirm.
with open('./criteo/prediction_all.libfm', 'r') as predictions_file:
    libfm_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, libfm_predictions)

KeyboardInterrupt: 

#######################################################################################################################
#######################################################################################################################

In [106]:
# Build yasli 7.0.1 from its single source file with C++11 and -O2.
# NOTE(review): the binary is written to ./../yayasli, but the cells that
# follow execute ./../yasli_new7.0.1/yasli -- confirm which binary the
# benchmark numbers below actually come from.
!g++ -std=c++11 -O2 ./../yasli_new7.0.1/yasli.cpp -o ./../yayasli

In [107]:
# Fit yasli 7.0.1 on the Avazu training file with logistic loss and no -O
# optimizer flag; -e 1 matches the "num epoch = 1" line in the log.
# -w 0.5 / -b 18 presumably mirror the learning rate and hash bits used for
# the other tools -- confirm against yasli's argument parser.
# NOTE(review): the model is saved as model_criteo_all.m even though the input
# is Avazu data, and this build dumps weight vectors to stdout.
!time ./../yasli_new7.0.1/yasli fit -i ../tests/avazu/train_all.yasli -m model_criteo_all.m -e 1 -w 0.5 -b 18 -l logistic -d ' ' -c ./../yasli_new7.0.1/config2.txt

num epoch = 1
weights
weights_cat


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.119884 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.429921 -0.0328181 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0151067 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00256242 0 0.0839455 0 0 0 0 0 0 0 -0.000243388 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00122658 0 0 0 0 0 0 0 -0.00126381 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.157564 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00643952 -0.381188 0 0 -0.00196829 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0102532 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0214691 0 0 0 0 0 0 0 0 0.00464127 -0.00810566 0 0 0 0 0 0 -0.82704 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0035278 0 0 0 0 0 0 0.0299347 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0547212 0 0 0 -0.00102281 0 0 0 0 0 0 0 0 0 0 0 -0.0163432 -0.00061863 0 0 0 0 0 0 0.0192622 0 0 0 0 0 0 0 0 0 0 0 -0.00516578 0 0 0 0 0 0 0 0.299071 0 0 0 0 -0.00143951 0 0 0 0 0 

0 0.0224057 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00369901 0 0 0 0 0 -0.0464999 0 0 0 0 0 0 0 0 0 0 0.00203768 0 0 0 0.0189187 0 -0.00114116 0 0 0.130534 0 0 0 0 0 0 0 0 0 0 0 -0.0012022 0 0 0 0.00632675 0 0 0 0 0 0 0 0 -0.00148026 0 0.282658 0 0 0 0 0 0 0.107123 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.109543 0 0.637898 0 0 0 0 0 0 0 0 0 0 -0.0019371 0 0 0 0 0 0 0 0 0 0 0 0 -0.00218693 0 0 0 -0.000591623 0 0 0 0 0 0 0.22125 0 0 0 0 -0.00108612 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00609711 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0847472 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.720751 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0657011 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00754889 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0140123 0 0 0 0.000653144 0 0.0184317 0 0 0 0 0 0 0 0 0 0 -0.0058958 0 0 -0.293135 0.0581461 0 0 0 0 0 0 0 0 0 0 -0.000577182 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.000650119 0 0 0 0 0 0 0 0.166005 0.780097 0 0 0 0 0 -0.00568335 -0.0220798 0 0 0 -0.00057925 0 

-0.00786668 0 0 0 0 -0.001088 0 0 0 0 0 0.00175984 0 0 0 0 0 0 0 0 0 -0.19914 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0124819 0.229436 0 0 0 0 0 0 0 0 0 0 0 -0.0136258 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0225076 0 0 0 0 0 -0.00052494 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.017164 0 0 0 0 0 0 0 0 0 -0.00714724 -0.0503313 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00472405 0 0 0 0 0 0 0 0 0 0 0.895258 0 0 0 0 0 0 0 0 0.303441 0 0 0 0 -0.00195865 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00492846 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0200211 0 0 0 0 -0.0353713 0 0 0 0 0 0 0 0 0 0 0 -0.000997723 -0.000633345 0 0 0 0 0 0 -0.0268109 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00807571 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0359025 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0175043 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0191883 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0337029 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0.00652082 0 0 -0.00635186 -0.131873 0 -0.787651 0 0 0 0 0 0 0 0 0 0 -0.000502678 0 0 0 0 0 0 0 0 0 0.00675771 0 0 0 0 0 -0.00282688 0 0 0 0 0 0 0 0 -0.119085 0 -0.0944739 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00978484 0 0 -0.331366 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0450038 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0727667 0 0 0 0 0 0 0 0 1.44451 0 0 0 0 0 0 0 0 0 0 0 0 0.483575 0 0 0 0 0 -0.0233574 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00697042 0 0 0 0 -0.000514318 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00275699 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.745604 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.562404 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0787445 0 0 -0.0351228 0 0 0 0 0 0 0 -0.000602093 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00248241 -0.0179479 0 0 0 0 0 0 0 0.00712191 0 0 0 0 0 0 0 0 0 0 0.0770575 0.0877926 0 0 0 -0.0445567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

 0 0 0 -0.00277803 0 -0.0419747 0 0 0 0 0 0 0 0 0 0 0 0.00372033 0 0 0 0 0 0 0 0 0 0 0 0 -0.0484131 0 0 0 0 0 0 0 -0.182883 0 0 -0.000648405 0 -0.00586014 0 0 0 0 0 0 0 0 0 0 -0.00115885 0 0 0 0 0 0 0 0 0 0 -0.00373126 0 0 0 0 0 0 0 0 0 0 -0.0267055 0 0 0 0 0 0.0163481 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00098201 0.573446 0 0 0 0 0 0 0 0 0.00757024 0 0 0 0 -0.00137088 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0334655 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0275379 0 0 0 0 0 -0.00162054 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.242628 0 0 0 0 0 0 0 0.0719039 0 0 0 0 0 0.00623719 0 0 0 0 0 0 0 0 0.0110513 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.000807299 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.000867863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0644991 0 0 0 0 0 0 0 0 0 0 0.0182206 -0.00158799 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.00285787 0 0 0 0 -0.0187502 0 -0.0133881 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0.082027 -0.000323904 0 0 0 0 -0.000498694 0 0 0 0 0 -0.0190919 0 0 0.0477546 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.232958 0 0 0 0 0 0 0 0 0 0 0 -0.00325294 -0.0297391 0 -0.0154042 0 0 0 0 -0.00103993 0 0 0 0 0 0 0.00461918 0 0 0 0 0.428239 0 0.0252752 0 0 0 0 0 0 0 0 0 0 0 0 0.00434999 0 0 0.405941 0 0 0 0 0 0 0.0142808 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00713595 0 0 0 0 -0.000515125 0 0 0 0.00705452 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125903 -0.210437 0 0 0 0.426189 0 0 0 -0.313049 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0205429 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.047249 0 0 0 0 0 0 0 -0.00422223 0 0.00577758 0 0 0 0 0.0321877 0 0 -0.000400153 0 0 -0.000501723 0 0 0 0 0 0.154956 0 0 0 0 0 0 0 0 0 0 0 0 0.0246551 0 0 0 0 0 0 0 -0.00191864 0 0 0 0 0 0 0 0 0.00515022 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00509629 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00500554 0 0 0 0.190492 0 0 0.0319075 0 0 0 0 0 0 0 0 0 0.0209195 0 -0.0014


real	4m44.840s
user	4m30.706s
sys	0m5.729s


In [108]:
# Apply the saved yasli model to the Avazu validation file and write the
# predictions to ../tests/avazu/prediction_all.yasli.
# NOTE(review): the model file is named model_criteo_all.m but is used here
# with Avazu data and the Avazu config (config2.txt).
!time ./../yasli_new7.0.1/yasli apply -i ../tests/avazu/validation_all.yasli -m model_criteo_all.m -d ' ' -c ./../yasli_new7.0.1/config2.txt -o ../tests/avazu/prediction_all.yasli

Predicting

real	2m3.056s
user	1m6.940s
sys	0m53.775s


In [109]:
# Read yasli's predictions for the Avazu validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('../tests/avazu/prediction_all.yasli', 'r') as predictions_file:
    yasli_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(avazu_target, yasli_predictions)

0.72548885161989041

#######################################################################################################################


In [110]:
# Fit yasli 7.0.1 on the Avazu training file with logistic loss and the
# adagrad optimizer (-O adagrad); one epoch per the "num epoch = 1" log line.
# NOTE(review): overwrites model_criteo_all.m, whose name says Criteo although
# the data is Avazu; this build also dumps weight vectors to stdout.
!time ./../yasli_new7.0.1/yasli fit -i ../tests/avazu/train_all.yasli -m model_criteo_all.m -e 1 -w 0.5 -b 18 -l logistic -O adagrad -d ' ' -c ./../yasli_new7.0.1/config2.txt

num epoch = 1
weights
weights_cat


 -0.263351 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.49968 0 0 -2.08422 0 0 0 0 0 0.0849185 0 0 -0.62574 -0.375752 0 -0.872975 0 0.452546 0 0 -1.17613 0 0 0 0 0 0 0 0 0 0 -0.499449 0 0 0 0 0 0 0 -0.499574 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2.36857 0 0 0 0 0 0 0 0 0 -0.499666 0 0 0 0 0 -0.34669 0 -0.0174733 -0.259112 0 0 -0.772388 -2.71772 0 0 0 0 0 0 -1.32231 -1.83114 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.12041 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.32911 0 0 0 -0.673488 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.268065 0 0 0 0 0 0 0 0 0 -2.32271 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.163731 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.192103 0.440232 0 0 0 0 0 0 0 0 0 0 0 0 -1.76074 0 0 0 0 0 0 0 0 0 0 0 -0.747028 0 0 0 0 0.499723 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.70298 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.86631 0 -0.480968 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.499918 0 -1.529

46851 0 0 0 -0.22842 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.618423 0 0 0 0 0 0 -1.49497 0 0 0 0 -0.774744 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.499123 0 -0.497413 0 0 -0.499781 0 0 0 0 0 0 0 0 0 0 -4.17544 0 0 -0.764581 0 0 0 0 -0.499419 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2.7852 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.499628 0 0 0 -0.74278 0 -0.135632 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.851526 0 -0.417914 0 -0.798309 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2.43028 0 0 0 0 0 0 0 0 -0.444347 0 0.36995 0 0 0 0 0 0 -0.499506 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.02403 0 -0.429886 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0443934 0 0 0 0 0 0 -0.0203474 -0.112937 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.627302 0 0 0 -0.443346 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.713004 0 0 -1.72201 0 0 0 0 0 -5.63739 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.0854288 0 0 0 0 0 -0.834708 0 -0.784158 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.499836 0 0 0 0 0 0 0 0 0 0 

0 0 -0.268839 0 0 0 0 0 -0.889409 0 0 0.499976 0 0 0 0 0 0 -0.499943 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.499829 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.10038 -0.167788 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.21422 0 -0.560106 0 0 -0.717382 0 0 0 0 0 0 0.26535 0 0 -0.37136 0 0 0 0 0 0 0 0 0 0 0 0 -0.984191 0 -1.27137 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.49989 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.45964 0 0 0 0 0 0 0 0 0 -0.219646 0.289585 0 0 0 0 -2.20935 0 0 0 0 0 -0.204056 0 0 -1.06236 0 0 0 0 -1.8192 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.51321 0 0 0 0 -0.61115 0 0 0 0 0 0 0 0 -0.915434 -0.712545 0 0 -2.83874 0 0 0 0 0 0 -1.81359 0.233345 0 0 -0.499445 0 0 0 0 0 0 0 -1.10272 -1.16846 0 0 0 0 0 -0.499851 0 0 0 0 0 0 0 0 0 0 -2.60591 0 0 0 0 0 0 0 0 0 0 -1.56096 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.607576 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.119991 0 0 0 0 0 0 0 0 0 0 -1.04371 0 -0.528788 -0.807968 0 0 0 0 0 0 0 0 0 0 0 0 0 -2.15726 0 0 0 -1.24157 0 0 -0.49

 0 0 0 0 0 0 0 -0.786381 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.49603 0 0 0 0 0 -2.54243 0 0 0 0 0 -2.92309 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0335126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.498712 0 0 0 0 0 0 0 0 0 0 0 0 -0.556156 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.32991 0 -1.02609 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.31107 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.688763 0 0 0 -1.18445 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.218487 0 0 0 0 0 0 0 0 0 0 -1.90423 0 0 0 -1.21316 0 0 0 0 -0.48249 -0.971801 0 0 0 0 0 0 0 0 0 0 0 0 -1.3928 0 0 0 0 0 0 0 0 0 -0.625963 0 0 0 0 0 0 0 0 0 0 0 -0.839088 0 0 0 0 0 0 0 0 0 0.907812 0 0 0 0 0 -1.14003 0 0 0 0 0 0 0 0 -2.24378 -0.155015 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.984485 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.225931 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.499982 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2.75737 0 0 0 0 0 0 0 0

-1.14947 0 0 0 0 -0.0534698 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.977939 0 0 0 0 0 0 0 0 0 -0.495789 0 0 0 0 -2.05856 0 0 0 0 0 0 0 0 -2.30077 0 0 0 -0.349518 0 0 0 0 0 0 0 -0.32404 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.907168 0 0 0 0 -0.530403 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.662542 0.49997 0 0 0 0 0 0 0 0 0 -1.05296 0 0 0 0 0 0.499961 0 0 0 0 0 0 0 0 -1.47288 0 0 0 0 0 0 0 0 -2.28107 0 0 0 -0.816105 0 0 -0.485301 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.48139 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.386565 0 0 0 0 0 0 0 0.49997 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0879076 0 0 0 0 0 0 0 0 -0.00505106 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.484715 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -0.68487 0 0 0 0 0 -1.03189 0 0 0 0 0 0 0 -0.603054 0 0 0 0 0 -0.0103839 0 0 0 0 0 0 0 -0.819803 0 0 0 0.104019 0 0 -1.02005 0 0.614784 0 0 0 0 0 0 0 0 0 -0.621064 0 0 -0.440425 0 0 0.0267443 0 0 0 -0.91212 0 0 0 0 0 0 0 0 -2.05662 0 0 0 0 0 0 0 0 0 0 0 0 0 -1.12557 0 0 0 0 -0.


real	4m34.449s
user	4m24.310s
sys	0m5.225s


In [266]:
# Apply model_criteo_all.m to the Avazu validation file using the yasli 8.0
# binary and its config2.txt.
# NOTE(review): the recorded run ends in "Segmentation fault: 11" after ~7s,
# so no complete prediction file comes from this execution. The model was
# written by the 7.0.1 binary in the fit cell above -- possibly an
# incompatible model format; confirm.
!time ./../yasli_new8.0/yasli apply -i ../tests/avazu/validation_all.yasli -m model_criteo_all.m -d ' ' -c ./../yasli_new8.0/config2.txt -o ../tests/avazu/prediction_all.yasli

Predicting
/bin/sh: line 1: 13960 Segmentation fault: 11  ./../yasli_new8.0/yasli apply -i ../tests/avazu/validation_all.yasli -m model_criteo_all.m -d ' ' -c ./../yasli_new8.0/config2.txt -o ../tests/avazu/prediction_all.yasli

real	0m6.760s
user	0m6.549s
sys	0m0.158s


In [112]:
# Read yasli's predictions for the Avazu validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
# NOTE(review): the apply cell shown above (In [266]) segfaulted, while this
# cell's execution count is In [112]; the score presumably comes from an
# earlier successful apply -- re-run in order to confirm.
with open('../tests/avazu/prediction_all.yasli', 'r') as predictions_file:
    yasli_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(avazu_target, yasli_predictions)

0.73135197091913107

In [324]:
# Build yasli 8.1 via its Makefile; per the log this compiles five .cpp files
# with -std=c++11 -O2 and links them into the `yasli` binary.
!make -C ./../yasli_new8.1/

c++ -g -Wall -std=c++11 -O2 -I./ -c argument_parser.cpp -o argument_parser.o
c++ -g -Wall -std=c++11 -O2 -I./ -c data_reader.cpp -o data_reader.o
c++ -g -Wall -std=c++11 -O2 -I./ -c loss_functions.cpp -o loss_functions.o
c++ -g -Wall -std=c++11 -O2 -I./ -c optimizers.cpp -o optimizers.o
c++ -g -Wall -std=c++11 -O2 -I./ -c yasli.cpp -o yasli.o
c++ -g -Wall -std=c++11 -O2 -I./ argument_parser.o data_reader.o loss_functions.o optimizers.o yasli.o -o yasli


In [325]:
# Remove the intermediate object files left by the build; the log shows the
# target runs `rm *.o`, which does not match the linked `yasli` binary.
!make clean -C ./../yasli_new8.1/

rm *.o


In [317]:
# Fit yasli 8.1 on the Criteo training file with logistic loss and no -O
# optimizer flag; one epoch per the "num epoch = 1" log line. The model is
# saved to model.m, using the Criteo config (config3.txt).
!time ./../yasli_new8.1/yasli fit -i ../tests/criteo/train_all.yasli -m model.m -e 1 -w 0.5 -b 18 -l logistic -d ' ' -c ./../yasli_new8.1/config3.txt

num epoch = 1

real	9m10.360s
user	8m54.472s
sys	0m8.310s


#######################################################################################################################


In [318]:
!time ./../yasli_new8.1/yasli apply -i ../tests/criteo/validation_all.yasli -m model.m -d ' ' -c ./../yasli_new7.0.1/config3.txt -o ../tests/criteo/prediction_all.yasli

Predicting

real	2m58.486s
user	1m47.644s
sys	1m5.981s


In [319]:
# Read yasli's predictions for the Criteo validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('../tests/criteo/prediction_all.yasli', 'r') as predictions_file:
    yasli_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, yasli_predictions)

0.61391377933095759

#######################################################################################################################


In [320]:
# Fit yasli 8.1 on the Criteo training file with logistic loss and the
# adagrad optimizer (-O adagrad); one epoch. Overwrites model.m.
!time ./../yasli_new8.1/yasli fit -i ../tests/criteo/train_all.yasli -m model.m -e 1 -w 0.5 -b 18 -l logistic -O adagrad -d ' ' -c ./../yasli_new8.1/config3.txt

num epoch = 1

real	9m46.675s
user	9m21.885s
sys	0m10.398s


In [321]:
# Apply the current model.m to the Criteo validation file with yasli 8.1 and
# its own config3.txt; predictions go to ../tests/criteo/prediction_all.yasli.
!time ./../yasli_new8.1/yasli apply -i ../tests/criteo/validation_all.yasli -m model.m -d ' ' -c ./../yasli_new8.1/config3.txt -o ../tests/criteo/prediction_all.yasli

Predicting

real	2m59.447s
user	1m52.526s
sys	1m2.304s


In [322]:
# Read yasli's predictions for the Criteo validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('../tests/criteo/prediction_all.yasli', 'r') as predictions_file:
    yasli_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(criteo_target, yasli_predictions)

0.71296063767755213

#######################################################################################################################


In [309]:
# Fit yasli 8.1 on the Avazu training file with logistic loss and the ftrl
# optimizer (-O ftrl); one epoch. Uses the Avazu config (config2.txt) and
# overwrites model.m.
!time ./../yasli_new8.1/yasli fit -i ../tests/avazu/train_all.yasli -m model.m -O ftrl -e 1 -w 0.5 -b 18 -l logistic -d ' ' -c ./../yasli_new8.1/config2.txt

num epoch = 1

real	3m39.782s
user	3m31.319s
sys	0m4.579s


In [312]:
# Apply the current model.m to the Avazu validation file with yasli 8.1 and
# its config2.txt; predictions go to ../tests/avazu/prediction_all.yasli.
!time ./../yasli_new8.1/yasli apply -i ../tests/avazu/validation_all.yasli -m model.m -d ' ' -c ./../yasli_new8.1/config2.txt -o ../tests/avazu/prediction_all.yasli

Predicting

real	2m3.847s
user	1m7.749s
sys	0m53.229s


In [313]:
# Read yasli's predictions for the Avazu validation set (one float per line)
# and report ROC AUC against the true labels. The file is opened in a `with`
# block so the handle is closed as soon as the scores have been parsed.
with open('../tests/avazu/prediction_all.yasli', 'r') as predictions_file:
    yasli_predictions = np.array([float(line) for line in predictions_file])

roc_auc_score(avazu_target, yasli_predictions)

0.73124644101750391

#######################################################################################################################


In [314]:
# Fit yasli 8.1 on the Criteo training file with logistic loss and the ftrl
# optimizer (-O ftrl); one epoch. Uses the Criteo config (config3.txt) and
# overwrites model.m.
!time ./../yasli_new8.1/yasli fit -i ../tests/criteo/train_all.yasli -m model.m -e 1 -w 0.5 -b 18 -l logistic -O ftrl -d ' ' -c ./../yasli_new8.1/config3.txt

num epoch = 1

real	7m29.298s
user	7m9.190s
sys	0m9.311s


In [315]:
# Apply the current model.m to the Criteo validation file with yasli 8.1 and
# its config3.txt; predictions go to ../tests/criteo/prediction_all.yasli.
!time ./../yasli_new8.1/yasli apply -i ../tests/criteo/validation_all.yasli -m model.m -d ' ' -c ./../yasli_new8.1/config3.txt -o ../tests/criteo/prediction_all.yasli

Predicting

real	2m50.992s
user	1m48.271s
sys	1m0.158s


In [316]:
yasli_predictions = open('../tests/criteo/prediction_all.yasli', 'r')
yasli_predictions = np.array(list(map(float, yasli_predictions.readlines())))
roc_auc_score(criteo_target, yasli_predictions)

0.76799319791641196