In [1]:
import os
import sys
import collections
import random
import numpy as np
from tqdm import tqdm_notebook

In [2]:
# Column layout of a raw Criteo training row (tab-separated):
#   column 0        -> label
#   columns 1..13   -> the 13 integer features
#   columns 14..39  -> the 26 categorical features
continous_features = range(1, 13 + 1)
categorial_features = range(14, 14 + 26)

# Upper clip point for each of the 13 integer features, in column order.
# Each threshold is derived from the 95% quantile of that feature's values.
continous_clip = [
    20, 600, 100, 50, 64000, 500, 100,
    50, 500, 10, 10, 10, 50,
]

class ContinuousFeatureGenerator:
    """
    Min-max normaliser for the integer (continuous) features.

    ``build`` scans a tab-separated data file and records, for every feature,
    the smallest and largest value seen after capping each value at the
    module-level ``continous_clip`` threshold. ``gen`` then maps a raw value
    to ``(val - min) / (max - min)``, clamping the value to the recorded
    range first so the result always lies in [0, 1].
    """

    def __init__(self, num_feature):
        """
        Args:
            num_feature: number of continuous features to track.
        """
        self.num_feature = num_feature
        # Sentinel bounds: the first observed value replaces both of them.
        self.min = [sys.maxsize] * num_feature
        self.max = [-sys.maxsize] * num_feature

    def build(self, datafile, continous_features):
        """
        Collect the per-feature minimum and maximum from ``datafile``.

        Args:
            datafile: path of a tab-separated text file, one sample per line.
            continous_features: column index of each continuous feature;
                entry ``i`` is the column of feature ``i``.
        """
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    val = features[continous_features[i]]
                    # Empty fields are missing values and do not affect the bounds.
                    if val != '':
                        # Cap outliers at the per-feature clip point.
                        val = min(int(val), continous_clip[i])
                        self.min[i] = min(self.min[i], val)
                        self.max[i] = max(self.max[i], val)

    def gen(self, idx, val):
        """
        Normalise one raw value of feature ``idx`` to [0, 1].

        Args:
            idx: position of the feature (0-based, same order as in ``build``).
            val: raw field text; the empty string marks a missing value.

        Returns:
            0.0 for a missing value, or when the feature has no usable range
            (never observed, or all observed values identical); otherwise the
            min-max normalised value as a float in [0, 1].
        """
        if val == '':
            return 0.0
        lower, upper = self.min[idx], self.max[idx]
        span = upper - lower
        # A non-positive span would divide by zero (constant feature) or use
        # the untouched sentinels (feature never observed).
        if span <= 0:
            return 0.0
        # ``build`` clipped values before recording the bounds, so the same
        # clamp is needed here; otherwise outliers map outside [0, 1].
        clamped = min(max(float(val), lower), upper)
        return (clamped - lower) / span

class CategoryDictGenerator:
    """
    Build one value -> integer-index dictionary per categorical feature.

    After ``build`` every dictionary maps each retained value to an index
    starting at 1 (most frequent value first) and maps the reserved key
    ``'<unk>'`` to 0. ``gen`` returns index 0 for any value that is not in
    the dictionary.
    """

    def __init__(self, num_feature):
        """
        Args:
            num_feature: number of categorical features to track.
        """
        self.dicts = []
        self.num_feature = num_feature
        for i in range(0, num_feature):
            # Until ``build`` finishes, each entry is a value -> count table.
            self.dicts.append(collections.defaultdict(int))

    def build(self, datafile, categorial_features, cutoff=0):
        """
        Count the values in ``datafile`` and turn the counts into dictionaries.

        Args:
            datafile: path of a tab-separated text file, one sample per line.
            categorial_features: column index of each categorical feature;
                entry ``i`` is the column of feature ``i``.
            cutoff: values seen fewer than ``cutoff`` times are dropped.
        """
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    key = features[categorial_features[i]]
                    # Empty fields are missing values and are not counted.
                    if key != '':
                        self.dicts[i][key] += 1
        for i in range(0, self.num_feature):
            # Drop the values whose frequency is below the cutoff.
            kept = [item for item in self.dicts[i].items() if item[1] >= cutoff]
            # Most frequent first; ties are broken by the value itself so the
            # numbering is deterministic.
            kept.sort(key=lambda x: (-x[1], x[0]))
            # Assign an index to every retained value. Building the mapping
            # directly also works when nothing survives the cutoff, where
            # unpacking ``zip(*kept)`` would raise ValueError.
            vocab = {key: code for code, (key, _) in enumerate(kept, start=1)}
            vocab['<unk>'] = 0
            self.dicts[i] = vocab

    def gen(self, idx, key):
        """
        Return the index of ``key`` for feature ``idx``.

        Values absent from the dictionary get the ``'<unk>'`` index.
        """
        table = self.dicts[idx]
        if key not in table:
            res = table['<unk>']
        else:
            res = table[key]
        return res

    def dicts_sizes(self):
        """Return the number of entries of each dictionary, as a list."""
        return list(map(len, self.dicts))

In [6]:
# Location of the raw Criteo files (train.txt / test.txt are read from here)
# and the directory the preprocessed files are written to. Both paths are
# relative to the notebook's working directory.
datadir = "../data/criteo"
outdir = "../data/criteo/output"

In [8]:
def preprocess(datadir, outdir):
    """
    Convert the raw Criteo files into comma-separated, index-encoded text.

    Reads ``<datadir>/train.txt`` and ``<datadir>/test.txt`` (tab-separated)
    and writes ``train.txt``, ``valid.txt`` and ``test.txt`` into ``outdir``.
    Each output row holds the 13 min-max normalised integer features followed
    by the 26 categorical features, each replaced by its per-feature
    dictionary index (0 for values missing from the dictionary). Training and
    validation rows end with the label; test rows are read without a label
    column and are written without one.

    Roughly 90% of the training rows go to ``train.txt`` and 10% to
    ``valid.txt``; the split is driven by ``random`` seeded with 0.

    NOTE: another cell of this notebook defines a second ``preprocess`` with
    the same signature (it writes ``*_fm.txt`` files instead). Whichever cell
    was executed last is the one a later call uses.

    Args:
        datadir: directory containing the raw ``train.txt`` and ``test.txt``.
        outdir: directory for the generated files; created if missing.

    Returns:
        The fitted ``CategoryDictGenerator``.
    """
    train_path = os.path.join(datadir, 'train.txt')

    # Fail early on an unusable output location instead of after two full
    # passes over the training data.
    os.makedirs(outdir, exist_ok=True)

    dists = ContinuousFeatureGenerator(len(continous_features))
    dists.build(train_path, continous_features)

    dicts = CategoryDictGenerator(len(categorial_features))
    dicts.build(train_path, categorial_features, cutoff=10)

    def encode(features, shift):
        """Return the 13 + 26 encoded fields of one row as strings.

        ``shift`` is subtracted from every column index: 0 for rows that start
        with a label, 1 for test rows that have no label column.
        """
        continuous = [
            "{0:.6f}".format(dists.gen(i, features[col - shift])).rstrip('0').rstrip('.')
            for i, col in enumerate(continous_features)
        ]
        categorical = [
            str(dicts.gen(i, features[col - shift]))
            for i, col in enumerate(categorial_features)
        ]
        return continuous + categorical

    random.seed(0)

    # 90% of the data are used for training, and 10% of the data are used
    # for validation.
    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train, \
            open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid, \
            open(train_path, 'r') as f:
        for line in tqdm_notebook(f):
            features = line.rstrip('\n').split('\t')
            row = ','.join(encode(features, 0) + [features[0]]) + '\n'
            if random.randint(0, 9999) % 10 != 0:
                out_train.write(row)
            else:
                out_valid.write(row)

    with open(os.path.join(outdir, 'test.txt'), 'w') as out, \
            open(os.path.join(datadir, 'test.txt'), 'r') as f:
        for line in tqdm_notebook(f):
            features = line.rstrip('\n').split('\t')
            out.write(','.join(encode(features, 1)) + '\n')

    return dicts

In [7]:
def preprocess(datadir, outdir):
    """
    Convert the raw Criteo files into libsvm-style ``index:value`` rows.

    Reads ``<datadir>/train.txt`` and ``<datadir>/test.txt`` (tab-separated)
    and writes ``train_fm.txt``, ``valid_fm.txt`` and ``test_fm.txt`` into
    ``outdir``. Fields are separated by single spaces:

    * training / validation rows start with the label;
    * the 13 integer features follow as ``i:value`` with ``i`` in 0..12 and
      ``value`` the min-max normalised number;
    * each categorical feature follows as ``j:1`` where ``j`` is
      ``13 + offset of that feature + dictionary index of the value``, the
      offset being the summed dictionary sizes of the preceding features.

    Test rows are read without a label column and are written without one.
    Roughly 90% of the training rows go to ``train_fm.txt`` and 10% to
    ``valid_fm.txt``; the split is driven by ``random`` seeded with 0.

    NOTE: another cell of this notebook defines a second ``preprocess`` with
    the same signature (it writes comma-separated files and returns the
    dictionary generator). Whichever cell was executed last is the one a
    later call uses.

    Args:
        datadir: directory containing the raw ``train.txt`` and ``test.txt``.
        outdir: directory for the generated files; created if missing.

    Returns:
        List with the dictionary size of each categorical feature.
    """
    train_path = os.path.join(datadir, 'train.txt')

    # Fail early on an unusable output location instead of after two full
    # passes over the training data.
    os.makedirs(outdir, exist_ok=True)

    dists = ContinuousFeatureGenerator(len(continous_features))
    dists.build(train_path, continous_features)

    dicts = CategoryDictGenerator(len(categorial_features))
    dicts.build(train_path, categorial_features, cutoff=10)

    # Offset of each categorical feature inside the shared index space:
    # the running sum of the sizes of all preceding dictionaries.
    dict_sizes = dicts.dicts_sizes()
    categorial_feature_offset = [0]
    for i in range(1, len(categorial_features)):
        offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
        categorial_feature_offset.append(offset)

    # Categorical indices start right after the continuous ones.
    num_continuous = len(continous_features)

    def encode(features, shift):
        """Return the ``index:value`` fields of one row as strings.

        ``shift`` is subtracted from every column index: 0 for rows that start
        with a label, 1 for test rows that have no label column.
        """
        pairs = []
        for i, col in enumerate(continous_features):
            val = dists.gen(i, features[col - shift])
            pairs.append('{}:{}'.format(
                i, "{0:.6f}".format(val).rstrip('0').rstrip('.')))
        for i, col in enumerate(categorial_features):
            index = (dicts.gen(i, features[col - shift])
                     + categorial_feature_offset[i] + num_continuous)
            pairs.append('{}:1'.format(index))
        return pairs

    random.seed(0)

    # 90% of the data are used for training, and 10% of the data are used
    # for validation. Files are written in binary mode with an explicit "\n"
    # so no newline translation takes place.
    with open(os.path.join(outdir, 'train_fm.txt'), 'wb') as train_fm, \
            open(os.path.join(outdir, 'valid_fm.txt'), 'wb') as valid_fm, \
            open(train_path, 'r') as f:
        for line in tqdm_notebook(f):
            features = line.rstrip('\n').split('\t')
            fields = [str(features[0])] + encode(features, 0)
            row = (" ".join(fields) + "\n").encode('ascii')
            if random.randint(0, 9999) % 10 != 0:
                train_fm.write(row)
            else:
                valid_fm.write(row)

    with open(os.path.join(outdir, 'test_fm.txt'), 'wb') as test_fm, \
            open(os.path.join(datadir, 'test.txt'), 'r') as f:
        for line in tqdm_notebook(f):
            features = line.rstrip('\n').split('\t')
            test_fm.write((" ".join(encode(features, 1)) + "\n").encode('ascii'))

    return dict_sizes

In [54]:
# def preprocess(datadir, outdir):
#     """
#     All the 13 integer features are normalzied to continous values and these
#     continous features are combined into one vecotr with dimension 13.

#     Each of the 26 categorical features are one-hot encoded and all the one-hot
#     vectors are combined into one sparse binary vector.
#     """
#     dists = ContinuousFeatureGenerator(len(continous_features))
#     dists.build(os.path.join(datadir, 'train.txt'), continous_features)

#     dicts = CategoryDictGenerator(len(categorial_features))
#     dicts.build(
#         os.path.join(datadir, 'train.txt'), categorial_features, cutoff=200)#200 50

#     dict_sizes = dicts.dicts_sizes()
#     categorial_feature_offset = [0]
#     for i in range(1, len(categorial_features)):
#         offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
#         categorial_feature_offset.append(offset)

#     random.seed(0)

#     # 90% of the data are used for training, and 10% of the data are used
#     # for validation.
#     train_ffm = open(os.path.join(outdir, 'train_ffm.txt'), 'w')
#     valid_ffm = open(os.path.join(outdir, 'valid_ffm.txt'), 'w')

# #     train_lgb = open(os.path.join(outdir, 'train_lgb.txt'), 'w')
# #     valid_lgb = open(os.path.join(outdir, 'valid_lgb.txt'), 'w')

#     with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
#         with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
#             with open(os.path.join(datadir, 'train.txt'), 'r') as f:
#                 for line in tqdm_notebook(f):
#                     features = line.rstrip('\n').split('\t')
#                     continous_feats = []
#                     continous_vals = []
#                     for i in range(0, len(continous_features)):

#                         val = dists.gen(i, features[continous_features[i]])
#                         continous_vals.append(
#                             "{0:.6f}".format(val).rstrip('0').rstrip('.'))
#                         continous_feats.append(
#                             "{0:.6f}".format(val).rstrip('0').rstrip('.'))#('{0}'.format(val))

#                     categorial_vals = []
#                     categorial_lgb_vals = []
#                     for i in range(0, len(categorial_features)):
#                         val = dicts.gen(i, features[categorial_features[i]]) + categorial_feature_offset[i]
#                         categorial_vals.append(str(val))
#                         val_lgb = dicts.gen(i, features[categorial_features[i]])
#                         categorial_lgb_vals.append(str(val_lgb))

#                     continous_vals = ','.join(continous_vals)
#                     categorial_vals = ','.join(categorial_vals)
#                     label = features[0]
#                     if random.randint(0, 9999) % 10 != 0:
#                         out_train.write(','.join(
#                             [continous_vals, categorial_vals, label]) + '\n')
#                         train_ffm.write('\t'.join(label) + '\t')
#                         train_ffm.write('\t'.join(
#                             ['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
#                         train_ffm.write('\t'.join(
#                             ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                        
# #                         train_lgb.write('\t'.join(label) + '\t')
# #                         train_lgb.write('\t'.join(continous_feats) + '\t')
# #                         train_lgb.write('\t'.join(categorial_lgb_vals) + '\n')

#                     else:
#                         out_valid.write(','.join(
#                             [continous_vals, categorial_vals, label]) + '\n')
#                         valid_ffm.write('\t'.join(label) + '\t')
#                         valid_ffm.write('\t'.join(
#                             ['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
#                         valid_ffm.write('\t'.join(
#                             ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                                                
# #                         valid_lgb.write('\t'.join(label) + '\t')
# #                         valid_lgb.write('\t'.join(continous_feats) + '\t')
# #                         valid_lgb.write('\t'.join(categorial_lgb_vals) + '\n')
                        
#     train_ffm.close()
#     valid_ffm.close()

# #     train_lgb.close()
# #     valid_lgb.close()

#     test_ffm = open(os.path.join(outdir, 'test_ffm.txt'), 'w')
# #     test_lgb = open(os.path.join(outdir, 'test_lgb.txt'), 'w')

#     with open(os.path.join(outdir, 'test.txt'), 'w') as out:
#         with open(os.path.join(datadir, 'test.txt'), 'r') as f:
#             for line in tqdm_notebook(f):
#                 features = line.rstrip('\n').split('\t')

#                 continous_feats = []
#                 continous_vals = []
#                 for i in range(0, len(continous_features)):
#                     val = dists.gen(i, features[continous_features[i] - 1])
#                     continous_vals.append(
#                         "{0:.6f}".format(val).rstrip('0').rstrip('.'))
#                     continous_feats.append(
#                             "{0:.6f}".format(val).rstrip('0').rstrip('.'))#('{0}'.format(val))

#                 categorial_vals = []
#                 categorial_lgb_vals = []
#                 for i in range(0, len(categorial_features)):
#                     val = dicts.gen(i,
#                                     features[categorial_features[i] -
#                                              1]) + categorial_feature_offset[i]
#                     categorial_vals.append(str(val))

#                     val_lgb = dicts.gen(i, features[categorial_features[i] - 1])
#                     categorial_lgb_vals.append(str(val_lgb))

#                 continous_vals = ','.join(continous_vals)
#                 categorial_vals = ','.join(categorial_vals)

#                 out.write(','.join([continous_vals, categorial_vals]) + '\n')
                
#                 test_ffm.write('\t'.join(['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
#                 test_ffm.write('\t'.join(
#                     ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                                                                
# #                 test_lgb.write('\t'.join(continous_feats) + '\t')
# #                 test_lgb.write('\t'.join(categorial_lgb_vals) + '\n')

#     test_ffm.close()
# #     test_lgb.close()
#     return dict_sizes

In [55]:
preprocess(datadir="../data/criteo", outdir="../data/criteo/output")

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




[532,
 533,
 11289,
 13822,
 151,
 13,
 9005,
 254,
 4,
 11557,
 4191,
 11560,
 3037,
 27,
 5640,
 12459,
 11,
 2790,
 1391,
 4,
 12005,
 10,
 15,
 9617,
 52,
 7599]

In [87]:
preprocess(datadir=datadir, outdir=outdir)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




[532,
 533,
 11289,
 13822,
 151,
 13,
 9005,
 254,
 4,
 11557,
 4191,
 11560,
 3037,
 27,
 5640,
 12459,
 11,
 2790,
 1391,
 4,
 12005,
 10,
 15,
 9617,
 52,
 7599]

In [8]:
preprocess(datadir=datadir, outdir=outdir)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




[1458,
 555,
 193948,
 138800,
 306,
 18,
 11970,
 634,
 4,
 42646,
 5178,
 192772,
 3175,
 27,
 11422,
 181074,
 11,
 4654,
 2031,
 4,
 189656,
 17,
 16,
 59696,
 85,
 45570]

In [9]:
dim_dict = preprocess(datadir=datadir, outdir=outdir)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [12]:
dim_dict.dicts_sizes()

[1458,
 555,
 193948,
 138800,
 306,
 18,
 11970,
 634,
 4,
 42646,
 5178,
 192772,
 3175,
 27,
 11422,
 181074,
 11,
 4654,
 2031,
 4,
 189656,
 17,
 16,
 59696,
 85,
 45570]

In [55]:
dim_dict = [1458,
 555,
 193948,
 138800,
 306,
 18,
 11970,
 634,
 4,
 42646,
 5178,
 192772,
 3175,
 27,
 11422,
 181074,
 11,
 4654,
 2031,
 4,
 189656,
 17,
 16,
 59696,
 85,
 45570]

In [56]:
len(dim_dict)

26

In [None]:
f.write((line_pattern % feat).encode('ascii'))

In [183]:
# Rewrite the tab-separated training sample as space-separated
# "label index:value ..." rows, printing every value with 16 decimals.
with open(os.path.join("../data/criteo/output_fm", "train_sub1k_transform.txt"), "wb") as out:
    with open(os.path.join("../data/criteo/output_fm", "train_sub1k.txt"), "r", encoding="utf-8") as f:
        for v in f:
            res = []
            for i, pair in enumerate(v.split("\t")):
                if i != 0:
                    # Every field after the first is an "index:value" pair.
                    vals = pair.split(":")
                    res.append("%d:%.16f" % (int(vals[0]), float(vals[1])))
                else:
                    # The first field is parsed as the integer label.
                    val = int(pair)
                    res.append("%d" % val)
            # Bytes are written with an explicit "\n", so no newline
            # translation is applied to the output file.
            out.write((" ".join(res) + "\n").encode('ascii'))

In [195]:
# Rewrite the tab-separated test sample as space-separated
# "0 index:value ..." rows, printing every value with 16 decimals.
with open(os.path.join("../data/criteo/output_fm", "test_sub1k_transform.txt"), "wb") as out:
    with open(os.path.join("../data/criteo/output_fm", "test_sub1k.txt"), "r", encoding="utf-8") as f:
        for v in f:
            res = []
            for i, pair in enumerate(v.split("\t")):
                if i != 0:
                    # Every field after the first is an "index:value" pair.
                    vals = pair.split(":")
                    res.append("%d:%.16f" % (int(vals[0]), float(vals[1])))
                else:
                    # The first field is ignored and a constant label 0 is
                    # written in its place.
                    # NOTE(review): if test_sub1k.txt has no label column, this
                    # discards the first feature pair of every row - confirm.
                    val = 0
                    res.append("%d" % val)
            # Bytes are written with an explicit "\n", so no newline
            # translation is applied to the output file.
            out.write((" ".join(res) + "\n").encode('ascii'))

In [169]:
with open(os.path.join("../data/criteo/output_fm", "train_sub1k.txt"), "r") as f:
    v = f.readline()

In [170]:
v

'0\t0:0.05\t1:0.006633\t2:0.05\t3:0\t4:0.021594\t5:0.008\t6:0.15\t7:0.04\t8:0.362\t9:0.1\t10:0.2\t11:0\t12:0.04\t15:1\t555:1\t1078:1\t17797:1\t26190:1\t26341:1\t28570:1\t35361:1\t35613:1\t35984:1\t48424:1\t51364:1\t64053:1\t65964:1\t66206:1\t71628:1\t84088:1\t84119:1\t86889:1\t88280:1\t88283:1\t100288:1\t100300:1\t102447:1\t109932:1\t111823:1\n'

In [172]:
v.split("\t")

['0',
 '0:0.05',
 '1:0.006633',
 '2:0.05',
 '3:0',
 '4:0.021594',
 '5:0.008',
 '6:0.15',
 '7:0.04',
 '8:0.362',
 '9:0.1',
 '10:0.2',
 '11:0',
 '12:0.04',
 '15:1',
 '555:1',
 '1078:1',
 '17797:1',
 '26190:1',
 '26341:1',
 '28570:1',
 '35361:1',
 '35613:1',
 '35984:1',
 '48424:1',
 '51364:1',
 '64053:1',
 '65964:1',
 '66206:1',
 '71628:1',
 '84088:1',
 '84119:1',
 '86889:1',
 '88280:1',
 '88283:1',
 '100288:1',
 '100300:1',
 '102447:1',
 '109932:1',
 '111823:1\n']

In [158]:
list(v.split("\t"))[-2].split(":")[1]

'1'

In [181]:
import pandas as pd
import numpy as np
from sklearn.datasets import dump_svmlight_file

# Scratch example: build a 10-row frame of random features with a random
# -1 / +1 target and dump it in svmlight format.
# The random draws are unseeded, so every run produces different numbers.
df = pd.DataFrame()
df['Id'] = np.arange(10)
df['F1'] = np.random.rand(10,)
df['F2'] = np.random.rand(10,)
df['Target'] = list(map(lambda x: -1 if x < 0.5 else 1, np.random.rand(10,)))

# X = df[np.setdiff1d(df.columns,['Id','Target'])]
# "Id" is kept as a feature column here, alongside F1 and F2.
X = df[["Id", "F1", "F2"]]
y = df.Target

print(df)

# zero_based=True is passed so the written column indices start at 0.
dump_svmlight_file(X,y,'smvlight.dat',zero_based=True,multilabel=False)


   Id        F1        F2  Target
0   0  0.197551  0.987172      -1
1   1  0.979598  0.122393       1
2   2  0.367976  0.984271      -1
3   3  0.282004  0.946666       1
4   4  0.720250  0.589025      -1
5   5  0.077527  0.516489       1
6   6  0.405469  0.982702      -1
7   7  0.595486  0.850159      -1
8   8  0.436349  0.987760       1
9   9  0.319877  0.691997      -1


In [180]:
X

Unnamed: 0,F1,F2
0,0.123643,0.064664
1,0.591428,0.267432
2,0.987804,0.781589
3,0.599837,0.204119
4,0.40897,0.306321
5,0.673347,0.087296
6,0.883486,0.607485
7,0.077832,0.507009
8,0.439109,0.884698
9,0.888054,0.032524


In [88]:
from pathlib import Path

In [101]:
a = Path("../data/criteo/output_test/model.fm")

In [102]:
a.parent

WindowsPath('../data/criteo/output_test')

In [104]:
# Make sure the model file and its directory exist.
# parents=True also creates missing intermediate directories, and
# exist_ok=True avoids failing when the directory is already there.
a.parent.mkdir(parents=True, exist_ok=True)
# Only create the file when it is missing, so an existing file keeps its
# modification time.
if not a.exists():
    a.touch()

In [111]:
import pandas as pd

In [115]:
c = pd.DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"], index=["a", "b", "c"])

In [136]:
import sys
sys.path.append(r"D:\Users\hao.guo\deepctr\pywfm")

In [137]:
from pywfm import FM

In [110]:
with a.open(mode="r") as f:
    for v in f:
        if v.startswith("a"):
            print(v)
        else:
            continue

aaa



In [105]:
a.read().decode("utf-8").splitlines()

AttributeError: 'WindowsPath' object has no attribute 'read'

In [6]:
import subprocess

In [None]:
def split_csv(path, nr_thread, has_header):
    """
    Split a text/CSV file into at most ``nr_thread`` consecutive parts.

    Part ``k`` is written next to the source as ``<path>.__tmp__.<k>``. Rows
    keep their original order; every part except the last one used receives
    ``ceil(data_rows / nr_thread)`` rows. When ``has_header`` is true the
    first line of the source is treated as a header and repeated at the top
    of every part.

    Args:
        path: path of the file to split.
        nr_thread: number of parts to distribute the rows over (>= 1).
        has_header: whether the first line of the file is a header.

    Raises:
        ValueError: if ``nr_thread`` is smaller than 1.
    """
    if nr_thread < 1:
        raise ValueError('nr_thread must be >= 1, got {0}'.format(nr_thread))

    # Count the data rows in Python rather than shelling out to `wc -l`:
    # this works on every platform and never passes `path` through a shell.
    def count_data_lines():
        with open(path) as src:
            total = sum(1 for _ in src)
        if has_header:
            return max(total - 1, 0)
        return total

    # Open part `idx` for writing and emit the header when there is one.
    def open_with_header_written(idx, header):
        part = open(path + '.__tmp__.{0}'.format(idx), 'w')
        if has_header:
            part.write(header)
        return part

    # Integer ceiling division: a float quotient would make the modulo test
    # below almost never hit 0, leaving every row in part 0.
    nr_lines_per_thread = max(1, (count_data_lines() + nr_thread - 1) // nr_thread)

    with open(path) as src:
        # Consume the header (if any) so the loop below only sees data rows.
        header = src.readline() if has_header else ''

        # Walk the source once, rotating to the next part when one is full.
        idx = 0
        part = open_with_header_written(idx, header)
        try:
            for i, line in enumerate(src, start=1):
                part.write(line)
                # The last part absorbs whatever remains, so never rotate
                # past index nr_thread - 1.
                if i % nr_lines_per_thread == 0 and idx < nr_thread - 1:
                    part.close()
                    idx += 1
                    part = open_with_header_written(idx, header)
        finally:
            part.close()

In [10]:
import tensorflow as tf
import numpy as np

In [40]:
w = np.random.randn(16, 8)

In [41]:
a = tf.ones(shape=(50, 16))
w = tf.convert_to_tensor(w)
a = tf.expand_dims(a, axis=2)
a = tf.tile(a, [1, 1, 16])

In [42]:
w = tf.cast(w, dtype="float")

In [45]:
t = tf.tensordot(a, w, axes=([1], [0]))

In [46]:
t.shape

TensorShape([50, 16, 8])

In [9]:
np.log(0.05 / (1 - 0.05))

-2.9444389791664403

In [42]:
def arr2sparse(arr):
    """
    Build a ``tf.SparseTensor`` holding the non-zero entries of ``arr``.

    Args:
        arr: array-like accepted by ``tf.constant``.

    Returns:
        A ``tf.SparseTensor`` whose indices are the positions where ``arr``
        differs from 0, whose values are the entries at those positions, and
        whose dense shape is the shape of ``arr``.
    """
    dense = tf.constant(arr)
    # Coordinates of every entry that is not equal to zero.
    nonzero_positions = tf.where(tf.not_equal(dense, 0))
    nonzero_values = tf.gather_nd(dense, nonzero_positions)
    return tf.SparseTensor(nonzero_positions, nonzero_values, dense.shape)

In [43]:
ids = np.random.randint(50, size=(50, 6))
# ids = tf.convert_to_tensor(ids)
# ids = tf.SparseTensor(ids)
ids = arr2sparse(ids)

In [44]:
res = tf.nn.embedding_lookup_sparse(
    a, ids, sp_weights=None, combiner="sum"
)

In [49]:
nums = np.random.random(size=(50, 8))
nums = tf.constant(nums)

In [50]:
embeds = np.random.random(size=(8, 4))
embeds = tf.constant(embeds)

In [51]:
nums = tf.expand_dims(nums, axis=2)
nums.shape

TensorShape([50, 8, 1])

In [53]:
a = nums * embeds

In [54]:
a.shape

TensorShape([50, 8, 4])