# import library

In [None]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime, date
from collections import defaultdict
from multiprocessing import cpu_count, Pool
from tqdm import tqdm

import eli5
from eli5.lightgbm import explain_weights_lightgbm
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots
import shap 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# params

In [None]:
# Directory holding train.csv / test.csv, relative to the notebook.
PATH = os.path.join('..', 'remove_outlier_data')

# LightGBM training parameters (RMSE regression with gradient-boosted trees).
params = dict(
    # tree shape
    num_leaves=31,
    min_data_in_leaf=30,
    objective='regression',
    max_depth=-1,
    # boosting
    learning_rate=0.01,
    boosting='gbdt',
    # column / row subsampling
    feature_fraction=0.8,
    bagging_freq=1,
    bagging_fraction=0.8,
    bagging_seed=11,
    # evaluation and regularisation
    metric='rmse',
    lambda_l1=0.1,
    # runtime
    verbosity=-1,
    nthread=cpu_count(),
    random_state=6,
)

# features

In [None]:
# File names of the feature pickles to merge, grouped by their numeric prefix.
features = []

# f10x, f11x_{Y,N}, f12x
features.extend(f'f10{i}.pkl' for i in (2, 3))
features.extend(f'f11{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N'))
features.extend(f'f12{i}.pkl' for i in (1, 2))

# f20x, f21x_{Y,N}
features.extend(f'f20{i}.pkl' for i in (2, 3))
features.extend(f'f21{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N'))

# f40x, f41x_{Y,N}, f42x
features.extend(f'f40{i}.pkl' for i in (2, 3))
features.extend(f'f41{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N'))
features.extend(f'f42{i}.pkl' for i in (1, 2))

# read csv

In [None]:
# Load the base training table, then left-join every feature pickle onto it by card_id.
train = pd.read_csv(os.path.join(PATH, 'train.csv'))

for f in tqdm(features):
    # Each entry of `features` is a file name inside ../remove_outlier_feature.
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    # how='left' keeps every row of `train`, including cards absent from the feature table.
    train = pd.merge(train, t, on='card_id', how='left')

# date to int

In [None]:
# Purchase-date columns that may exist after the merges, including the
# _x/_y suffixed variants. Every one that is present in `train` is cast to
# int64 and scaled by 1e-9.
# NOTE(review): presumably these are datetime64[ns] columns, which would make
# the result epoch seconds — confirm against the feature pickles.
date_columns = [
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max_x', 'Y_new_auth_purchase_date_min_x',
    'N_new_auth_purchase_date_max_x', 'N_new_auth_purchase_date_min_x',
    'Y_new_auth_purchase_date_max_y', 'Y_new_auth_purchase_date_min_y',
    'N_new_auth_purchase_date_max_y', 'N_new_auth_purchase_date_min_y',
]

cols = train.columns.values
for f in date_columns:
    if f not in cols:
        # Column was not produced by the merged feature files; nothing to convert.
        continue
    train[f] = train[f].astype(np.int64) * 1e-9

# preprocess

In [None]:
# Target vector, taken before the column is dropped from the feature frame.
y = train['target']

# Identifier, raw date and target columns are excluded from the model inputs.
col_not_to_use = ['first_active_month', 'card_id', 'target']
col_to_use = [c for c in train.columns if c not in col_not_to_use]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original DataFrame (SettingWithCopy hazard).
train = train[col_to_use].copy()
train['feature_3'] = train['feature_3'].astype(int)

categorical_features = ['feature_1', 'feature_2', 'feature_3']

# Re-encode each categorical column as consecutive integer labels. The values
# are stringified first, so the encoder sees a single consistent dtype.
for col in categorical_features:
    lbl = LabelEncoder()
    train[col] = lbl.fit_transform(train[col].values.astype('str'))

gc.collect()

# model

In [None]:
# X is an alias of `train` (same DataFrame object, no copy is made).
X = train

In [None]:
# 5-fold cross-validation over the rows of X (shuffled, fixed seed).
folds = KFold(n_splits=5, shuffle=True, random_state=6)

# NOTE: `model`, `train_index` and `valid_index` are rebound on every
# iteration and no per-fold result is stored, so after the loop only the last
# fold's model and split remain available.
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    dtrain = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[valid_index], label=y.iloc[valid_index])

    # Up to 20000 boosting rounds; metrics are logged every 200 rounds and
    # training stops early after 20 rounds without validation improvement.
    model = lgb.train(
        params,
        dtrain,
        20000,          
        valid_sets=[dtrain, dvalid],
        verbose_eval=200,
        early_stopping_rounds=20)

# SHAP

In [None]:
# Explain the model left over from the training loop on the rows selected by
# `valid_index` (whatever split the loop last assigned).
X_valid = X.iloc[valid_index]

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_valid)

# Summary plot showing at most 300 features.
shap.summary_plot(shap_values, X_valid, max_display=300)

# libFFM

In [1]:
import os
import gc

import pandas as pd
import numpy as np
import pickle as pkl
from datetime import date
from tqdm import tqdm

from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

#==============================================================================
# Prefix used as the file name of the feature pickle written by this section.
PREF = 'f501'

# Name of the card identifier column.
KEY = 'card_id'

# =============================================================================
# def
# =============================================================================

def get_embed(x_input, x_size, k_latent):
    """Map one model input to a ``k_latent``-dimensional vector.

    Args:
        x_input: Keras tensor for a single field.
        x_size: Vocabulary size of the field. A positive value selects an
            embedding lookup; anything else selects a dense projection.
        k_latent: Width of the produced vector.

    Returns:
        The flattened embedding output, or the dense projection of ``x_input``.

    Both variants are L2-regularised with the module-level ``embedding_reg``,
    which must be defined before this function is called.
    """
    regularizer = l2(embedding_reg)

    if not x_size > 0:
        # No vocabulary available: project the raw input instead.
        return Dense(k_latent, kernel_regularizer=regularizer)(x_input)

    lookup = Embedding(x_size, k_latent, input_length=1,
                       embeddings_regularizer=regularizer)
    return Flatten()(lookup(x_input))


def build_model_1(X, fsize):
    """Build the factorization-style network and a companion feature extractor.

    Args:
        X: Not used by this function.
        fsize: One size per input field, passed to ``get_embed``; its length
            sets the number of model inputs.

    Returns:
        ``(model, model_features)``. ``model`` is compiled with mean squared
        error and Adam(clipnorm=0.5). ``model_features`` shares the same
        inputs and outputs every field's factor vector followed by every
        field's bias.

    Reads the module-level ``k_latent`` and ``kernel_reg``.
    """
    n_fields = len(fsize)
    input_x = [Input(shape=(1,)) for _ in range(n_fields)]

    # First-order terms: a width-1 vector per field.
    biases = []
    for field_input, field_size in zip(input_x, fsize):
        biases.append(get_embed(field_input, field_size, 1))

    # Latent factors: a width-k_latent vector per field.
    factors = []
    for field_input, field_size in zip(input_x, fsize):
        factors.append(get_embed(field_input, field_size, k_latent))

    # Interaction of each field with the sum of all *other* fields' factors:
    # (sum(factors) - v_i) . v_i
    factor_sum = Add()(factors)
    others = [Subtract()([factor_sum, v]) for v in factors]
    dots = [Dot(axes=1)([rest, v]) for rest, v in zip(others, factors)]

    merged = Concatenate()(biases + dots)
    merged = BatchNormalization()(merged)
    output = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(merged)

    model = Model(inputs=input_x, outputs=[output])
    model.compile(optimizer=Adam(clipnorm=0.5), loss='mean_squared_error')

    model_features = Model(inputs=input_x, outputs=factors + biases)
    return model, model_features

# =============================================================================
# main
# =============================================================================


# Load the base train/test tables.
train = pd.read_csv('../remove_outlier_data/train.csv')
test = pd.read_csv('../remove_outlier_data/test.csv')


# File names of the feature pickles to merge, grouped by their numeric prefix.
features = []
features +=  [f'f10{i}.pkl' for i in (2, 3)]
features += [f'f11{i}_{j}.pkl' for i in (1, 2) 
                               for j in ('Y', 'N')]
features += [f'f12{i}.pkl' for i in (1, 2)]


features += [f'f20{i}.pkl' for i in (2, 3)]
features += [f'f21{i}_{j}.pkl' for i in (1, 2)
                               for j in ('Y', 'N')]

features += [f'f40{i}.pkl' for i in (2, 3)]
features += [f'f41{i}_{j}.pkl' for i in (1, 2)
                               for j in ('Y', 'N')]
features += [f'f42{i}.pkl' for i in (1, 2)]

# Left-join every feature table onto both train and test by card_id.
for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = pd.merge(train, t, on='card_id', how='left')
    test = pd.merge(test, t, on='card_id', how='left')

# Stack train on top of test into a single frame; sort=False keeps the column order.
df = pd.concat([train, test], axis=0, sort=False)
# Drop the now-redundant per-split frames and reclaim their memory.
del train, test
gc.collect()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
100%|██████████| 22/22 [00:20<00:00,  1.09it/s]


140

In [2]:
df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,outliers_mean,first_active_month_year,first_active_month_weekday,first_active_month_month,...,cnt_std_4_2017_y,cnt_std_4_2018,cnt_std_5_2017_y,cnt_std_6_2017_y,cnt_std_7_2017_y,cnt_std_8_2017_y,cnt_std_9_2017_y,cnt_std_10_2017_y,cnt_std_11_2017_y,cnt_std_12_2017_y
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,0.009014,2017,3,6,...,0.0,0.0,0.0,0.707031,3.558594,2.548828,0.447266,0.514648,0.736816,0.850098
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,0.009898,2017,6,1,...,0.0,0.0,0.97168,1.738281,1.924805,0.577148,0.358643,0.344238,0.887695,2.158203
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,0.008133,2016,0,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.828125,0.0,0.0
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,0.00982,2017,4,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.316162,1.22168,0.0,0.447266
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,0.011128,2017,2,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.344727,1.06543


In [3]:
df.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'target', 'outliers_mean', 'first_active_month_year',
       'first_active_month_weekday', 'first_active_month_month',
       ...
       'cnt_std_4_2017_y', 'cnt_std_4_2018', 'cnt_std_5_2017_y',
       'cnt_std_6_2017_y', 'cnt_std_7_2017_y', 'cnt_std_8_2017_y',
       'cnt_std_9_2017_y', 'cnt_std_10_2017_y', 'cnt_std_11_2017_y',
       'cnt_std_12_2017_y'],
      dtype='object', length=599)

In [4]:
# Print the name of every column whose dtype string contains 'int'.
# (Float columns can be listed the same way by testing for 'float'.)
for c, d in df.dtypes.items():
    if 'int' not in str(d):
        continue
    print('int:', c)

int: feature_1
int: feature_2
int: feature_3
int: first_active_month_year
int: first_active_month_weekday
int: first_active_month_month
int: first_active_month_weekofyear
int: first_active_month_quarter
int: first_active_month_is_month_start
int: elapsed_time
int: days_feature_1
int: days_feature_2
int: days_feature_3
int: hist_transactions_count
int: hist_category_1_sum
int: hist_category_2_nunique
int: hist_category_3_nunique
int: hist_merchant_id_nunique
int: hist_state_id_nunique
int: hist_subsector_id_nunique
int: hist_city_id_nunique
int: hist_merchant_category_id_nunique
int: hist_installments_nunique
int: hist_purchase_month_max
int: hist_purchase_month_min
int: hist_month_diff_max
int: hist_month_diff_min
int: hist_purchase_date_diff
int: hist_purchase_date_uptonow
int: Y_hist_auth_category_1_sum
int: Y_hist_auth_category_2_nunique
int: Y_hist_auth_category_3_nunique
int: Y_hist_auth_merchant_id_nunique
int: Y_hist_auth_state_id_nunique
int: Y_hist_auth_subsector_id_nunique
in

In [5]:
# Seed NumPy's global random number generator.
SEED = 18
np.random.seed(SEED)

# Columns used as the input fields of the factorization model.
# NOTE: this rebinds `features`, which previously held feature-pickle file names.
features = [
#     'feature_1', 'feature_2', 'feature_3', 
#     'days_feature_1', 'days_feature_2', 'days_feature_3',
    'hist_category_1_sum', 'hist_category_2_nunique', 'hist_category_3_nunique',
    'Y_new_auth_category_1_sum', 'Y_new_auth_category_2_nunique', 'Y_new_auth_category_3_nunique',
    'union_category_1_sum', 'union_category_2_nunique', 'union_category_3_nunique',
]

# One size per field: the largest observed value plus one.
# NOTE(review): int(...) raises if a column's max is NaN — presumably every
# column has at least one non-null value; confirm.
fsize = [int(df[f].max()) + 1 for f in features]

In [6]:
# Number of cards for every observed combination of the selected features.
X = df.groupby(features)['card_id'].count()

# unstack/stack round trip: combinations missing along the last feature level
# get an explicit count of 0 instead of being absent.
X = X.unstack().fillna(0)
X = X.stack().astype('float32')
# Regression target is log(1 + count); reset_index turns the feature levels
# back into ordinary columns.
X = np.log1p(X).reset_index()
X.columns = features + ['num']

# One input array per field, in the same order as `features` / `fsize`.
X_train = [X[f].values for f in features]
y_train = (X[['num']].values).astype('float32')

# Module-level hyper-parameters read by get_embed() and build_model_1().
k_latent = 3
embedding_reg = 0.0002
kernel_reg = 0.1

n_epochs = 20
batch_size = 2 ** 17

# Build the network exactly once (a second, identical call used to discard
# the first model right after constructing it).
model, model_features = build_model_1(X_train, fsize)
earlystopper = EarlyStopping(patience=0, verbose=50)

# NOTE(review): the validation data is the training data itself, so early
# stopping monitors the training fit rather than held-out performance.
history = model.fit(
    X_train,  y_train,
    epochs=n_epochs, batch_size=batch_size, verbose=1, shuffle=True,
    validation_data=(X_train, y_train),
    callbacks=[earlystopper],
)

# model_features outputs the per-field factor vectors first, then the biases.
X_pred = model_features.predict(X_train, batch_size=batch_size)

factors = X_pred[:len(features)]

# Extracted for completeness; the biases are currently not exported as columns.
biases = X_pred[len(features):2*len(features)]

# Attach every latent dimension of every field as its own column of X.
for f, X_p in zip(features, factors):
    for i in range(k_latent):
        X['%s_fm_factor_%d' % (f, i)] = X_p[:, i]

Train on 456216 samples, validate on 456216 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
X.shape

(456216, 37)

In [9]:
df.shape

(325540, 599)

In [10]:
X.head()

Unnamed: 0,hist_category_1_sum,hist_category_2_nunique,hist_category_3_nunique,Y_new_auth_category_1_sum,Y_new_auth_category_2_nunique,Y_new_auth_category_3_nunique,union_category_1_sum,union_category_2_nunique,union_category_3_nunique,num,...,Y_new_auth_category_3_nunique_fm_factor_2,union_category_1_sum_fm_factor_0,union_category_1_sum_fm_factor_1,union_category_1_sum_fm_factor_2,union_category_2_nunique_fm_factor_0,union_category_2_nunique_fm_factor_1,union_category_2_nunique_fm_factor_2,union_category_3_nunique_fm_factor_0,union_category_3_nunique_fm_factor_1,union_category_3_nunique_fm_factor_2
0,0,0,1,0.0,0.0,1.0,0,0,1,5.332719,...,0.070399,0.066835,-0.009863,-0.075,0.058415,0.002382,0.043799,0.108552,0.005149,-0.116753
1,0,0,1,0.0,0.0,1.0,0,0,2,3.610918,...,0.070399,0.066835,-0.009863,-0.075,0.058415,0.002382,0.043799,0.047153,-0.069048,-0.104757
2,0,0,1,0.0,0.0,1.0,0,0,3,0.0,...,0.070399,0.066835,-0.009863,-0.075,0.058415,0.002382,0.043799,-0.054853,-0.044287,0.122805
3,0,0,1,0.0,0.0,1.0,0,0,4,0.0,...,0.070399,0.066835,-0.009863,-0.075,0.058415,0.002382,0.043799,-0.052913,0.074434,-0.024833
4,0,0,1,0.0,0.0,2.0,0,0,1,0.0,...,-0.037986,0.066835,-0.009863,-0.075,0.058415,0.002382,0.043799,0.108552,0.005149,-0.116753


In [None]:
# Attach the learned factors to every card. X is keyed by the columns listed
# in `features`, so the join must use exactly those columns; the previous join
# on feature_1..3 referenced columns that X does not contain.
df = pd.merge(df[[KEY] + features], X, on=features, how='left')
# Keep only the card id plus the new columns; the join keys already exist in
# the base feature tables.
df = df.drop(features, axis=1)
df.to_pickle(f'../remove_outlier_feature/{PREF}.pkl')

# FFM

In [16]:
import numpy as np
from sklearn.base import BaseEstimator
from keras.layers import Input, Embedding, Dense,Flatten, merge,Activation
from keras.models import Model
from keras.regularizers import l2 as l2_reg
from keras import initializers
import itertools


def make_batches(size, batch_size):
    """Split ``range(size)`` into consecutive ``(start, end)`` index pairs.

    Every pair spans ``batch_size`` items except possibly the last one, whose
    end is clipped to ``size``. Returns an empty list when ``size`` is 0.
    """
    nb_batch = int(np.ceil(size / float(batch_size)))
    bounds = []
    for k in range(nb_batch):
        start = k * batch_size
        stop = min(size, (k + 1) * batch_size)
        bounds.append((start, stop))
    return bounds


def batch_generator(X,y,batch_size=128,shuffle=True):
    """Endlessly yield ``(X_batch, y_batch)`` mini-batches.

    Args:
        X: Sequence of arrays, one per model input; the sample count is read
            from ``X[0]``.
        y: Array of targets, indexed with the same sample indices as ``X``.
        batch_size: Maximum number of samples per batch.
        shuffle: When true, the sample order is reshuffled in place with
            ``np.random.shuffle`` at the start of every pass.

    Yields:
        ``X_batch`` (a list with one array per input) and ``y_batch``. The
        generator never terminates; it starts a new pass after the last batch.
    """
    n_samples = X[0].shape[0]
    order = np.arange(n_samples)
    while True:
        if shuffle:
            np.random.shuffle(order)
        for start, stop in make_batches(n_samples, batch_size):
            picked = order[start:stop]
            yield [field[picked] for field in X], y[picked]


def test_batch_generator(X,y,batch_size=128):
    """Yield ``(X_batch, y_batch)`` mini-batches once, in original order.

    Args:
        X: Sequence of arrays, one per model input; the sample count is read
            from ``X[0]``.
        y: Array of targets, indexed with the same sample indices as ``X``.
        batch_size: Maximum number of samples per batch.

    Yields:
        ``X_batch`` (a list with one array per input) and ``y_batch``. The
        generator is exhausted after a single pass over the data.
    """
    n_samples = X[0].shape[0]
    order = np.arange(n_samples)
    for start, stop in make_batches(n_samples, batch_size):
        picked = order[start:stop]
        yield [field[picked] for field in X], y[picked]


def predict_batch(model,X_t,batch_size=128):
    """Predict batch by batch and return all predictions as one flat array.

    Args:
        model: Object exposing ``predict(X_batch, batch_size=...)``.
        X_t: Sequence of arrays, one per model input.
        batch_size: Number of samples sent to ``model.predict`` at a time.

    Returns:
        1-D array holding the concatenated, flattened predictions.
    """
    # test_batch_generator needs a target array; zeros serve as a placeholder.
    placeholder_y = np.zeros(X_t[0].shape[0])
    chunks = [
        model.predict(X_batch, batch_size=batch_size)
        for X_batch, _ in test_batch_generator(X_t, placeholder_y, batch_size=batch_size)
    ]
    return np.concatenate(chunks).ravel()



def build_model(max_features,K=8,solver='adam',l2=0.0,l2_fm = 0.0):
    """Build and compile a factorization machine over categorical inputs.

    The model sums (a) the dot product of every pair of field embeddings and
    (b) a width-1 "linear" embedding per field, then applies a sigmoid.

    Args:
        max_features: Vocabulary size of each field; its length sets the
            number of int32 model inputs.
        K: Width of the pairwise-interaction embeddings.
        solver: Optimizer passed to ``Model.compile``.
        l2: L2 penalty on the linear embeddings.
        l2_fm: L2 penalty on the interaction embeddings.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy.

    Raises:
        ValueError: If ``max_features`` is empty.
    """
    # Imported here because this cell's file-level import only brings in the
    # legacy `merge` function, which Keras 2 replaced with merge layers.
    from keras.layers import Add, Dot

    if len(max_features) == 0:
        raise ValueError('max_features must contain at least one field size')

    inputs = []
    flatten_layers = []
    columns = range(len(max_features))

    # Interaction embeddings: one K-dimensional vector per field.
    for c in columns:
        inputs_c = Input(shape=(1,), dtype='int32', name='input_%s' % c)
        embed_c = Embedding(
            max_features[c],
            K,
            input_length=1,
            name='embed_%s' % c,
            embeddings_regularizer=l2_reg(l2_fm),
        )(inputs_c)

        inputs.append(inputs_c)
        flatten_layers.append(Flatten()(embed_c))

    # Second-order terms: dot product of every unordered pair of fields.
    fm_layers = [
        Dot(axes=1)([emb1, emb2])
        for emb1, emb2 in itertools.combinations(flatten_layers, 2)
    ]

    # First-order terms: one scalar weight per category of each field.
    for c in columns:
        embed_c = Embedding(
            max_features[c],
            1,
            input_length=1,
            name='linear_%s' % c,
            embeddings_regularizer=l2_reg(l2),
        )(inputs[c])
        fm_layers.append(Flatten()(embed_c))

    # Add() needs at least two inputs; a single-field model has only its
    # linear term, which is used directly.
    flatten = Add()(fm_layers) if len(fm_layers) > 1 else fm_layers[0]
    outputs = Activation('sigmoid', name='outputs')(flatten)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=solver, loss='binary_crossentropy')

    return model

class KerasFM(BaseEstimator):
    """Scikit-learn style wrapper around the network returned by build_model().

    Args:
        max_features: Vocabulary size of each input field (``None`` is treated
            as an empty list).
        K: Width of the interaction embeddings.
        solver: Optimizer passed to ``Model.compile``.
        l2: L2 penalty on the linear embeddings.
        l2_fm: L2 penalty on the interaction embeddings.
    """

    def __init__(self,max_features=None,K=8,solver='adam',l2=0.0,l2_fm = 0.0):
        # Constructor arguments are stored unchanged so that
        # BaseEstimator.get_params() can read them back.
        self.max_features = max_features
        self.K = K
        self.solver = solver
        self.l2 = l2
        self.l2_fm = l2_fm
        fields = [] if max_features is None else max_features
        self.model = build_model(fields,K,solver,l2=l2,l2_fm = l2_fm)

    def fit(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None):
        """Train on in-memory arrays and return ``self``.

        ``validation_data`` is forwarded to Keras (it used to be replaced by
        ``None`` and silently ignored).
        """
        self.model.fit(
            X, y,
            batch_size=batch_size,
            epochs=nb_epoch,
            shuffle=shuffle,
            verbose=verbose,
            validation_data=validation_data,
        )
        return self

    def fit_generator(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None,callbacks=None):
        """Train from mini-batch generators and return ``self``.

        Args:
            X, y: Training inputs (sequence of arrays) and targets.
            batch_size: Samples per generated batch.
            nb_epoch: Number of passes over the training data.
            shuffle: Whether the training batches are reshuffled every pass.
            verbose: Keras verbosity level.
            validation_data: Optional ``(X_val, y_val)`` pair.
            callbacks: Optional list of Keras callbacks.
        """
        tr_gen = batch_generator(X,y,batch_size=batch_size,shuffle=shuffle)
        # Keras 2 counts generator *batches* per epoch, not samples.
        steps_per_epoch = int(np.ceil(X[-1].shape[0] / float(batch_size)))

        if validation_data is not None:
            X_test,y_test = validation_data
            te_gen = batch_generator(X_test,y_test,batch_size=batch_size,shuffle=False)
            validation_steps = int(np.ceil(X_test[-1].shape[0] / float(batch_size)))
        else:
            te_gen = None
            validation_steps = None

        self.model.fit_generator(
            tr_gen,
            steps_per_epoch=steps_per_epoch,
            epochs=nb_epoch,
            verbose=verbose,
            callbacks=callbacks,
            validation_data=te_gen,
            validation_steps=validation_steps,
            max_queue_size=10,
        )
        return self

    def predict(self,X,batch_size=128):
        """Return the flattened model predictions for ``X``."""
        y_preds = predict_batch(self.model,X,batch_size=batch_size)
        return y_preds

In [69]:
# Directory holding train.csv / test.csv, relative to the notebook.
PATH = os.path.join('..', 'remove_outlier_data')

# Column used as the join key when merging feature tables.
KEY = 'card_id'

In [80]:
# Feature pickles used for the FFM experiment: only the f10x, f20x and f40x
# files. The f11x/f12x/f21x/f41x/f42x groups and the "every file in
# ../remove_outlier_feature" variant are intentionally left out here.
features = []
for prefix in ('f10', 'f20', 'f40'):
    features.extend(f'{prefix}{i}.pkl' for i in (2, 3))

In [81]:
# Load the base train/test tables.
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

# Left-join every selected feature table onto both frames by KEY.
for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = pd.merge(train, t, on=KEY, how='left')
    test = pd.merge(test, t, on=KEY, how='left')

100%|██████████| 6/6 [00:03<00:00,  1.73it/s]


In [82]:
# Purchase-date columns that may exist after the merges, including the
# _x/_y suffixed variants. Every one that is present in `train` is cast to
# int64 and scaled by 1e-9 in both train and test.
# NOTE(review): presumably these are datetime64[ns] columns, which would make
# the result epoch seconds — confirm against the feature pickles.
date_columns = [
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max_x', 'Y_new_auth_purchase_date_min_x',
    'N_new_auth_purchase_date_max_x', 'N_new_auth_purchase_date_min_x',
    'Y_new_auth_purchase_date_max_y', 'Y_new_auth_purchase_date_min_y',
    'N_new_auth_purchase_date_max_y', 'N_new_auth_purchase_date_min_y',
]

cols = train.columns.values
for f in date_columns:
    if f not in cols:
        # Column was not produced by the merged feature files; nothing to convert.
        continue
    train[f] = train[f].astype(np.int64) * 1e-9
    test[f] = test[f].astype(np.int64) * 1e-9

In [83]:
# Target column of the training frame (a pandas Series).
y = train['target']

In [84]:
# Names of the columns of `train` whose dtype string starts with 'int'.
int_cols = [name for name, dtype in train.dtypes.items()
            if str(dtype).startswith('int')]
len(int_cols)

39

In [85]:
# Per integer column: its largest value in `train`, plus one.
max_features = []
for name in int_cols:
    max_features.append(train[name].max() + 1)
len(max_features)

39

In [86]:
# For every integer column, print the number of distinct values found in
# train and in test.
for c in int_cols:
    trainno = len(train[c].unique())
    testno = len(test[c].unique())
    print(c,trainno,testno)

feature_1 5 5
feature_2 3 3
feature_3 2 2
first_active_month_year 8 8
first_active_month_weekday 7 7
first_active_month_month 12 12
first_active_month_weekofyear 21 21
first_active_month_quarter 4 4
first_active_month_is_month_start 1 1
elapsed_time 75 75
days_feature_1 224 223
days_feature_2 177 171
days_feature_3 75 76
hist_transactions_count 1023 943
hist_category_1_sum 1014 922
hist_category_2_nunique 6 6
hist_category_3_nunique 4 4
hist_merchant_id_nunique 313 296
hist_state_id_nunique 20 19
hist_subsector_id_nunique 34 34
hist_city_id_nunique 58 49
hist_merchant_category_id_nunique 92 87
hist_installments_nunique 13 13
hist_purchase_month_max 11 11
hist_purchase_month_min 11 11
hist_month_diff_max 14 14
hist_month_diff_min 14 14
hist_purchase_date_diff 422 420
hist_purchase_date_uptonow 391 388
union_transactions_count 1039 941
union_category_1_sum 1018 926
union_category_2_nunique 6 6
union_category_3_nunique 4 4
union_merchant_id_nunique 344 327
union_state_id_nunique 22 20
uni

In [87]:
# Integer columns and the target column of `train` as NumPy arrays.
x = train[int_cols].values
y = train['target'].values

In [88]:
# Give `test` a placeholder 'target' column (all zeros) at column position 1,
# modifying the frame in place.
test.insert(1,'target',0)

In [89]:
test.head()

Unnamed: 0,first_active_month,target,card_id,feature_1,feature_2,feature_3,outliers_mean,first_active_month_year,first_active_month_weekday,first_active_month_month,...,union_purchase_amount_mean_mean,union_purchase_amount_mean_std,union_purchase_amount_std_mean,union_purchase_amount_std_std,union_installments_mean_mean,union_installments_mean_std,union_installments_sum_mean,union_installments_sum_std,union_installments_std_mean,union_installments_std_std
0,2017-04-01,0,C_ID_0ab67a22ab,3,3,1,0.016459,2017,5,4,...,-0.602051,0.103577,0.151367,0.126709,1.841797,0.877441,14.601562,12.820312,1.272461,1.043945
1,2017-01-01,0,C_ID_130fd0cbdd,2,3,0,0.009522,2017,6,1,...,-0.640137,0.02655,0.10907,0.058075,1.101562,0.17627,11.75,7.667969,0.278564,0.402588
2,2017-08-01,0,C_ID_b709037bc5,5,1,1,0.015883,2017,1,8,...,0.197144,0.851562,0.644531,0.396973,4.785156,2.705078,8.289062,5.964844,2.376953,2.761719
3,2017-12-01,0,C_ID_d27d835a9f,2,1,0,0.013387,2017,4,12,...,-0.570801,0.051727,0.105774,0.04834,2.199219,1.477539,13.796875,10.132812,2.347656,2.720703
4,2015-12-01,0,C_ID_2b5e3df5c2,5,1,1,0.015883,2015,1,12,...,1.09082,2.490234,2.353516,2.533203,1.086914,0.149658,8.398438,6.726562,0.212646,0.322021


In [90]:
categories = int_cols
numerics = []

# Field kind per column: 0 = numeric (value written as-is),
# 1 = categorical (written as a one-hot feature code).
catdict = {}
catdict.update((col, 0) for col in numerics)
catdict.update((col, 1) for col in categories)

# catcodes[column][value] -> feature code. Codes are shared by train and test
# and keep growing, so values first seen in test receive fresh codes.
catcodes = {}
currentcode = len(numerics)


def _write_libffm(frame, path, field_kinds, codes, next_code):
    """Write ``frame`` to ``path`` as ``label field:feature:value`` lines.

    Args:
        frame: DataFrame with a 'target' column and every key of ``field_kinds``.
        path: Output text file (overwritten).
        field_kinds: Ordered mapping column -> 0 (numeric) or 1 (categorical);
            a column's position in the mapping is its field index.
        codes: Mapping column -> {value: code}; updated in place whenever a
            new categorical value is encountered.
        next_code: Last feature code handed out so far.

    Returns:
        The last feature code handed out after processing ``frame``.
    """
    fields = list(field_kinds)
    # Plain tuples avoid building a Series and a dict for every single row.
    rows = frame[['target'] + fields].itertuples(index=False, name=None)
    with open(path, "w") as text_file:
        for n, row in enumerate(rows):
            if n % 100000 == 0:
                print('Row', n)
            # NOTE(review): int() truncates the label toward zero; for a
            # continuous target this discards the fractional part — confirm
            # this is intended.
            parts = [str(int(row[0]))]
            for i, (col, value) in enumerate(zip(fields, row[1:])):
                if field_kinds[col] == 0:
                    parts.append("%d:%d:%s" % (i, i, value))
                else:
                    col_codes = codes.setdefault(col, {})
                    if value not in col_codes:
                        next_code += 1
                        col_codes[value] = next_code
                    parts.append("%d:%d:1" % (i, col_codes[value]))
            text_file.write(" ".join(parts) + "\n")
    return next_code


# Train first so its categories receive the lowest codes, then test.
currentcode = _write_libffm(train, "alltrainffm.txt", catdict, catcodes, currentcode)
currentcode = _write_libffm(test, "alltestffm.txt", catdict, catcodes, currentcode)

Row 0
Row 100000
Row 200000
Row 0
Row 100000


In [92]:
import numpy as np
import xlearn as xl
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load dataset: integer feature columns and the target of `train`
X = train[int_cols].values
y = train['target'].values

# Hold out 30% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

# param:
#  0. task: regression ('reg')
#  1. init (model scale): 0.1
#  2. epoch number: 10
#  3. learning rate: 0.1
#  4. regular lambda: 1.0
#  5. use sgd optimization method
#  6. evaluation metric: rmse
# NOTE(review): the recorded output of the following cell shows all-NaN
# predictions; presumably training diverged on the unscaled integer features
# with lr=0.1 — confirm.
linear_model = xl.LRModel(task='reg', init=0.1,
                          epoch=10, lr=0.1,
                          reg_lambda=1.0, opt='sgd', metric='rmse')

# Start to train, evaluating on the held-out split
linear_model.fit(X_train, y_train,
                 eval_set=[X_val, y_val],
                 is_lock_free=False)

# Generate predictions for the validation rows
y_pred = linear_model.predict(X_val)

In [102]:
y_pred

array([nan, nan, nan, ..., nan, nan, nan])

In [None]:
import xlearn as xl

# Training task
ffm_model = xl.create_ffm()                # Use field-aware factorization machine (ffm)
# Path of training data. The export cell writes "alltrainffm.txt"; the
# previous "alltrainffm.txt.txt" pointed at a file that is never created.
ffm_model.setTrain("./alltrainffm.txt")

# param:
#  0. task: binary classification
#  1. learning rate : 0.2
#  2. regular lambda : 0.002
# NOTE(review): the linear model above is trained with task='reg' on the same
# target; confirm that 'binary' is the intended task here.
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}

# Train model and write it to ./model.out
ffm_model.fit(param, "./model.out")