In [1]:
import os
import sys
import gc
import warnings

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from tqdm import tqdm
from functools import partial
from scipy.stats import skew, kurtosis, iqr
from sklearn.externals import joblib

%matplotlib inline

In [2]:
# Show every column when a DataFrame is rendered (no "..." truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Directory of the raw input CSVs; PATH is also read by later cells.
PATH = os.path.join('..', 'input')

# Read train first, then test, from that directory.
train, test = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Names of the object-dtype (string-like) columns of `train`, in column order.
categorical_columns = [
    name for name, dtype in train.dtypes.items() if dtype == 'object'
]

In [None]:
categorical_columns

In [None]:
# Order rows by first_active_month and renumber the index from 0.
train = (
    train
    .sort_values(by='first_active_month')
    .reset_index(drop=True)
)

In [None]:
train.head()

In [None]:
# Order rows by first_active_month and renumber the index from 0.
test = (
    test
    .sort_values(by='first_active_month')
    .reset_index(drop=True)
)

In [None]:
test.head()

In [None]:
train.nunique()

In [None]:
test.nunique()

In [None]:
# Load historical transactions from '../remove_outlier_data', not from the raw
# '../input' directory used for train/test above.
# NOTE(review): the directory name suggests an upstream outlier filter; that
# step is not visible in this notebook — confirm how the file was produced.
historical_transactions = pd.read_csv('../remove_outlier_data/historical_transactions.csv')

In [None]:
historical_transactions.nunique()

In [None]:
# Order transactions by purchase_date and renumber the index from 0.
historical_transactions = (
    historical_transactions
    .sort_values(by='purchase_date')
    .reset_index(drop=True)
)

In [None]:
historical_transactions.head()

In [None]:
# Load the new-merchant transactions from the raw '../input' directory.
# (historical_transactions above is read from '../remove_outlier_data'
# instead, so the two tables come from different sources at this point.)
new_merchant_transactions = pd.read_csv(os.path.join('../input', 'new_merchant_transactions.csv'))

In [None]:
new_merchant_transactions.head()

In [None]:
new_merchant_transactions.nunique()

In [None]:
new_merchant_transactions.authorized_flag.unique()

In [None]:
# Load the merchants table from the '../remove_outlier_data' directory.
merchants = pd.read_csv('../remove_outlier_data/merchants.csv')

In [None]:
merchants.head()

In [None]:
merchants.nunique()

In [None]:
# File names of the precomputed feature pickles, in the order they are merged.
# Grouped by file-name family: f10x, f11x_{Y,N}, f12x, f13x, f20x, f23x, f30x.
features = (
    [f'f10{i}.pkl' for i in (2, 4)]
    + [f'f11{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')]
    + [f'f12{i}.pkl' for i in (1,)]
    + [f'f13{i}.pkl' for i in (1, 2)]
    + [f'f20{i}.pkl' for i in (2,)]
    + [f'f23{i}.pkl' for i in (1, 2)]
    + [f'f30{i}.pkl' for i in (2, 3, 4,)]
)

In [None]:
# Join key shared by train/test and every feature table.
KEY = 'card_id'

# Start again from the CSVs under PATH, then left-join each feature pickle
# onto both frames so every train/test row is kept.
train, test = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

for f in tqdm(features):
    # NOTE: read_pickle executes whatever the file contains; only load
    # pickles produced by this project.
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = train.merge(t, how='left', on=KEY)
    test = test.merge(t, how='left', on=KEY)

In [None]:
# Convert the purchase-date min/max feature columns to float seconds since
# the Unix epoch (nanosecond integer * 1e-9).
#
# Fixes relative to the original cell:
#   * The feature tables are LEFT-joined, so a card without matching rows has
#     a missing date. Casting a missing datetime straight to int64 yields the
#     int64-minimum sentinel (or raises, depending on the pandas version)
#     rather than a missing value. Missing dates now stay NaN.
#   * The original checked membership against train only and then indexed
#     test unconditionally; each frame is now checked on its own.
# NOTE(review): assumes these columns are datetime-like (or nanosecond
# integers) — confirm against the code that writes the feature pickles.
cols = train.columns.values  # kept so the name stays defined for later cells
for f in [
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
]:
    for frame in (train, test):
        if f not in frame.columns:
            continue
        stamps = pd.to_datetime(frame[f])
        # Fill missing stamps with the epoch only so the integer cast is
        # well-defined; those rows are masked back to NaN on the next line.
        seconds = stamps.fillna(pd.Timestamp(0)).astype(np.int64) * 1e-9
        frame[f] = seconds.where(stamps.notna())

In [None]:
# Move the label out of the feature frame: `y` holds it, `train` no longer does.
y = train.pop('target')

In [None]:
train.nunique()

In [None]:
# For each column of train, print its name followed by the number of distinct
# values in train and in test.
# test[f] raises KeyError for any column that exists only in train.
for f in train.columns:
    print(f, train[f].nunique(), test[f].nunique())

In [None]:
historical_transactions.head()

In [None]:
# Flag transactions whose `installments` value equals -1 (1 = equals -1, 0 = otherwise).
# Fix: the original called np.where once per row through Series.apply, which
# runs a Python-level call per element and returns a 0-d array for each one.
# The vectorized comparison gives a plain integer column in a single pass.
historical_transactions['installments_exception'] = (
    historical_transactions['installments'].eq(-1).astype(int)
)

In [None]:
new_merchant_transactions.query('installments == -1')

In [None]:
historical_transactions.head()

In [None]:
train.head()

In [None]:
# Two ways of counting distinct card ids. nunique() ignores missing values by
# default while unique() keeps NaN as a value, so the two numbers differ only
# when card_id contains missing entries.
train.card_id.nunique(), len(train.card_id.unique())

In [None]:
import os
import gc

import pandas as pd
import numpy as np
import pickle as pkl
from datetime import date

from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

# Identifier string for this feature set; not referenced by any other cell
# visible in this notebook.
PREF = 'f503'

# Join key shared by the card-level tables.
KEY = 'card_id'

# Seed NumPy's global random generator.
# NOTE(review): only NumPy is seeded here; whether that makes the Keras
# training below reproducible depends on the backend — confirm if it matters.
SEED = 18
np.random.seed(SEED)

# =============================================================================
# def
# =============================================================================
def get_embed(x_input, x_size, k_latent):
    """Map one input tensor to a `k_latent`-wide vector.

    x_input:  Keras tensor for a single field.
    x_size:   vocabulary size of the field. A positive value selects an
              Embedding lookup (flattened afterwards); zero or a negative
              value selects a Dense projection instead.
    k_latent: width of the returned vector.

    The L2 strength is read from the module-level `embedding_reg`, which must
    be defined before this function is called.
    """
    if x_size <= 0:
        # No vocabulary: project the input with a regularised Dense layer.
        return Dense(k_latent, kernel_regularizer=l2(embedding_reg))(x_input)

    lookup = Embedding(x_size, k_latent, input_length=1,
                       embeddings_regularizer=l2(embedding_reg))
    embedded = lookup(x_input)
    # Drop the length-1 sequence axis added by the Embedding layer.
    return Flatten()(embedded)


def build_model_1(X, fsize):
    """Build a factorization-machine-style Keras model and a feature extractor.

    X:     unused; kept so existing call sites keep working.
    fsize: one size per input field, passed to `get_embed` as `x_size`.

    Reads the module-level globals `k_latent` and `kernel_reg` (and, through
    `get_embed`, `embedding_reg`).

    Returns `(model, model_features)`:
      * model          - compiled with Adam(clipnorm=0.5) and MSE loss; its
                         single output uses a ReLU activation.
      * model_features - same inputs; outputs every field's factor vector
                         followed by every field's bias.
    """
    n_fields = len(fsize)
    input_x = [Input(shape=(1,)) for _ in range(n_fields)]

    # One scalar bias and one k_latent-wide factor per field.
    biases = []
    for field_input, size in zip(input_x, fsize):
        biases.append(get_embed(field_input, size, 1))

    factors = []
    for field_input, size in zip(input_x, fsize):
        factors.append(get_embed(field_input, size, k_latent))

    # For each field: dot(its factor, sum of all other fields' factors).
    factor_sum = Add()(factors)
    others = [Subtract()([factor_sum, own]) for own in factors]
    interactions = [Dot(axes=1)([rest, own]) for rest, own in zip(others, factors)]

    merged = Concatenate()(biases + interactions)
    merged = BatchNormalization()(merged)
    output = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(merged)

    model = Model(inputs=input_x, outputs=[output])
    model.compile(optimizer=Adam(clipnorm=0.5), loss='mean_squared_error')

    # Second model sharing the same layers, exposing the learned vectors.
    model_features = Model(inputs=input_x, outputs=factors + biases)

    return model, model_features

In [None]:
# Load historical transactions from the directory PATH currently points at
# (PATH is reassigned several times in this notebook, so the source depends
# on which cell set it last) and parse purchase_date into datetimes.
df = pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))
df['purchase_date'] = pd.to_datetime(df['purchase_date'])

In [None]:
# Categorical fields fed to the factorization-machine model.
features = ['city_id', 'merchant_category_id', 'state_id', 'subsector_id']
# One past the largest id of each field; passed to get_embed as its x_size.
fsize = [int(df[name].max()) + 1 for name in features]

# Count transactions per field combination, fill combinations that are
# missing along the last field with 0, then take log1p of the counts.
X = np.log1p(
    df.groupby(features)['card_id']
    .count()
    .unstack()
    .fillna(0)
    .stack()
    .astype('float32')
).reset_index()
X.columns = features + ['num']

# Inputs: one row per field. Target: the log1p count as a single column.
X_train = np.array([X[name].values for name in features])
y_train = X[['num']].values.astype('float32')

In [None]:
# X_train = X_train.transpose((1, 0))

In [None]:
# Swap the two axes of y_train. y_train is built from X[['num']].values, i.e.
# one row per sample and a single column, so this turns it into a single row.
# NOTE(review): this cell is not idempotent — running it a second time swaps
# the axes back. A one-output model normally wants one target row per sample;
# confirm this transpose is intended before training.
y_train = y_train.transpose((1, 0))

In [None]:
X.nunique()

In [None]:
X_train.shape, y_train.shape

In [None]:
# Train the factorization-machine model on the log-count table X and attach
# the learned per-field factors and biases to X as new columns.

# Hyper-parameters; get_embed / build_model_1 read these as module globals.
k_latent = 1
embedding_reg = 0.0002
kernel_reg = 0.1

n_epochs = 1000
batch_size = 2 ** 17

# Fix: the original built the model twice and discarded the first instance.
# One build is enough. (Initial weights may therefore differ from a run of
# the original cell.)
model, model_features = build_model_1(X_train, fsize)
# patience=0: stop at the first epoch whose monitored loss does not improve.
earlystopper = EarlyStopping(patience=0, verbose=50)

# Fix: build_model_1 creates one Input per entry of fsize, so fit/predict need
# a list with one array per field. X_train is a single 2-D array with one row
# per field; iterating over it yields exactly those per-field arrays.
field_inputs = list(X_train)
# Fix: the single-unit output needs one target row per sample. reshape(-1, 1)
# gives that shape whether or not an earlier cell transposed y_train.
target = np.asarray(y_train).reshape(-1, 1)

# NOTE(review): validation_data is the training data itself, so early stopping
# reacts to the training loss rather than to held-out data.
history = model.fit(
    field_inputs, target,
    epochs=n_epochs, batch_size=batch_size, verbose=1, shuffle=True,
    validation_data=(field_inputs, target),
    callbacks=[earlystopper],
)

# model_features outputs every field's factor first, then every field's bias.
X_pred = model_features.predict(field_inputs, batch_size=batch_size)

factors = X_pred[:len(features)]

biases = X_pred[len(features):2*len(features)]

for f, X_p in zip(features, factors):
    for i in range(k_latent):
        X['%s_fm_factor_%d' % (f, i)] = X_p[:, i]

for f, X_p in zip(features, biases):
    X['%s_fm_bias' % (f)] = X_p[:, 0]

In [None]:
# Reload historical transactions, this time from the raw '../input' directory.
# This replaces the frame loaded earlier from '../remove_outlier_data',
# including the installments_exception column added to it.
historical_transactions = pd.read_csv('../input/historical_transactions.csv')

In [None]:
historical_transactions[historical_transactions.category_2.isna()].shape

In [None]:
historical_transactions.shape

In [None]:
historical_transactions[historical_transactions.category_3.isna()].shape

In [None]:
historical_transactions.head()

In [None]:
historical_transactions[historical_transactions.merchant_id.isna()].shape

In [None]:
historical_transactions[['category_1', 'category_2']].apply('max', axis=1)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import gc
import numpy as np
import pandas as pd

from tqdm import tqdm
from time import time, sleep
import datetime
from itertools import combinations
from multiprocessing import cpu_count, Pool

PATH = os.path.join('..', 'input')

In [None]:
# Reload train and test, parsing first_active_month as datetimes on read.
train, test = (
    pd.read_csv(csv_path, parse_dates=['first_active_month'])
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Outlier label: 1 where target < -30, otherwise 0.
train['outliers'] = (train['target'] < -30).astype('int64')

# test gets a target column filled with NaN.
test['target'] = np.nan

# Stack train on top of test. Row labels are not renumbered, so index values
# from the two frames repeat in df.
df = pd.concat([train, test], axis=0)

# Release the separate frames; only df is used from here on.
del train
del test
gc.collect()

df['first_active_month'] = pd.to_datetime(df['first_active_month'])

df['quarter'] = df['first_active_month'].dt.quarter
# Days between first_active_month and the fixed reference date 2018-04-30.
df['elapsed_time'] = (datetime.date(2018, 4, 30) - df['first_active_month'].dt.date).dt.days

# Card features scaled by the elapsed days ...
df['days_feature1'] = df['elapsed_time'].mul(df['feature_1'])
df['days_feature2'] = df['elapsed_time'].mul(df['feature_2'])
df['days_feature3'] = df['elapsed_time'].mul(df['feature_3'])

# ... and divided by them.
df['days_feature1_ratio'] = df['feature_1'].div(df['elapsed_time'])
df['days_feature2_ratio'] = df['feature_2'].div(df['elapsed_time'])
df['days_feature3_ratio'] = df['feature_3'].div(df['elapsed_time'])

In [None]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of `df`.

    df:              input frame; it is not modified.
    nan_as_category: when True, each encoded column also gets an indicator
                     column for missing values (pandas `dummy_na`).

    Returns `(encoded, added)`: the encoded frame and the list of column
    names that were not present in `df`.
    """
    before = set(df.columns)
    object_cols = [name for name in df.columns if df[name].dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_cols, dummy_na=nan_as_category)
    added = [name for name in encoded.columns if name not in before]
    return encoded, added

In [None]:
# df, cols = utils.one_hot_encoder(df, nan_as_category=False)

features = ['feature_1', 'feature_2', 'feature_3']

# Replace each feature value by the mean of `outliers` among rows sharing
# that value. The mean skips rows whose `outliers` is missing.
for f in features:
    order_label = df['outliers'].groupby(df[f]).mean()
    df[f] = df[f].map(order_label)

# Row-wise summaries over the three encoded features.
df['feature_sum'] = df['feature_1'].add(df['feature_2']).add(df['feature_3'])
df['feature_mean'] = df['feature_sum'].div(3)

t = df[features]
df['feature_max'] = t.max(axis='columns')
df['feature_min'] = t.min(axis='columns')
# NOTE(review): the column is named "var" but holds the row-wise standard
# deviation (std) — confirm which statistic is intended.
df['feature_var'] = t.std(axis='columns')

# train = df[df['target'].notnull()]
# test = df[df['target'].isnull()]
# del df
# gc.collect()

In [None]:
# Names of the object-dtype (string-like) columns of `df`, in column order.
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

In [None]:
categorical_columns

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Load the f102 feature table for inspection.
# NOTE: read_pickle executes whatever the file contains; only load pickles
# produced by this project.
f102 = pd.read_pickle('../remove_outlier_feature/f102.pkl')

In [None]:
f102.shape

In [None]:
f102.head()

In [None]:
# Load the f202 feature table for inspection.
# NOTE: read_pickle executes whatever the file contains; only load pickles
# produced by this project.
f202 = pd.read_pickle('../remove_outlier_feature/f202.pkl')

In [None]:
f202.shape

In [None]:
f202.head()

In [None]:
# From here on PATH points at the '../remove_outlier_data' directory.
PATH = os.path.join('..', 'remove_outlier_data')

train, test = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Feature pickles to attach: f102 and f202.
features = [f'f10{i}.pkl' for i in (2,)] + [f'f20{i}.pkl' for i in (2,)]

KEY = 'card_id'

# Left-join every feature table onto both frames so no train/test row is lost.
for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = train.merge(t, how='left', on=KEY)
    test = test.merge(t, how='left', on=KEY)

In [None]:
# Reload new-merchant transactions from '../remove_outlier_data', replacing
# the frame read earlier from the raw '../input' directory.
new_merchant_transactions = pd.read_csv('../remove_outlier_data/new_merchant_transactions.csv')

In [None]:
new_merchant_transactions.head()

In [None]:
# Count the rows whose `installments` value is missing.
# Fix: the original tested `x == np.nan`, which is always False because NaN
# never compares equal to anything (itself included), so it always reported 0.
# isna() is the missing-value test.
new_merchant_transactions['installments'].isna().sum()

In [None]:
# Reload historical transactions from '../remove_outlier_data', replacing the
# frame read earlier from the raw '../input' directory.
historical_transactions = pd.read_csv('../remove_outlier_data/historical_transactions.csv')

In [None]:
# Flag transactions whose `installments` value is missing (1 = missing, 0 = present).
# Fix: the original tested `x == np.nan`, which is always False because NaN
# never compares equal to anything, so every row was flagged 0. isna() is the
# missing-value test, and it is vectorized, so no per-row apply is needed.
historical_transactions['installments_exception'] = (
    historical_transactions['installments'].isna().astype(int)
)

In [None]:
historical_transactions.installments_exception.unique()

In [None]:
historical_transactions.category_3.unique()

In [None]:
historical_transactions.category_2.unique()

In [None]:
historical_transactions.category_1.unique()

In [None]:
# Independent two-column working copy for the dummy-encoding experiment.
# Fix: without .copy() the selection may be tied to historical_transactions,
# so the column assignment made on hist_temp later in the notebook triggers
# pandas' SettingWithCopyWarning and has an ambiguous effect on the parent.
hist_temp = historical_transactions[['category_2', 'category_3']].copy()

In [None]:
# Cast category_2 to integers.
# Fix: assigning a column in place on a frame that was sliced from another
# frame triggers SettingWithCopyWarning; assign() builds a new frame instead,
# so the result is unambiguous and the source frame is never touched.
# NOTE(review): astype(int) raises if category_2 still contains NaN — confirm
# the file loaded above has none.
hist_temp = hist_temp.assign(category_2=hist_temp['category_2'].astype(int))

In [None]:
hist_temp.head()

In [None]:
pd.get_dummies(hist_temp, columns=['category_2', 'category_3']).columns

In [None]:
# Load the f107 feature table for inspection.
# NOTE: read_pickle executes whatever the file contains; only load pickles
# produced by this project.
f107 = pd.read_pickle('../remove_outlier_feature/f107.pkl')

In [None]:
# Load the f205 feature table for inspection.
# NOTE: read_pickle executes whatever the file contains; only load pickles
# produced by this project.
f205 = pd.read_pickle('../remove_outlier_feature/f205.pkl')

In [None]:
f107.head()

In [None]:
f205.head()

In [None]:
f107.columns

In [None]:
f205.columns

In [None]:
f107.columns[1][4:]

In [None]:
# Reload train from the raw '../input' directory, discarding the merged
# feature columns added to the previous `train`. first_active_month is not
# parsed on read here; a later cell converts it with pd.to_datetime.
train = pd.read_csv('../input/train.csv')

In [None]:
train.head()

In [None]:
historical_transactions.head()

In [None]:
# Reload historical transactions from the raw '../input' directory, replacing
# the '../remove_outlier_data' version and the columns added to it.
historical_transactions = pd.read_csv('../input/historical_transactions.csv')

In [None]:
# Reload new-merchant transactions from the raw '../input' directory,
# replacing the '../remove_outlier_data' version.
new_merchant_transactions = pd.read_csv('../input/new_merchant_transactions.csv')

In [None]:
# Parse purchase_date into datetimes so min() and date arithmetic below
# operate on timestamps rather than on the raw CSV values.
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])

In [None]:
# Parse purchase_date into datetimes so min() and date arithmetic below
# operate on timestamps rather than on the raw CSV values.
new_merchant_transactions['purchase_date'] = pd.to_datetime(new_merchant_transactions['purchase_date'])

In [None]:
# Earliest historical purchase_date per card, as a (card_id, purchase_date) frame.
historical_transactions_min_date = (
    historical_transactions
    .groupby('card_id')['purchase_date']
    .agg('min')
    .reset_index()
)

In [None]:
# Earliest new-merchant purchase_date per card, as a (card_id, purchase_date) frame.
new_merchant_transactions_min_date = (
    new_merchant_transactions
    .groupby('card_id')['purchase_date']
    .agg('min')
    .reset_index()
)

In [None]:
historical_transactions_min_date.head()

In [None]:
new_merchant_transactions_min_date.head()

In [None]:
# Give the two first-purchase columns distinct names so both can be merged
# onto train without colliding.
historical_transactions_min_date = historical_transactions_min_date.rename(
    {'purchase_date': 'hist_purchase_date'}, axis='columns'
)
new_merchant_transactions_min_date = new_merchant_transactions_min_date.rename(
    {'purchase_date': 'new_purchase_date'}, axis='columns'
)

In [None]:
# Attach each card's first historical purchase date; cards without
# historical transactions keep a missing value (left join).
train = train.merge(historical_transactions_min_date, how='left', on='card_id')

In [None]:
# Attach each card's first new-merchant purchase date; cards without
# new-merchant transactions keep a missing value (left join).
train = train.merge(new_merchant_transactions_min_date, how='left', on='card_id')

In [None]:
# Whole days from first_active_month to the first purchase in each table.
# A negative value means the purchase is dated before first_active_month.
train['first_active_month'] = pd.to_datetime(train['first_active_month'])
train['hist_days'] = train['hist_purchase_date'].sub(train['first_active_month']).dt.days
train['new_days'] = train['new_purchase_date'].sub(train['first_active_month']).dt.days

In [None]:
train.head()

In [None]:
train.hist_days.min()

In [None]:
train.new_days.min()

In [None]:
# Keep only the cards whose first historical purchase precedes
# first_active_month, ordered from the most negative offset upwards.
# `train` is overwritten, so every later cell sees only this subset.
train = train[train['hist_days'] < 0].sort_values(by='hist_days')

In [None]:
# Distribution of hist_days over the rows currently in `train` (an earlier
# cell reduced train to rows with hist_days < 0).
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of
# histplot/displot — revisit when the environment's seaborn is upgraded.
sns.distplot(train['hist_days'])
plt.show()

In [None]:
# Distribution of new_days, skipping cards with no new-merchant purchase
# (their new_days is missing).
sns.distplot(train['new_days'].dropna())
plt.show()

In [None]:
historical_transactions.head()

In [None]:
# Indicator columns for the two special `installments` values seen in the
# data: -1 and 999 (1 = equals that value, 0 = otherwise).
# Fix: the original called np.where once per row through Series.apply, which
# runs a Python-level call per element and returns a 0-d array for each one.
# The vectorized comparisons give plain integer columns in a single pass.
historical_transactions['-1_installments'] = (
    historical_transactions['installments'].eq(-1).astype(int)
)
historical_transactions['999_installments'] = (
    historical_transactions['installments'].eq(999).astype(int)
)

In [None]:
historical_transactions['999_installments'].unique()

In [None]:
# Load the f102 feature table again to inspect its column names.
# NOTE: read_pickle executes whatever the file contains; only load pickles
# produced by this project.
f102 = pd.read_pickle('../remove_outlier_feature/f102.pkl')

In [None]:
f102.head()

In [None]:
[c for c in f102.columns if ('duration' in c) or ('amount_month_ratio' in c)]

In [None]:
# Read train/test from the '../remove_outlier_data' directory.
PATH = os.path.join('..', 'remove_outlier_data')

KEY = 'card_id'

train, test = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Feature pickles to attach: f102, f107, f108, then f202, f205, f206.
features = (
    [f'f10{i}.pkl' for i in (2, 7, 8)]
    + [f'f20{i}.pkl' for i in (2, 5, 6)]
)

# Left-join every feature table onto both frames so no train/test row is lost.
for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = train.merge(t, how='left', on=KEY)
    test = test.merge(t, how='left', on=KEY)

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.nunique()

In [3]:
# Load historical transactions from the '../remove_outlier_data' directory
# for the count-vs-size check in the cells below.
PATH = os.path.join('..', 'remove_outlier_data')

historical_transactions = pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))

In [5]:
# Per card, compute both 'count' (non-missing values) and 'size' (all rows)
# of card_id so the two can be compared below.
# The result has two-level column labels and REPLACES the raw transaction
# frame under the same name: re-running this cell without reloading the CSV
# operates on the aggregate, not on the transactions.
historical_transactions = historical_transactions.groupby('card_id').agg({'card_id': ['count', 'size']})

In [7]:
# Move the card_id group key from the index into a column. Because the
# columns are two-level, the new column's label is ('card_id', '').
historical_transactions = historical_transactions.reset_index()

In [8]:
# Flatten the two-level column labels into 'level0_level1' strings.
# Fix: the key column's second level is the empty string, so the original
# f'{c[0]}_{c[1]}' renamed it to 'card_id_' (trailing underscore), which no
# longer matches the 'card_id' join key used everywhere else in this notebook.
# Joining only the non-empty parts keeps it as 'card_id'; the other columns
# are unchanged ('card_id_count', 'card_id_size').
historical_transactions.columns = [
    '_'.join(part for part in c if part) for c in historical_transactions.columns
]

In [9]:
historical_transactions.head()

Unnamed: 0,card_id_,card_id_count,card_id_size
0,C_ID_00007093c1,149,149
1,C_ID_0001238066,123,123
2,C_ID_0001506ef0,66,66
3,C_ID_0001793786,216,216
4,C_ID_000183fdda,144,144


In [10]:
# Number of cards for which 'count' and 'size' disagree.
(historical_transactions['card_id_count'] != historical_transactions['card_id_size']).sum()

0