In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

from sklearn.preprocessing import LabelEncoder, StandardScaler

import gc

from model.helper import load_data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        output = pd.DataFrame()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(X[col])
        else:
            for colname,col in X.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [4]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

In [5]:
class NanInputter(BaseEstimator, TransformerMixin):
    def __init__(self, nan_value):
        self.nan_value = nan_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.fillna(self.nan_value)

In [6]:
class Log(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.log10(X).values.reshape(-1,1)

In [7]:
class DayPeriodic(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        hour = pd.to_datetime(X, unit='s').dt.hour
        sin_hour = np.sin(2*np.pi*hour/24)
        cos_hour = np.cos(2*np.pi*hour/24)
        
        return pd.concat([sin_hour, cos_hour], axis=1)

In [41]:
class DayHour(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        hour = pd.to_datetime(X, unit='s').dt.hour.values
        return hour.reshape(-1,1)

In [9]:
train = pq.read_table('data/train.parquet').to_pandas()
test = pq.read_table('data/test.parquet').to_pandas()

# train = train[]
# test = test[:20000]

gc.collect()

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


2666

In [81]:
features_black_list = ['TransactionID', 'isFraud']

features = train.columns
types = train.dtypes
types = types[~features.isin(features_black_list)]
features = features[~features.isin(features_black_list)]


categorical = features[types =='object']
numerical = features[~(types =='object')]

This features will be processed in a special way and will not be included in automated inputer

In [82]:
special_features = ['TransactionAmt', 'TransactionDT']
categorical = categorical[~categorical.isin(special_features)]
numerical = numerical[~numerical.isin(special_features)]

In [83]:
# wrongly_numerical = pd.Index(['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'])
# numerical = numerical[~numerical.isin(wrongly_numerical)]
# categorical = categorical.union(wrongly_numerical)

In [84]:
categorical

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')

In [85]:
numerical

Index(['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2',
       'C1', 'C2',
       ...
       'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25',
       'id_26', 'id_32'],
      dtype='object', length=399)

In [86]:
combined_features = FeatureUnion(
    [
        ('AmountLog', make_pipeline(ItemSelector('TransactionAmt'), Log())),
        ('DayPeriodic', make_pipeline(ItemSelector('TransactionDT'), DayPeriodic())),
        ('CategoricalLabeling', make_pipeline(
            ItemSelector(categorical), 
            NanInputter('None'),
            MultiColumnLabelEncoder())),
        ('NumericalInputter', make_pipeline(
            ItemSelector(numerical),
            NanInputter(-1000)
            ))
    ])

In [87]:
preprocess = make_pipeline(combined_features)

In [88]:
lgbm_params = {
    'task': 'train',
    'metric': 'auc',
    'is_unbalance': True,
    'objective': 'binary',
    'max_bin': 63,
    'num_leaves': 255,
    'learning_rate': 0.01,
}

In [89]:
estimator = make_pipeline(preprocess, lgb.LGBMClassifier(**lgbm_params))

In [90]:
cross_val_score(estimator, train, train.isFraud, cv = TimeSeriesSplit(n_splits=4))

array([0.96546381, 0.96625123, 0.96392285, 0.94783588])

In [28]:
array([0.96647137, 0.96277983, 0.96139974, 0.93686287])

NameError: name 'array' is not defined

In [120]:
estimator.fit(train, train.isFraud)

Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('AmountLog',
                                                                  Pipeline(memory=None,
                                                                           steps=[('itemselector',
                                                                                   ItemSelector(key='TransactionAmt')),
                                                                                  ('log',
                                                                                   Log())],
                                                                           verbose=False)),
                                                                 ('DayPeriodic',
                                                           

In [123]:
sample_submission = pq.read_table('data/sub.parquet').to_pandas()
sample_submission['isFraud'] = estimator.predict_proba(test)[:,1]

In [124]:
sample_submission.to_csv('result.csv', index=False)