In [1]:
# Reload edited modules automatically before each cell runs (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, at retina resolution.
%config InlineBackend.figure_format='retina'
%matplotlib inline

In [2]:
import pandas as pd
from pathlib import Path
from fastai.tabular import * 
from sklearn.metrics import roc_auc_score, roc_curve
from datetime import datetime

In [3]:
# Directory that train.csv and test.csv are read from in the cells below.
datapath = Path('data')

In [4]:
# Batch size passed as `bs` to the TabularDataBunch constructors below.
BATCH_SIZE = 256
# Seed NumPy's global RNG (np.random.shuffle in augment_rows draws from it).
# NOTE(review): this does not seed PyTorch — confirm whether training runs need to be reproducible.
np.random.seed(123)

### Create ROC metric class

In [5]:
class roc(Callback):
    '''
    ROC AUC metric callback for fastai.

    Computes the ROC AUC score on each batch and reports the mean of those
    scores at the end of the epoch. This is a batch-wise average, not the AUC
    computed over all samples of the epoch. Batches that do not contain both
    classes are skipped because the score is undefined for them.
    TODO: rolling average
    '''
    def on_epoch_begin(self, **kwargs):
        # Reset the running sum of batch scores and the number of scored batches.
        self.total = 0
        self.batch_count = 0

    def on_batch_end(self, last_output, last_target, **kwargs):
        # Convert the raw model outputs into per-class probabilities.
        preds = F.softmax(last_output, dim=1)
        # roc_auc_score raises ValueError on a batch that does not contain both
        # classes; skip such batches but let any other error propagate instead
        # of silently hiding it.
        try:
            roc_score = roc_auc_score(to_np(last_target), to_np(preds[:,1]))
        except ValueError:
            return
        self.total += roc_score
        self.batch_count += 1

    def on_epoch_end(self, num_batch, **kwargs):
        # When no batch could be scored (e.g. an empty validation set) report
        # NaN instead of failing with a ZeroDivisionError.
        if self.batch_count == 0:
            self.metric = float('nan')
        else:
            self.metric = self.total/self.batch_count

### Read Training Data

In [None]:
# Load the training set (ID_code, target and the var_* feature columns) and preview it.
df = pd.read_csv(datapath / 'train.csv')
df.head()

In [None]:
# Row positions of the last 20,000 training rows; passed as valid_idx to the DataBunch below.
valid_idx = range(len(df)-20_000, len(df))
valid_idx

### Read Test Data

In [None]:
# Load the test set with ID_code as the index, so only feature columns remain
# as columns and the id is written out with the submission later.
test_df = pd.read_csv(datapath / 'test.csv').set_index('ID_code')
test_df.head()

### Create DataBunch

In [None]:
# Build the DataBunch from the training frame without the ID_code column:
# 'target' is the dependent variable, the rows in valid_idx form the
# validation set, and test_df is attached as the test set.
data = TabularDataBunch.from_df(
    path = '.',
    df = df.drop(columns='ID_code'),
    dep_var = 'target',
    valid_idx = valid_idx,
    test_df = test_df, bs = BATCH_SIZE)

### Create and fit Model

In [None]:
# Tabular model with hidden layers of 200 and 100 units and ps=.4 dropout,
# reporting the batch-averaged ROC AUC from the roc callback as its metric.
learner = tabular_learner(data, layers=[200,100], metrics=[roc()], ps=.4)

In [None]:
# One initial epoch with the one-cycle schedule at the default learning rate.
learner.fit_one_cycle(1)

In [None]:
# Run the learning-rate finder and plot its recorded results to pick the rate for the next fit.
learner.lr_find()
learner.recorder.plot()

In [None]:
# Train for 10 more epochs with a maximum learning rate of 1e-4.
learner.fit_one_cycle(10, 1e-4)

In [None]:
# Predictions on get_preds' default dataset; prediction[0] is compared against
# the valid_idx rows below, so it is presumably the validation set — confirm.
prediction = learner.get_preds()

In [None]:
# ROC AUC over the held-out rows: true labels come from the training frame,
# scores are column 1 of the predicted per-class outputs.
holdout_labels = df.loc[valid_idx]['target'].values
holdout_scores = prediction[0].numpy()[:,1]
valid_auc_score = roc_auc_score(holdout_labels, holdout_scores)
valid_auc_score

In [None]:
# ROC curve for the held-out rows, drawn on an explicit 9x9 figure/axes pair.
curve_labels = df.loc[valid_idx]['target'].values
curve_scores = prediction[0].numpy()[:,1]
fpr, tpr, _ = roc_curve(curve_labels, curve_scores)
fig, ax = plt.subplots(figsize=(9,9))
ax.plot(fpr, tpr)
plt.show()

In [None]:
# Log-odds of column 1 versus column 0 of the predictions: log(p1 / p0).
# NOTE(review): evaluates to +/-inf when either probability is exactly 0.
log_odds = np.log(prediction[0].numpy()[:,1] / prediction[0].numpy()[:,0])

In [None]:
# Histogram of the log-odds in 50 bins; assigning to _ keeps the return value out of the cell output.
_ = plt.hist(log_odds, 50)

### Test and submit

In [None]:
# Predict on the test set; the second element of the returned pair is discarded.
test_predicts, _ = learner.get_preds(ds_type=DatasetType.Test)

In [None]:
# Display the shape of the test predictions as a sanity check.
test_predicts.shape

In [None]:
# Store column 1 of the predictions as the 'target' column; this modifies test_df in place.
test_df['target'] = test_predicts.numpy()[:,1]

In [None]:
# Timestamp (YYYYMMDD-HHMM, local time) embedded in the submission file name.
now = datetime.now()
model_time = now.strftime("%Y%m%d-%H%M")

In [None]:
# Write only the index (ID_code) and 'target'; the file name carries the timestamp and the validation AUC.
test_df[['target']].to_csv(f'submission_fastai_ensemble_{model_time}_{valid_auc_score}.csv')

### Data augmentation

In [None]:
# Re-read both frames from disk; the earlier test_df had a 'target' column added to it in place.
df = pd.read_csv(datapath / 'train.csv')
test_df = pd.read_csv(datapath / 'test.csv').set_index('ID_code')

In [None]:
def _shuffle_within_columns(rows):
    """Return a copy of `rows` in which every column is permuted independently."""
    shuffled = rows.copy()
    ids = np.arange(shuffled.shape[0])
    for c in range(shuffled.shape[1]):
        # `ids` is reshuffled in place for every column, so each column gets
        # a different permutation drawn from NumPy's global RNG.
        np.random.shuffle(ids)
        shuffled[:,c] = shuffled[:,c][ids]
    return shuffled


def augment_rows(x,y,t=2):
    """Append synthetic rows built by shuffling feature values within a class.

    For each class, a synthetic copy is made by permuting every feature column
    independently among the rows of that class, so each synthetic row mixes
    values taken from different real rows of the same class.

    Parameters
    ----------
    x : 2-D numpy array of features, one row per sample.
    y : 1-D numpy array of 0/1 labels aligned with the rows of `x`.
    t : number of shuffled copies of the positive (y == 1) rows to add;
        `t // 2` shuffled copies of the negative (y == 0) rows are added.

    Returns
    -------
    (x_aug, y_aug) : the original rows, followed by the positive copies
        (label 1) and then the negative copies (label 0).

    Values of `t` below 2 produce no negative copies (and `t == 0` no positive
    copies either); such empty groups are simply left out rather than being
    passed to np.vstack, which rejects an empty list.
    """
    positives = [_shuffle_within_columns(x[y==1]) for _ in range(t)]
    negatives = [_shuffle_within_columns(x[y==0]) for _ in range(t//2)]

    ys = np.ones(sum(p.shape[0] for p in positives))
    yn = np.zeros(sum(n.shape[0] for n in negatives))
    # Stacking the per-copy blocks directly gives the same row order as
    # stacking each group first, and also works when a group is empty.
    x = np.vstack([x] + positives + negatives)
    y = np.concatenate([y,ys,yn])
    return x,y

In [None]:
# Augment the feature matrix (every column except ID_code and target) and its labels with the default t=2.
x_augmented, y_augmented = augment_rows(df.drop(columns=['ID_code', 'target']).to_numpy(), df['target'].to_numpy())
x_augmented.shape

In [None]:
# Wrap the augmented matrix in a DataFrame that reuses the training feature
# names (all columns after the first two), then attach the labels as integers.
feature_names = df.columns[2:]
augmented_df = pd.DataFrame(x_augmented, columns=feature_names)
augmented_df['target'] = y_augmented.astype(int)
augmented_df.head()

In [None]:
# Feature column names: everything after the first two columns of the training frame.
# assumes the first two columns are ID_code and target, as in the displayed head of train.csv
columns = df.columns[2:]

def augment_columns(df, features=None):
    """Add squared and cubed versions of feature columns to `df` in place.

    For every feature name `f`, the columns `sq_f` (f ** 2) and `cube_f`
    (f ** 3) are added to `df`, or overwritten if they already exist.

    Parameters
    ----------
    df : DataFrame that contains every column named in `features`; it is
        modified in place.
    features : iterable of column names to expand. Defaults to the
        module-level `columns`, which keeps existing one-argument calls working.

    Returns
    -------
    None
    """
    if features is None:
        # Fall back to the notebook-level list of feature names.
        features = columns
    for feature in features:
        df[f'sq_{feature}'] = df[feature]**2
        df[f'cube_{feature}'] = df[feature]**3

In [None]:
# Add the sq_/cube_ feature columns to the augmented training frame (in place).
augment_columns(augmented_df)

In [None]:
# Add the same sq_/cube_ feature columns to the test frame (in place) so it has the same features as the training frame.
augment_columns(test_df)

In [None]:
# Preview the augmented training frame with its new columns.
augmented_df.head()

In [None]:
# Preview the test frame with its new columns.
test_df.head()

In [None]:
# Row positions of the last 40,000 rows of the original training frame.
# NOTE(review): valid_idx is reassigned to range(0) before the DataBunch is built, so this value is not used there.
valid_idx = range(len(df)-40_000, len(df))
valid_idx

In [None]:
# Display the (rows, columns) of the augmented training frame.
augmented_df.shape

In [None]:
# Empty validation index: no rows are held out from augmented_df.
valid_idx = range(0)
valid_idx

In [None]:
# DataBunch over the augmented training frame: 'target' is the dependent
# variable, valid_idx selects the validation rows, and test_df is attached
# as the test set.
augmented_data = TabularDataBunch.from_df(
    path = '.',
    df = augmented_df,
    dep_var = 'target',
    valid_idx = valid_idx,
    test_df = test_df,
    bs = BATCH_SIZE)

### Create and fit Model

In [None]:
# Tabular model on the augmented data: hidden layers of 200 and 100 units, ps=.5 dropout,
# reporting accuracy alongside the batch-averaged ROC AUC from the roc callback.
learner = tabular_learner(augmented_data, layers=[200,100], metrics=[accuracy, roc()], ps=.5)

In [None]:
# One initial epoch with the one-cycle schedule at the default learning rate.
learner.fit_one_cycle(1)

In [None]:
# Run the learning-rate finder and plot its recorded results to pick the rate for the next fit.
learner.lr_find()
learner.recorder.plot()

In [None]:
# Train for 20 more epochs with a maximum learning rate of 3e-5.
learner.fit_one_cycle(20, 3e-5)

In [None]:
# Predictions on get_preds' default dataset.
# NOTE(review): valid_idx is range(0) for this DataBunch, so that dataset is presumably empty — confirm.
prediction = learner.get_preds()

In [None]:
# Save the learner under the name 'augmented_rows_full_train'; the "Load Model and predict" section loads it by this name.
learner.save('augmented_rows_full_train')

In [None]:
# valid_idx is empty when the model is trained on every row (range(0)), and
# roc_auc_score cannot score zero samples. Record NaN in that case instead of
# raising, so the submission file name below does not silently reuse the AUC
# of an earlier model.
if len(valid_idx) > 0:
    valid_auc_score = roc_auc_score(df.loc[valid_idx]['target'].values, prediction[0].numpy()[:,1])
else:
    valid_auc_score = float('nan')
valid_auc_score

### Test and submit

In [None]:
# Predict on the test set; the second element of the returned pair is discarded.
test_predicts, _ = learner.get_preds(ds_type=DatasetType.Test)

In [None]:
# Display the shape of the test predictions as a sanity check.
test_predicts.shape

In [None]:
# Store column 1 of the predictions as the 'target' column; this modifies test_df in place.
test_df['target'] = test_predicts.numpy()[:,1]

In [None]:
# Timestamp (YYYYMMDD-HHMM, local time) embedded in the submission file name.
now = datetime.now()
model_time = now.strftime("%Y%m%d-%H%M")

In [None]:
# Build the submission file name from the timestamp and validation AUC, then
# write only the index (ID_code) and 'target' to it.
csv_filename = f'submission_fastai_{model_time}_{valid_auc_score}.csv'
test_df[['target']].to_csv(csv_filename)

In [None]:
# Submit the CSV with the Kaggle command-line tool; {csv_filename} is filled in from the Python variable.
! kaggle competitions submit -f {csv_filename} -m "New score" santander-customer-transaction-prediction

In [None]:
# Display the name of the submission file.
csv_filename

## Load Model and predict

In [14]:
# Reload the raw training data for the prediction-only workflow and preview it.
df = pd.read_csv(datapath / 'train.csv')
df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [15]:
# DataBunch with no validation rows whose test set is the training features
# themselves (ID_code and target dropped), so test predictions below are
# predictions for the training rows.
# NOTE(review): no `bs` is passed here, unlike the DataBunch cells above — confirm the default is intended.
valid_idx = range(0)
data = TabularDataBunch.from_df(
    path = '.',
    df = df.drop(columns='ID_code'),
    dep_var = 'target',
    valid_idx = valid_idx,
    test_df = df.drop(columns=['ID_code','target']))



In [19]:
# Recreate a learner with the same layer sizes and dropout as the model trained on the augmented data.
# NOTE(review): this DataBunch has no sq_/cube_ columns, unlike augmented_data — confirm the saved weights match this input width.
learner = tabular_learner(data, layers=[200,100], ps=.5)

In [20]:
# Load the model saved under the name 'augmented_rows_full_train' into this learner.
learner.load('augmented_rows_full_train')

Learner(data=TabularDataBunch;

Train: LabelList (200000 items)
x: TabularList
var_184 25.8398; var_119 7.2739; var_198 12.7803; var_171 -8.4254; var_2 11.9081; var_28 4.9306; var_8 -4.9200; var_36 0.6764; var_41 7.3746; var_100 9.4763; var_18 4.2840; var_152 9.0056; var_22 2.5791; var_11 3.1821; var_57 5.3822; var_142 19.0614; var_175 7.7423; var_54 -0.7474; var_113 -0.4851; var_104 14.7100; var_83 2.9423; var_138 2.8279; var_6 5.1187; var_94 14.8421; var_64 8.6748; var_192 3.1364; var_5 -9.2834; var_195 -2.3978; var_121 9.7268; var_130 12.8287; var_45 -7.0170; var_1 -6.7863; var_183 14.3691; var_35 10.5785; var_13 0.5745; var_59 10.1166; var_134 5.1726; var_165 12.6644; var_7 18.6266; var_167 -0.6975; var_98 2.1743; var_37 7.8871; var_109 24.3627; var_140 8.3307; var_38 4.6667; var_30 -0.3085; var_125 12.0039; var_29 5.9965; var_173 3.1531; var_149 17.9244; var_32 -3.8766; var_0 8.9255; var_55 14.8322; var_156 12.1749; var_82 1.3675; var_93 9.9913; var_123 0.7754; var_177 13.7241; va

In [22]:
# Predict on the DataBunch's test set, which in this section holds the training features; the second element of the pair is discarded.
test_predicts, _ = learner.get_preds(ds_type=DatasetType.Test)

In [23]:
# Display the shape of the predictions as a sanity check.
test_predicts.shape

torch.Size([200000, 2])

In [None]:
class FastaiTabularClf:
    def __init__(self):
        