# Set up cuDF (RAPIDS on Colab)

In [None]:
%%capture
# Fetch the RAPIDS-Colab helper scripts and check that this Colab GPU is supported.  Run only this cell and the next one for now.
# NOTE: `%%capture` suppresses this cell's output; remove it temporarily if you need to read the env-check result.
# If your Colab instance is not RAPIDS compatible, env-check will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

# Update the Colab environment and restart the kernel.  Don't run the next cell until you see the session crash.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)  # terminate the kernel process on purpose so the session restarts

In [None]:
# Install condacolab.  This restarts the kernel one last time: run this cell by itself
# and only run the next cell once you see the session crash.
import os
import condacolab
condacolab.install()
os._exit(00)  # terminate the kernel process on purpose so the session restarts

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...


In [1]:
%%capture
# You can now run the rest of the cells as normal.
import condacolab
condacolab.check()
# Install RAPIDS with 'python rapidsai-csp-utils/colab/install_rapids.py <release> <packages>'.
# <release> is 'stable' or 'nightly'; leaving it blank or passing any other word defaults to stable.
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
# Point the environment at the CUDA NVVM library, the libdevice directory and the conda prefix.
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'
# cffi is pinned to an exact version.  NOTE(review): the reason for the pin is not recorded
# here -- presumably a compatibility workaround; confirm before bumping it.
!pip install cffi==1.15.1

# Libraries

In [2]:
import pandas as pd; pd.set_option('mode.chained_assignment',  None)
import numpy as np
import cudf

from sklearn.preprocessing import QuantileTransformer

from tqdm.notebook import tqdm; tqdm.pandas()
from glob import glob
import gc; gc.enable()

from google.colab import drive, output
output.enable_custom_widget_manager()
import json, os

# Mount Google Drive under ROOT, then point DATA_PATH at the competition data folder inside it.
ROOT = "/content/drive"
drive.mount(ROOT)
DATA_PATH = f"{ROOT}/MyDrive/Kaggle/AMEX_DEFAULT/data/"

Mounted at /content/drive


In [3]:
# glob returns matches in arbitrary (filesystem-dependent) order.  Sort them so that
# trainFiles[0] and the row order of the aggregated output are the same on every run.
trainFiles = sorted(glob(f'{DATA_PATH}/raw_interpolated/train*'))
testFiles = sorted(glob(f'{DATA_PATH}/raw_interpolated/test*'))

In [4]:
# Columns handled as categoricals; every other column except the id, date and target
# columns is treated as continuous.
featCat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
excluded = featCat + ['customer_ID', 'target', 'S_2']

# Only the column names are needed, so read one training file and drop it straight away.
sampleFile = pd.read_parquet(trainFiles[0])
featCont = [col for col in sampleFile.columns if col not in excluded]

del sampleFile, excluded
gc.collect()

0

# Preprocess

## Read File 

In [5]:
def getFile(filePath, continuous, categoricals):
    """Read one parquet file with pandas, clean it up and return it as a cuDF frame.

    Parameters
    ----------
    filePath : str
        Path of the parquet file to read.
    continuous : list of str
        Columns that are rounded to 2 decimals.
    categoricals : list of str
        Not used by this function; kept so the signature stays unchanged.

    Returns
    -------
    cudf.DataFrame
        The file's contents with two extra integer columns, 'year' and 'month',
        taken from the first and second '-'-separated fields of 'S_2'.

    Notes
    -----
    Assumes 'S_2' holds '-'-separated date strings (e.g. 'YYYY-MM-DD') -- TODO confirm
    against the raw_interpolated files.
    """
    F = pd.read_parquet(filePath)

    for c in continuous:
        F[c] = F[c].round(2)

    # Split the date strings once with the vectorised string accessor instead of
    # running a Python lambda over every row twice.
    date_parts = F['S_2'].str.split('-', expand=True)
    F['year'] = date_parts[0].astype('int64')
    F['month'] = date_parts[1].astype('int64')
    del date_parts

    return cudf.DataFrame(F)

## Aggregation

In [6]:
def preprocess(df, continuous, categoricals):
    """Collapse `df` to one row per 'customer_ID' with summary statistics.

    Continuous columns get first/last/min/max/sum/mean/std (cast to float32);
    categorical columns get first/last/nunique (cast to int8).  The two tables are
    left-joined on 'customer_ID' and every output column is named
    '<source column>_<statistic>'.
    """

    def _flatten(stats):
        # Turn the two-level (column, statistic) labels into 'column_statistic'
        # and move 'customer_ID' out of the index into a regular column.
        stats.columns = ['_'.join(pair) for pair in stats.columns]
        return stats.reset_index()

    numeric_stats = ['first', 'last', 'min', 'max', 'sum', 'mean', 'std']
    numeric_part = _flatten(
        df.groupby('customer_ID')[continuous].agg(numeric_stats).astype('float32')
    )

    categorical_stats = ['first', 'last', 'nunique']
    categorical_part = _flatten(
        df.groupby('customer_ID')[categoricals].agg(categorical_stats).astype('int8')
    )

    merged = cudf.merge(left=numeric_part, right=categorical_part, how='left', on='customer_ID')

    # Release the intermediate tables before handing the result back.
    del numeric_part, categorical_part
    gc.collect()

    return merged

## Drive Function

In [7]:
def driveFunction(filePaths, continuous, categoricals):
    """Load and aggregate every file in `filePaths`, returning one combined frame.

    Each path is read with `getFile`, reduced with `preprocess`, and the per-file
    results are concatenated row-wise.

    Parameters
    ----------
    filePaths : iterable of str
        Files to process.  The caller's list is not modified.
    continuous, categoricals : list of str
        Passed through to `getFile` and `preprocess`.

    Returns
    -------
    The concatenation of the per-file aggregates, with later files first (the row
    order this function has always produced).  A single file's aggregate is
    returned as is.

    Raises
    ------
    ValueError
        If `filePaths` is empty.
    """
    # Work on a copy so the caller's list survives and the cell can be re-run.
    paths = list(filePaths)
    if not paths:
        raise ValueError("driveFunction needs at least one file path")

    parts = []
    for path in paths:
        tmpFile = getFile(path, continuous, categoricals)
        parts.append(preprocess(tmpFile, continuous, categoricals))
        del tmpFile; gc.collect()

    if len(parts) == 1:
        return parts[0]

    # One concatenation at the end instead of re-copying the accumulated frame on
    # every iteration; reversed so later files come first.
    Agg = cudf.concat(parts[::-1], axis=0)
    del parts; gc.collect()
    return Agg

# Run

In [8]:
# Aggregate the training files, copy the result to a pandas frame and persist it.
train = driveFunction(trainFiles, featCont, featCat).to_pandas()
train.to_parquet(f'{DATA_PATH}/agg/train.parquet')
del train
gc.collect()

# Same for the test files.
test = driveFunction(testFiles, featCont, featCat).to_pandas()
test.to_parquet(f'{DATA_PATH}/agg/test.parquet')
del test
gc.collect()

0

## Conditional Probability
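* For each categorical feature: the train-set relative frequency of the customer's first value multiplied by the train-set relative frequency of their last value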



In [9]:
# Reload the per-customer aggregates from disk as pandas frames.
train, test = (
    pd.read_parquet(f'{DATA_PATH}/agg/{split}.parquet') for split in ('train', 'test')
)

In [10]:
def getCondProb(train, test, categoricals):
    """Add a '<feature>_COND_PROB' column to `train` and `test` for each categorical.

    For every base feature name (the first two '_'-separated tokens of each entry in
    `categoricals`), the relative frequencies of '<feature>_first' and
    '<feature>_last' are measured on `train` (rounded to 5 decimals).  The new column
    is the product of the two frequencies looked up for each row, stored as float32.
    Values that never occur in `train` map to NaN.

    Both frames are modified in place and also returned as `(train, test)`.
    """
    # De-duplicate while keeping first-seen order: iterating a set of strings would
    # make the order of the added columns vary from one session to the next.
    bases = list(dict.fromkeys('_'.join(f.split('_')[:2]) for f in categoricals))

    for c in tqdm(bases):

        first_probs = train[f"{c}_first"].value_counts(normalize=True).round(5).astype('float32').to_dict()
        last_probs = train[f"{c}_last"].value_counts(normalize=True).round(5).astype('float32').to_dict()

        # Frequencies always come from train; they are applied to both frames.
        for df in [train, test]:
            df[f"{c}_COND_PROB"] = \
            (df[f"{c}_first"].map(first_probs) * df[f"{c}_last"].map(last_probs)).astype('float32')

        del first_probs, last_probs
        gc.collect()

    return train, test

In [11]:
# Add one '<feature>_COND_PROB' column per categorical feature to both frames.
train, test = getCondProb(train, test, featCat)

  0%|          | 0/11 [00:00<?, ?it/s]

# Last - Mean Gap

In [12]:
def getMeanLast(df, continuous):
    """Add a '<feature>_mean_last' column (last minus mean) for each base feature.

    The base feature names are the first two '_'-separated tokens of each entry in
    `continuous`; for each one, `df` must contain '<feature>_last' and
    '<feature>_mean'.  The new columns are float32.

    `df` is modified in place and also returned.
    """
    # De-duplicate while keeping first-seen order: iterating a set of strings would
    # make the order of the added columns vary from one session to the next.
    bases = list(dict.fromkeys('_'.join(f.split('_')[:2]) for f in continuous))

    # Columns are inserted one at a time to keep the in-place contract.
    # NOTE(review): with many features pandas may warn about a fragmented frame.
    for c in tqdm(bases):
        df[f"{c}_mean_last"] = (df[f"{c}_last"] - df[f"{c}_mean"]).astype('float32')

    # A single collection after the loop instead of one per column.
    gc.collect()

    return df

In [13]:
# Every aggregated column whose base name (first two '_'-separated tokens) is a continuous feature.
DerCont = [
    col for col in train.columns
    if '_'.join(col.split('_')[:2]) in featCont
]

# Add the last-minus-mean gap columns to train first, then to test.
train, test = (getMeanLast(frame, DerCont) for frame in (train, test))

  0%|          | 0/177 [00:00<?, ?it/s]

  """


  0%|          | 0/177 [00:00<?, ?it/s]

In [14]:
# QuantileTransformer draws a random row subsample when fitting on large inputs, so
# pin random_state (an arbitrary fixed seed) to make the scaled features reproducible.
# The transformer is fitted on train only and then applied unchanged to test.
scaler = QuantileTransformer(random_state=42)
train[DerCont] = scaler.fit_transform(train[DerCont])
test[DerCont] = scaler.transform(test[DerCont])

# Add target

In [15]:
# One label row per customer (the last row is kept if a customer appears more than once).
label = pd.read_csv(f'{DATA_PATH}/train_labels.csv')
label = label.groupby('customer_ID').last().reset_index()

# Merge the labels onto the in-memory `train`.  Re-reading agg/train.parquet here would
# throw away the COND_PROB, mean_last and quantile-scaled columns computed above, leaving
# the saved train set with different features from the saved test set.
train = pd.merge(left=train, right=label, on='customer_ID', how='left'); gc.collect()

19

# Save

In [16]:
# Persist the final feature tables, overwriting the aggregate files written earlier.
for split_name, split_frame in (('train', train), ('test', test)):
    split_frame.to_parquet(f'{DATA_PATH}/agg/{split_name}.parquet')
del split_name, split_frame