# Libraries

In [1]:
%%capture
!pip install kaggle

In [2]:
import pandas as pd
pd.set_option('mode.chained_assignment',  None)

from tqdm.notebook import tqdm; tqdm.pandas()
from glob import glob
import gc; gc.enable()

from google.colab import drive, output
output.enable_custom_widget_manager()
import json, os

ROOT = "/content/drive"     
drive.mount(ROOT)     
DATA_PATH  = ROOT + '/MyDrive/Kaggle/AMEX_DEFAULT/data'
_api_key_path = ROOT + "/MyDrive/Kaggle/kaggle.json"

with open(_api_key_path) as f:
    api = json.load(f)
    os.environ["KAGGLE_USERNAME"] = api["username"]
    os.environ["KAGGLE_KEY"] = api["key"]

Mounted at /content/drive


In [None]:
!kaggle datasets download -d odins0n/amex-parquet
!unzip -o /content/amex-parquet.zip
!rm -rf /content/amex-parquet.zip

Downloading amex-parquet.zip to /content
100% 8.64G/8.65G [03:19<00:00, 84.9MB/s]
100% 8.65G/8.65G [03:19<00:00, 46.6MB/s]
Archive:  /content/amex-parquet.zip
  inflating: test_data.parquet       
  inflating: train_data.parquet      


# Merge Train Label

In [None]:
train = pd.read_parquet('train_data.parquet').drop('target', axis=1); gc.collect()

train_label = pd.read_csv(f'{DATA_PATH}/train_labels.csv')
train_label = train_label[train_label['target'] == 1]
label_true = set(train_label['customer_ID'].values.tolist()) # set is much faster

LABEL = []
grpby = train.groupby('customer_ID')['P_2'] # choose random column
for idx, g in tqdm(grpby, total=len(train['customer_ID'].unique())):
    if idx in label_true:
        OUT = [0] * (len(g) - 1) + [1]
    else:
        OUT = [0] * len(g)
    
    LABEL.extend(OUT)

train['target'] = LABEL
train['target'] = train['target'].astype('int8')
del train_label, label_true, LABEL, grpby, idx, g, OUT; gc.collect()

  0%|          | 0/458913 [00:00<?, ?it/s]

18

## Set Features

In [None]:
featCat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
featCont = [x for x in train.columns if x not in featCat + ['customer_ID', 'target', 'S_2']]
featAll = featCont + featCat

# Split Files + Label Encoding

## Define Function

In [None]:
def uncategorize(col):
    if col.dtype.name == 'category':
        try:
            return col.astype(col.cat.categories.dtype)
        except:
            # In case there is pd.NA (pandas >= 1.0), Int64 should be used instead of int64
            return col.astype(col.cat.categories.dtype.name.title())           
    else:
        return col

In [None]:
def divideData(df, continuous, label_dict, n_chunks=6, fileType=None):
    ids = df['customer_ID'].unique()
    len_ids = len(ids)
    chunk_size = len_ids // (n_chunks)
    
    subset_size = 0
    for i in range(n_chunks+1):
        subset_ids = ids[i*chunk_size:(i+1)*chunk_size]
        subset = df.loc[df['customer_ID'].isin(subset_ids)]
        
        if not len(subset_ids):
            del subset, subset_ids
            break
        else:
            
            print(f'file {i+1} has :: {len(subset_ids)} IDs')
                
            for c in continuous:
                subset[c] = subset[c].round(2).astype('float32')
            
            for feat, mapping in label_dict.items():
                if fileType == 'test':
                    subset = subset.apply(uncategorize, axis=0)
                
                subset[feat] = subset[feat].fillna(-12345.0).astype(str).map(mapping)
            
            subset[list(label_dict.keys())] = subset[list(label_dict.keys())].astype('int8')
            
            subset.reset_index(drop=True).to_parquet(f'{fileType}_{i+1}.parquet')

            subset_size += len(subset)
            
            del subset, subset_ids
    
    print(subset_size)
    
    return

## Create Label Dictionary (in Dictionary)

In [None]:
from pprint import pprint

test = pd.read_parquet('test_data.parquet', columns = featCat)
test = test.apply(uncategorize, axis=0)

label_dict = dict()

for f in featCat:
    map_dict = dict() 
    tr_vals = set(train[f].fillna(-12345.0).astype(str).unique().tolist())
    tt_vals = set(test[f].fillna(-12345.0).astype(str).unique().tolist())
    
    vals = tr_vals.union(tt_vals); del tr_vals, tt_vals; gc.collect()

    for i, v in enumerate(list(vals)):
        map_dict[v] = int(i)
    
    label_dict[f] = map_dict

pprint(label_dict)

del test; gc.collect()

{'B_30': {'-12345.0': 1, '0.0': 3, '1.0': 0, '2.0': 2},
 'B_38': {'-12345.0': 3,
          '1.0': 1,
          '2.0': 4,
          '3.0': 7,
          '4.0': 6,
          '5.0': 0,
          '6.0': 5,
          '7.0': 2},
 'D_114': {'-12345.0': 2, '0.0': 1, '1.0': 0},
 'D_116': {'-12345.0': 2, '0.0': 1, '1.0': 0},
 'D_117': {'-1.0': 2,
           '-12345.0': 3,
           '1.0': 1,
           '2.0': 5,
           '3.0': 7,
           '4.0': 6,
           '5.0': 0,
           '6.0': 4},
 'D_120': {'-12345.0': 2, '0.0': 1, '1.0': 0},
 'D_126': {'-1.0': 1, '-12345.0': 2, '0.0': 3, '1.0': 0},
 'D_63': {'CL': 0, 'CO': 1, 'CR': 4, 'XL': 5, 'XM': 3, 'XZ': 2},
 'D_64': {'-1': 1, '-12345.0': 0, 'O': 4, 'R': 3, 'U': 2},
 'D_66': {'-12345.0': 2, '0.0': 1, '1.0': 0},
 'D_68': {'-12345.0': 2,
          '0.0': 6,
          '1.0': 1,
          '2.0': 4,
          '3.0': 7,
          '4.0': 5,
          '5.0': 0,
          '6.0': 3}}


0

## Train Data

In [None]:
divideData(train, continuous=featCont, label_dict=label_dict, n_chunks=6, fileType='train',)
del train; gc.collect(); os.remove('train_data.parquet')

file 1 has :: 76485 IDs
file 2 has :: 76485 IDs
file 3 has :: 76485 IDs
file 4 has :: 76485 IDs
file 5 has :: 76485 IDs
file 6 has :: 76485 IDs
file 7 has :: 3 IDs
5531451


0

## Test Data

In [None]:
test = pd.read_parquet('test_data.parquet')
divideData(test, continuous=featCont, label_dict=label_dict, n_chunks=6, fileType='test',)
del test; gc.collect(); os.remove('test_data.parquet')

file 1 has :: 154103 IDs
file 2 has :: 154103 IDs
file 3 has :: 154103 IDs
file 4 has :: 154103 IDs
file 5 has :: 154103 IDs
file 6 has :: 154103 IDs
file 7 has :: 3 IDs
11363762


0

# Interpolate

In [3]:
trainFiles = glob(f'{DATA_PATH}/raw/train*')
testFiles = glob(f'{DATA_PATH}/raw/test*')

In [5]:
train = pd.read_parquet(trainFiles[0])

featCat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
featCont = [x for x in train if x not in featCat + ['customer_ID', 'target', 'S_2']]
featAll = ['customer_ID', 'target', 'S_2'] + featCont + featCat

del train; gc.collect()

63

In [6]:
def getFile(filePath, features):
    F = pd.read_parquet(filePath, columns=features)
    return F

In [7]:
def interpolateData(filePaths, features, continuous, fileType):

    for i, filePath in enumerate(filePaths):
        TMP = pd.DataFrame()

        print("READ FILES....")

        F = getFile(filePath=filePath, features=features)
        TMP = pd.concat([F, TMP], axis=0)
        del F; gc.collect()

        print("INTERPOLATE....")
        TMP[continuous] = TMP.groupby('customer_ID')[continuous].progress_apply(pd.DataFrame.interpolate).reset_index(drop=True)
        
        TMP.to_parquet(f'{DATA_PATH}/raw_interpolated/{fileType}_{i+1}.parquet')

        del TMP; gc.collect()

    return  

In [8]:
interpolateData(filePaths=trainFiles, 
           features=featAll, 
           continuous=featCont, 
           fileType='train')

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/76485 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
interpolateData(filePaths=testFiles, 
           features=[x for x in featAll if x not in ['target']], 
           continuous=featCont, 
           fileType='test')

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/154103 [00:00<?, ?it/s]

READ FILES....
INTERPOLATE....


  0%|          | 0/3 [00:00<?, ?it/s]