In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import regex as re
import gc
import matplotlib.pyplot as plt
import pyarrow.feather as feather
import seaborn as sns
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


### Data preparation

Taddar for parquet dataset from [here](https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format)

In [2]:
preData  = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet')
preLabel  = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv', low_memory=True)

In [3]:
def process_data(data):
    # Data cleaning: remove >50% na
    cols_with_50pc_missing = [col for col in data.columns if data[col].isna().sum() >  0.5*len(data.index)]
    data = data.drop(columns=cols_with_50pc_missing)

    # Feature engineering: number of transactions
    numTx = data['customer_ID'].value_counts().tolist()
    data = data.groupby('customer_ID').tail(1)
    data = data.reset_index(drop=True)
    data.insert(2,"numTx",numTx)
    data = data.drop(columns=["S_2", "customer_ID"])

    # Data imputation: Fill up na
    for i in data.columns:
        data[i] = data[i].fillna(data[i].mean())
    return data

In [4]:
preData = process_data(preData)

### Training

In [5]:
# Train test split
X_train, X_val, y_train, y_val = train_test_split(preData,preLabel['target'],test_size=0.2, random_state=42)

In [6]:
del preData, preLabel
gc.collect()

42

### CatBoost

In [7]:
# # iter = 100, 18 seconds, 3000+GPU = 2 mins
import catboost as cat
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
clf = CatBoostClassifier(
    iterations=3000,
    task_type="GPU",
    bagging_temperature = 0.2
)
clf.fit(X_train,y_train,eval_set=(X_val,y_val),verbose=True)


Learning rate set to 0.027352
0:	learn: 0.6593681	test: 0.6592803	best: 0.6592803 (0)	total: 22ms	remaining: 1m 5s
1:	learn: 0.6294125	test: 0.6293440	best: 0.6293440 (1)	total: 36ms	remaining: 54s
2:	learn: 0.6003776	test: 0.6003034	best: 0.6003034 (2)	total: 50.3ms	remaining: 50.2s
3:	learn: 0.5752575	test: 0.5751640	best: 0.5751640 (3)	total: 64.7ms	remaining: 48.4s
4:	learn: 0.5528975	test: 0.5528188	best: 0.5528188 (4)	total: 78.9ms	remaining: 47.3s
5:	learn: 0.5321532	test: 0.5320083	best: 0.5320083 (5)	total: 93.4ms	remaining: 46.6s
6:	learn: 0.5129108	test: 0.5127253	best: 0.5127253 (6)	total: 108ms	remaining: 46s
7:	learn: 0.4931580	test: 0.4930032	best: 0.4930032 (7)	total: 122ms	remaining: 45.5s
8:	learn: 0.4766088	test: 0.4764673	best: 0.4764673 (8)	total: 136ms	remaining: 45.2s
9:	learn: 0.4612708	test: 0.4611061	best: 0.4611061 (9)	total: 150ms	remaining: 45s
10:	learn: 0.4477580	test: 0.4475596	best: 0.4475596 (10)	total: 165ms	remaining: 44.8s
11:	learn: 0.4342471	test:

<catboost.core.CatBoostClassifier at 0x7f562cc93950>

In [8]:
prediction = clf.predict_proba(X_val)
rounded_predictions = np.argmax(prediction, axis=-1)
c_matrix = confusion_matrix(y_val,rounded_predictions)
dt_acc = c_matrix.trace()/c_matrix.sum()
print(c_matrix)
print(dt_acc)

[[63832  4408]
 [ 4586 18957]]
0.9020079971236503


In [9]:
#cleanup
del X_train, X_val, y_train, y_val
gc.collect()

42

### Final validation

In [10]:
testData = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet')
testCustomer  = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv', usecols=['customer_ID'], low_memory=True)

### Data preparation

In [11]:
testData = process_data(testData)

### Final Prediction

In [12]:
prediction = clf.predict_proba(testData)
final_predictions = prediction[:,1]

In [13]:
output = pd.DataFrame({'customer_ID': testCustomer.customer_ID, 'prediction': final_predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [14]:
value, count = np.unique(final_predictions,return_counts=True)
print("Final predictions",final_predictions, final_predictions.mean())

Final predictions [0.01362965 0.00240108 0.03969507 ... 0.45161428 0.27881108 0.09665481] 0.2554975732533409
