In [1]:
# Widen the notebook container to the full browser width by injecting a CSS rule.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Autogluon model
This notebook generates test-set predictions for the Amex Default Prediction competition, using AutoGluon as a starting AutoML modeling technique.

Test-set prediction is kept in its own notebook to avoid the memory issues that come from trying to do everything in a single notebook.

In [3]:
import gc

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
# `pd` and `np` are used throughout this notebook; import them explicitly instead of
# relying on the fastai star import to provide them.
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from IPython.display import Image, display_svg, SVG
#from dtreeviz.trees import *
from autogluon.tabular import TabularDataset, TabularPredictor

# Default pandas display limits for this notebook: at most 20 rows and 8 columns.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 8)

In [4]:
def display_all(df):
    """Display `df` with the pandas row and column limits temporarily raised to 1000.

    The limits apply only while `df` is being displayed; the previous option
    values are restored when the context manager exits.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)

In [5]:
# Directory holding the parquet files read by this notebook (note the trailing slash).
PATH = "amex-data-integer-dtypes-parquet-format/"

In [6]:
# Load the provided test dataset from the parquet file under PATH.
test_parquet_file = PATH + "/test.parquet"
test_data_df = pd.read_parquet(test_parquet_file)

In [7]:
# Row and column counts of the freshly loaded test frame.
test_data_df.shape

(11363762, 190)

In [9]:
# Fill missing values: back-fill down the rows first, forward-fill whatever is still
# empty, then renumber the rows from 0.
# NOTE(review): the fill runs over the whole frame rather than per customer_ID, so a
# gap can be filled from a neighbouring customer's row. Confirm this matches the
# preprocessing used when the model was trained before changing it.
test_data_df = (
    test_data_df
    .bfill(axis='rows')
    .ffill(axis='rows')
    .reset_index(drop=True)
)

In [11]:
# Peek at 4 randomly chosen rows (no seed is set, so the rows differ between runs).
test_data_df.sample(4)

Unnamed: 0,customer_ID,S_2,P_2,D_39,...,D_142,D_143,D_144,D_145
10894405,f56fdbc843bc02ad4dbc20e61b493d6905126d692eec79bb1e18bb63e61a4b76,2018-11-27,0.87459,21,...,0.576354,0,0.002064,0
9066074,cc3a057fe61c68bcc9865ca319226e2ea1a372769b3115ea0cb10588cc93106f,2018-11-23,0.833374,0,...,0.826212,0,0.003584,0
2459606,376a87a3d47e34a1d40b0c026fe5a7a7c801c086b13a5479782d0b9cb48f4f84,2018-11-29,1.0052,0,...,0.595818,1,0.250139,2
31397,00b513cf1a88a057c5065dc286dcb01a56a831978ea09de8b61c4b41834d0738,2018-09-18,0.155348,0,...,0.062282,1,0.005912,2


In [12]:
# Keep only the last row of each customer_ID group.
# NOTE(review): this is the latest statement only if the rows are already in
# chronological (S_2) order within each customer. TODO confirm.
test_data_df = (
    test_data_df
    .groupby(['customer_ID'], as_index=False)
    .tail(1)
)

In [13]:
# Renumber the remaining rows from 0 and discard the old index.
test_data_df = test_data_df.reset_index(drop=True)

In [14]:
# Row and column counts after keeping one row per customer_ID.
test_data_df.shape

(924621, 190)

In [15]:
# List the column names of the test frame.
test_data_df.columns

Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=190)

## Data Prep

In [16]:
# Cast the listed columns to integers and then to strings, printing the distinct
# values of each column after conversion.
obj_col = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
for col in obj_col:
    as_text = test_data_df[col].astype('int').astype('str')
    test_data_df[col] = as_text
    print(as_text.unique())

['0' '1' '2' '-1']
['2' '3' '6' '1' '7' '5' '4' '-1']
['0' '1' '-1']
['0' '-1' '1']
['0' '4' '5' '3' '7' '6' '-1' '2']
['1' '0' '-1']
['1' '2']
['0' '3' '4' '2' '1' '5']
['3' '0' '2' '-1']
['-1' '1']
['6' '4' '5' '2' '1' '3' '-1']


## Make predictions
Load the best model from training, which is saved on disk, and then generate predictions chunk by chunk, because the test dataset is large.

In [41]:
# Load the trained AutoGluon predictor from disk once and expose it under both names
# used in this notebook. The original loaded the same directory twice, which kept two
# copies of the predictor in memory.
MODEL_DIR = "AutogluonModels/ag-20220924_172812/"
best_predictor = TabularPredictor.load(MODEL_DIR)
predictor = best_predictor

In [42]:
# Name of the model the loaded predictor reports as its best.
predictor.get_model_best()

'WeightedEnsemble_L2_FULL'

In [None]:
# Show the leaderboard (with extra info) using the raised row/column display limits.
display_all(predictor.leaderboard(extra_info=True, silent=True))

In [None]:
# Same leaderboard call, shown with the notebook's default pandas display limits.
predictor.leaderboard(extra_info=True, silent=True)

## Making a submission file

In [43]:
# Run a garbage-collection pass; the result is assigned to `_` so the cell shows no output.
_ = gc.collect()

In [44]:
# Collect the distinct customer IDs, in index order, as a flat array and report the count.
unique_ids = test_data_df[['customer_ID']].drop_duplicates().sort_index()
customers = unique_ids.to_numpy().ravel()
print(f'Total number of customers: {len(customers)}')

Total number of customers: 924621


In [45]:
# Split the test frame into `n` consecutive row chunks so predictions can be generated
# piece by piece. The chunk count of 250 is arbitrary.
#
# The chunks are cut with `.iloc` slices rather than `np.array_split(test_data_df, n)`,
# because array_split on a DataFrame goes through `DataFrame.swapaxes`, which pandas has
# deprecated. The chunk sizes follow the same rule as np.array_split: the first
# `len % n` chunks hold one extra row.
n = 250
base_size, remainder = divmod(len(test_data_df), n)
chunk_sizes = [base_size + 1] * remainder + [base_size] * (n - remainder)
chunk_bounds = np.cumsum([0] + chunk_sizes)
list_df = [
    test_data_df.iloc[start:stop]
    for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])
]

In [46]:
# Number of chunks, and the row count of the second chunk.
len(list_df), len(list_df[1])

(250, 3699)

In [47]:
# Generate prediction probabilities chunk by chunk, then stack the per-chunk results
# into one frame with a fresh 0..N-1 index.
# `.iloc[:, [1]]` keeps only the second probability column (target = 1, the chance of
# default) as a one-column DataFrame. The original `1:3` slice selected the same single
# column for a two-column output but read as if two columns were wanted.
# The unused `y_true` list was removed.
y_pred = [
    best_predictor.predict_proba(df_chunk).iloc[:, [1]]
    for df_chunk in list_df
]
test_preds = pd.concat(y_pred, axis=0, ignore_index=True)

In [48]:
# Display the stacked predictions (truncated by the pandas row limit).
test_preds

Unnamed: 0,1
0,0.060657
1,0.030068
2,0.116480
3,0.349162
4,0.864032
...,...
924616,0.085329
924617,0.693495
924618,0.394760
924619,0.211188


In [49]:
# Set column header for final submissions file: the single probability column
# is renamed to 'prediction' in place, then the first 3 rows are shown.
test_preds.columns = ['prediction']
test_preds.head(3)

Unnamed: 0,prediction
0,0.060657
1,0.030068
2,0.11648


In [50]:
# Sanity check before the predictions are joined to the customer IDs by position:
# both frames must have the same number of rows, otherwise the join would silently
# pair predictions with the wrong customers or produce NaN rows.
print(len(test_preds),len(test_data_df))
assert len(test_preds) == len(test_data_df), "prediction rows do not match test rows"

924621 924621


In [51]:
# Put the customer IDs and their predictions side by side.
# Both frames get a fresh 0..N-1 index first, so the column-wise concat lines rows up
# by position. The earlier empty-DataFrame assignment to `test_preds_df` was dead code
# (immediately overwritten) and has been removed.
test_customers_df = test_data_df[['customer_ID']].reset_index(drop=True)
test_preds = test_preds.reset_index(drop=True)
test_preds_df = pd.concat([test_customers_df, test_preds], axis=1)

In [52]:
# Run a garbage-collection pass; the cell output is the value gc.collect() returns.
gc.collect()

4490

In [53]:
# Row and column counts of the combined customer_ID / prediction frame.
test_preds_df.shape

(924621, 2)

In [54]:
# Write the customer_ID / prediction table to a CSV file, leaving out the index column.
submission_path = "final_submission.csv"
test_preds_df.to_csv(submission_path, index=False)

In [55]:
# Preview the first 4 rows of the submission frame.
test_preds_df.head(4)

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.060657
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.030068
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.11648
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.349162


In [56]:
!kaggle competitions submit -c amex-default-prediction -f submission_ag_v10.csv -m "Submission v10"

100%|██████████████████████████████████████| 74.9M/74.9M [00:05<00:00, 13.2MB/s]
Successfully submitted to American Express - Default Prediction