# Amex Default Prediction Competition

## This competition aims to identify the card account holders who are most likely to default on their Amex card accounts

In keeping with the anonymity principles of this type of data exchange, the customers' identities, as well as any specifics of their spending, are hidden (encoded away).

### Initial Data input

Of course, without this step there would be no data from which to build any models.

In [63]:
import pandas as pd

# Root folder of the competition CSV files. The reads below are built from it
# so the data location only has to be changed in one place.
dataFolder = "./amex-default-prediction/"
# Column names the author lists as categorical; not used within this cell.
categorical_variables = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Empty accumulator; nothing is appended to it within this cell.
XTrainSegmentList = []

# Only the first 1000 rows of each file are read (nrows=1000), so everything
# downstream operates on a small segment rather than on the full data set.
X_test_segment = pd.read_csv(dataFolder + "test_data.csv", nrows=1000, cache_dates=True)
X_train_segment = pd.read_csv(dataFolder + "train_data.csv", nrows=1000, cache_dates=True)

# Preview the first rows of the training segment.
X_train_segment.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


## Adding train labels

In [53]:
# Load the label table that accompanies the training data.
trainingLabels = pd.read_csv('./amex-default-prediction/train_labels.csv')

# Join on the customer identifier explicitly. Without `on=`, merge() joins on
# every column name the two frames happen to share, so the result would change
# silently if either frame ever gained another common column. `how="inner"` is
# the pandas default, spelled out here for the reader.
X_train_segment = X_train_segment.merge(trainingLabels, on="customer_ID", how="inner")

### Data cleaning 

Now that the data (or at least segments of it) has been read, it is time to clean it.

In [54]:
import seaborn as sns
# `plt` must be the pyplot interface: the plotting helpers in this notebook call
# plt.barh / plt.yticks / plt.title, which do not exist on the top-level
# `matplotlib` package.
import matplotlib.pyplot as plt
import numpy as np

# Matplotlib 3.6 renamed the bundled "seaborn-*" style sheets to
# "seaborn-v0_8-*" and later releases removed the old names, so use whichever
# name this installation provides.
_grid_style = "seaborn-v0_8-whitegrid"
if _grid_style not in plt.style.available:
    _grid_style = "seaborn-whitegrid"
plt.style.use(_grid_style)

# Global figure/axes defaults applied to every plot drawn afterwards.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

## First Model Run

In [55]:
from xgboost import XGBClassifier

# Create an XGBoost classifier with the library's default hyperparameters.
# The model is only instantiated here; it is not fitted in this cell.
model = XGBClassifier()

## Feature selection time

### Hypothesis 1: Mutual Information between spending and balance

Obviously, the amount spent will be correlated with the balance. Therefore, there should be a reasonably strong relationship between the spending and balance variables.

In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_regression

# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Compute mutual-information scores between each column of X and y.

    Works on a copy of ``X``, so the caller's frame is not modified.
    Columns of dtype ``object`` or ``category`` are replaced by the integer
    codes from ``factorize()``. Every column that ends up with an integer
    dtype is passed to ``mutual_info_regression`` as a discrete feature.

    Returns a Series named "MI Scores", indexed by column name and sorted
    from highest to lowest score.
    """
    features = X.copy()

    # Encode text/categorical columns as integer codes.
    for name in features.select_dtypes(["object", "category"]):
        codes, _uniques = features[name].factorize()
        features[name] = codes

    # Integer-typed columns (including the ones just encoded) count as discrete.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]

    raw_scores = mutual_info_regression(
        features,
        y,
        discrete_features=is_discrete,
        random_state=0,  # fixed seed so repeated runs give the same scores
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series indexed by feature name. The bars are drawn
    on the current pyplot figure in ascending order of score, with the index
    entries used as y-axis tick labels. Nothing is returned.
    """
    # Imported locally so the function works even when the module-level name
    # `plt` is bound to the top-level `matplotlib` package, which provides no
    # barh / yticks / title functions.
    import matplotlib.pyplot as pyplot

    scores = scores.sort_values(ascending=True)
    positions = np.arange(len(scores))  # one bar slot per feature
    labels = list(scores.index)
    pyplot.barh(positions, scores)
    pyplot.yticks(positions, labels)
    pyplot.title("Mutual Information Scores")



In [66]:
def filter_data(feature_list, dataframe):
    """Drop, in place, every column whose name matches one of the patterns.

    Each entry of ``feature_list`` is passed to ``DataFrame.filter(regex=...)``
    and the columns it selects are removed from ``dataframe``. The frame is
    modified directly and nothing is returned.
    """
    for pattern in feature_list:
        matching_columns = dataframe.filter(regex=pattern).columns.tolist()
        dataframe.drop(columns=matching_columns, inplace=True)

In [67]:
# filter_data passes each entry to DataFrame.filter(regex=...), so the entries
# are regular expressions, not globs. "B_*" would mean "a B followed by zero or
# more underscores" found anywhere in the name, i.e. any column containing "B".
# "^B_" removes exactly the columns whose names start with "B_".
filter_data(["^B_"], X_train_segment)

In [68]:
# Display the training segment after the filter_data call above.
X_train_segment

Unnamed: 0,customer_ID,S_2,P_2,D_39,R_1,S_3,D_41,D_42,D_43,D_44,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.009228,0.124035,0.008771,,,0.000630,...,,,,0.002427,0.003706,0.003818,,0.000569,0.000610,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.006151,0.126750,0.000798,,,0.002526,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954180,0.091505,0.006815,0.123977,0.007598,,,0.007605,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.001373,0.117169,0.000685,,,0.006406,...,,,,0.006117,0.004516,0.003200,,0.008419,0.006527,0.009600
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.007605,0.117325,0.004653,,,0.007731,...,,,,0.003671,0.004946,0.008889,,0.001670,0.008126,0.009827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,000cfb5aac8db50185898fa111be535e4530149f341a9a...,2017-10-22,0.672121,0.210879,0.006978,0.164024,0.005170,,0.137652,0.256465,...,,,,0.001158,0.007625,0.004627,,0.005460,0.005069,0.007680
996,000cfb5aac8db50185898fa111be535e4530149f341a9a...,2017-11-06,0.686063,0.502023,0.505675,0.159494,0.000384,,0.093951,0.254935,...,,,,0.006789,0.008953,0.002706,,0.006975,0.004687,0.003514
997,000cfb5aac8db50185898fa111be535e4530149f341a9a...,2017-12-11,0.696946,0.624237,0.000166,0.159524,0.003015,,0.082961,0.132943,...,,,,0.000037,0.001151,0.002379,,0.003354,0.004381,0.007556
998,000cfb5aac8db50185898fa111be535e4530149f341a9a...,2018-01-01,0.762449,0.325978,0.002852,0.155224,0.003925,,0.052297,0.001903,...,,,,0.004689,0.001753,0.008883,,0.004239,0.006569,0.008146
