In [77]:
# use this library to build file paths
from pathlib import Path 

# use this library for analysis
import pandas as pd

# use this library to create train/test splits
from sklearn.model_selection import train_test_split

# use these libraries for the creation of the pipeline and transformer types
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # this is cool. Lets you impute missing values automatically
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 

# use this library for the classifier at the end of the model pipeline
from sklearn.ensemble import RandomForestClassifier

In [78]:
# # We need to create a path variable to access and save data. This is also a better way to work with files/folders than just typing in the path. 
# data_dir = Path("..",'Default_Credit_Card_Clients')
# data_path = Path(data_dir,'default of credit card clients.csv')
# df = pd.read_csv(data_path, header=1,index_col = 'ID')

# Commented out the above because it does not work without integration between Drive, Colab, and GitHub.
# When I am ready to set that up, here is a great guide:
# https://towardsdatascience.com/google-drive-google-colab-github-dont-just-read-do-it-5554d5824228

In [79]:
# Build the DataFrame straight from the raw CSV hosted in my GitHub repo.
url = 'https://raw.githubusercontent.com/johnsovo44/Classifier_Models_And_Credit_Cards_Defaults/master/default%20of%20credit%20card%20clients.csv'

# Friendlier, lower-case names for all 25 columns (the id column is dropped below).
columns = (
    'id limit_balance sex education marriage age '
    'pay_1 pay_2 pay_3 pay_4 pay_5 pay_6 '
    'bill_amt_1 bill_amt_2 bill_amt_3 bill_amt_4 bill_amt_5 bill_amt_6 '
    'pay_amt_1 pay_amt_2 pay_amt_3 pay_amt_4 pay_amt_5 pay_amt_6 '
    'default'
).split()

# header=1 uses row index 1 of the file as the header row; those names are
# then overwritten with ours and the id column is discarded.
df = pd.read_csv(url, header=1)
df.columns = columns
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,limit_balance,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt_1,bill_amt_2,bill_amt_3,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [80]:
# Sanity check: (rows, columns) first, then the renamed column labels.
print(df.shape, df.columns, sep='\n')

(30000, 24)
Index(['limit_balance', 'sex', 'education', 'marriage', 'age', 'pay_1',
       'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6', 'bill_amt_1', 'bill_amt_2',
       'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6', 'pay_amt_1',
       'pay_amt_2', 'pay_amt_3', 'pay_amt_4', 'pay_amt_5', 'pay_amt_6',
       'default'],
      dtype='object')


In [81]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   limit_balance  30000 non-null  int64
 1   sex            30000 non-null  int64
 2   education      30000 non-null  int64
 3   marriage       30000 non-null  int64
 4   age            30000 non-null  int64
 5   pay_1          30000 non-null  int64
 6   pay_2          30000 non-null  int64
 7   pay_3          30000 non-null  int64
 8   pay_4          30000 non-null  int64
 9   pay_5          30000 non-null  int64
 10  pay_6          30000 non-null  int64
 11  bill_amt_1     30000 non-null  int64
 12  bill_amt_2     30000 non-null  int64
 13  bill_amt_3     30000 non-null  int64
 14  bill_amt_4     30000 non-null  int64
 15  bill_amt_5     30000 non-null  int64
 16  bill_amt_6     30000 non-null  int64
 17  pay_amt_1      30000 non-null  int64
 18  pay_amt_2      30000 non-null  int64
 19  pay_

The column names are pretty confusing at the moment. For instance, where is pay one (PAY_1)? What is the difference between Pay and Pay Amt, and between Bill Amt and Pay Amt? It seems that Bill Amt is how much the credit card company billed for the month, while Pay Amt is how much the client paid for that billing cycle. This is just a guess.

In [82]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,limit_balance,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt_1,bill_amt_2,bill_amt_3,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


Right off the bat you will notice that it is difficult to interpret the data without a data dictionary. I am contemplating whether it makes more sense to create a readable version of the dataframe or to include the data dictionary within the notebook.


# Train Test Split

The dataset does not come with a training set and a testing set. Before we begin let's create that.

In [83]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [84]:
# Separate the training frame into features (X) and the target (y).
# Bracket access is used for the target so the lookup can never be confused
# with a DataFrame attribute.
X = train.drop(columns=['default'])
y = train['default']

# Split the training data once more. The earlier split already set `test`
# aside, so x_test / y_test here act as a validation set carved out of `train`.
# random_state is fixed (same seed as the first split) so the partition, and
# every score computed from it, is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)

# Q: Is it normal to call train_test_split twice when the dataset does not
# come with its own train and test sets?
# A: Yes - it yields a train / validation / test arrangement. Tune against
# x_test / y_test and keep `test` untouched until the final evaluation.

In [85]:
# Report the shapes of the four pieces produced by the second split.
print(
    f"Target Training: {y_train.shape}",
    f"Target Test {y_test.shape}",
    f"Features Training {x_train.shape}",
    f"Features Test {x_test.shape}",
    sep="\n",
)

Target Training: (14700,)
Target Test (6300,)
Features Training (14700, 23)
Features Test (6300, 23)


## Model Pipeline

For the first pass of the information I am just going to jump right into the model and create a pipeline. Let's see what it spits out without any tuning, cleaning or feature engineering/selection. 

Creating a pipeline has some great benefits:


*   workflow is easy to read
*   keep everything in order
*   make work reproducible

Here is the guide I will be using for the creation of the pipeline by Rebecca Vickery:
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf



In [86]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardise each column. No categorical transformer is defined because the
# frame has no categorical dtypes; further cleaning steps can be added later.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [88]:
# Feature columns to preprocess: every numeric column of `train` except the
# target (the label must not be fed to the model as a feature).
# Selecting on 'number' rather than only 'int64' matters because
# ColumnTransformer drops unlisted columns by default, and a column holding
# NaN is stored as float64 - exactly the columns the imputer exists for.
numeric_features = (
    train.drop(columns=['default'])
         .select_dtypes(include='number')
         .columns
)

# Route those columns through the numeric pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
])

In [89]:
# Baseline model: the preprocessing step followed by a random forest with
# default hyperparameters. RandomForestClassifier is imported in the import
# cell at the top of the notebook.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Seed fixed so the fitted forest, and therefore y_preds, is the same on
    # every run instead of changing with each execution.
    ('classifier', RandomForestClassifier(random_state=42)),
])
rf.fit(x_train, y_train)

# Predictions for the held-out rows of the second split.
y_preds = rf.predict(x_test)