In [None]:
# import the appropriate libraries
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# The dataset comes from https://www.kaggle.com/arjunbhasin2013/ccdata
# Edit DATA_PATH so that it points at your local copy of the CSV file.
DATA_PATH = "~/Desktop/CC GENERAL.csv"
df = pd.read_csv(DATA_PATH)

# Data dictionary
https://www.kaggle.com/arjunbhasin2013/ccdata

<ul>
<li>CUST_ID : Identification of Credit Card holder (Categorical)</li>
<li>BALANCE : Balance amount left in their account to make purchases</li>
<li>BALANCE_FREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)</li>
<li>PURCHASES : Amount of purchases made from account</li>
<li>ONEOFF_PURCHASES : Maximum purchase amount done in one-go</li>
<li>INSTALLMENTS_PURCHASES : Amount of purchase done in installment</li>
<li>CASH_ADVANCE : Cash in advance given by the user</li>
<li>PURCHASES_FREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)</li>
<li>ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)</li>
<li>PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)</li>
<li>CASHADVANCEFREQUENCY : How frequently the cash in advance is being paid</li>
<li>CASHADVANCETRX : Number of Transactions made with "Cash in Advanced"</li>
<li>PURCHASES_TRX : Number of purchase transactions made</li>
<li>CREDIT_LIMIT : Limit of Credit Card for user</li>
<li>PAYMENTS : Amount of Payment done by user</li>
<li>MINIMUM_PAYMENTS : Minimum amount of payments made by user</li>
<li>PRCFULLPAYMENT : Percent of full payment paid by user</li>
<li>TENURE : Tenure of credit card service for user</li>
</ul>

In [None]:
# Normalise the column headers: trim whitespace, lower-case, turn spaces and
# hyphens into underscores and drop parentheses.  regex=False makes every
# replacement a literal one -- '(' and ')' are regex metacharacters, so we do
# not want to depend on the pandas version's default for that flag.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
)

# Drop every row that has at least one missing value.  The explicit .copy()
# gives us an independent frame, so the new columns added in later cells do
# not trigger the SettingWithCopyWarning some pandas versions emit here.
df = df.dropna().copy()

# Show the cleaned column names together with a few rows (not the whole frame).
df.head()

In [None]:
# let's explore the variables to see what might be interesting
# are there any variables that need to be transformed because
# they are not normally distributed?  (e.g. balance, purchases)
#
# np.log1p(x) computes log(1 + x): the +1 shift keeps zero values finite,
# and log1p stays accurate when x is very small.
df['ln_balance'] = np.log1p(df['balance'])
df['ln_purchases'] = np.log1p(df['purchases'])

plt.scatter(df['ln_balance'], df['ln_purchases'])
plt.xlabel('ln(balance + 1)')
plt.ylabel('ln(purchases + 1)')
plt.title('Purchases vs. balance (log scale)')
plt.show()

In [None]:
# change one of the variables to a binary variable to put
# the users into two buckets, then colour the earlier scatter plot
# by bucket to see how the two buckets are different
#
# NOTE: the flag is 1 when installments_purchases == 0 and 0 otherwise;
# the column name 'paid_in_full' is kept because later cells refer to it.
df['paid_in_full'] = np.where(df['installments_purchases'] == 0, 1, 0)

# The flag only takes the values 0 and 1, so one np.where is enough:
# blue for bucket 0, red for everything else (i.e. bucket 1).
col = np.where(df['paid_in_full'] == 0, 'b', 'r')
plt.scatter(df['ln_balance'], df['ln_purchases'], c=col)
plt.xlabel('ln_balance')
plt.ylabel('ln_purchases')
plt.title('paid_in_full: red = 1, blue = 0')
plt.show()

In [None]:
# we now want to take the full sample and build and test a model
# accordingly, we need to randomly take roughly 80% of the sample
# to help train the model and then have the remaining ~20% for a
# testing sample; the way we will do this is to draw one uniform random
# number per row, put the rows whose draw is below 0.8 into a training
# dataframe and put the rest into a testing dataframe
#
# The generator is seeded so that the split -- and therefore every number
# reported further down -- is the same each time the notebook is run.
SEED = 42
rng = np.random.default_rng(SEED)

# one vectorised draw for the whole frame instead of one call per row
df['mask'] = rng.uniform(0, 1, size=len(df))
train = df[df['mask'] < 0.8]
test = df[df['mask'] >= 0.8]

# every row must end up in exactly one of the two samples
assert len(train) + len(test) == len(df)

test.head()

In [None]:
# let's create a logistic regression trying to predict our dependent
# binary variable
YTrain = train[['paid_in_full']]
XTrain = train[['ln_balance', 'ln_purchases']]

# sm.Logit(endog, exog) does NOT add an intercept on its own, which would
# force the fitted curve through probability 0.5 when both predictors are 0.
# The formula interface adds the intercept for us, and it also re-applies
# the formula at prediction time, so LogisticModel.predict() can still be
# given a dataframe that holds just the two predictor columns.
LogisticModel = sm.Logit.from_formula(
    'paid_in_full ~ ln_balance + ln_purchases',
    data=XTrain.join(YTrain),
).fit()
print(LogisticModel.summary())

In [None]:
# Score the held-out testing sample with the fitted model.
YTest = test[['paid_in_full']]
XTest = test[['ln_balance', 'ln_purchases']]
YPred = LogisticModel.predict(XTest)

# Put actual and predicted values side by side; the prediction column
# comes out of the concat labelled 0, so give it a readable name.
compare = pd.concat([YTest, YPred], axis=1).rename(columns={0: 'prediction'})

# actual minus predicted -- similar to the examination of the residuals
compare['difference'] = compare['paid_in_full'].sub(compare['prediction'])
compare.sort_values(by='difference')

In [None]:
# kernel density estimate of the (actual - predicted) differences
compare['difference'].plot(kind='kde')

In [None]:
# Notice that by sorting the dataframe called 'compare' by the difference
# between the actual and the predicted value, we can see the false positives
# at the top and the false negatives at the bottom.
# Let's see if we can actually compute the number of:
# true positives
# true negatives
# false positives
# false negatives
#
# we create a function to examine the difference between the
# predicted and actual

def rating(difference):
    """Classify one observation from its (actual - predicted) difference.

    The caller computes ``difference = actual - prediction`` where ``actual``
    is the 0/1 label and ``prediction`` is the predicted probability of the
    label being 1.  A probability of 0.5 or more counts as a positive
    prediction, which gives these bands:

        0.5  <  difference <=  1    actual 1, predicted negative -> "false negative"
        0    <= difference <=  0.5  actual 1, predicted positive -> "true positive"
        -0.5 <  difference <   0    actual 0, predicted negative -> "true negative"
        -1   <= difference <= -0.5  actual 0, predicted positive -> "false positive"

    A difference of exactly 0 is reported as "true positive"; it cannot be
    told apart from actual 0 with a predicted probability of exactly 0.

    Parameters
    ----------
    difference : float
        Actual label minus predicted probability, expected in [-1, 1].

    Returns
    -------
    str
        One of "true positive", "true negative", "false positive",
        "false negative".

    Raises
    ------
    ValueError
        If ``difference`` is NaN or lies outside [-1, 1]; such a value cannot
        come from a 0/1 label and a probability, and would otherwise be
        silently counted as a "false positive".
    """
    # NaN fails every comparison, so this single check rejects it as well.
    if not -1 <= difference <= 1:
        raise ValueError(
            "difference must lie in [-1, 1], got {!r}".format(difference)
        )

    if difference > 0.5:
        return "false negative"
    if difference >= 0:
        return "true positive"
    # Strict '>' so that a probability of exactly 0.5 is a positive prediction
    # here too, matching the 'true positive' band above.
    if difference > -0.5:
        return "true negative"
    return "false positive"

In [None]:
# Label every row of the comparison table with the rating function ...
compare['type'] = compare['difference'].apply(rating)

# ... and tally how many rows fall into each of the four outcomes.
outcome = compare['type']
true_positives = (outcome == "true positive").sum()
true_negatives = (outcome == "true negative").sum()
false_positives = (outcome == "false positive").sum()
false_negatives = (outcome == "false negative").sum()

for label, n_rows in (
    ("True positives: ", true_positives),
    ("True negatives: ", true_negatives),
    ("False positives: ", false_positives),
    ("False negatives: ", false_negatives),
):
    print(label, n_rows)

In [None]:
# precision: share of the predicted positives that really are positive
precision = true_positives / (true_positives + false_positives)
print("Precision: ", precision)

# recall: share of the actual positives that the model found
recall = true_positives / (true_positives + false_negatives)
print("Recall: ", recall)