## Importing packages

In [1]:
! pip3 install seaborn 
! pip3 install scipy
! pip3 install matplotlib
! pip3 install sklearn
! pip3 install warnings
! pip3 install statsmodels
! pip3 install patsy




In [2]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
from matplotlib import cm

from scipy.special import inv_boxcox

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_confusion_matrix


import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from patsy import dmatrices
import statsmodels.api as sm

from scipy.stats import chi2_contingency

ModuleNotFoundError: No module named 'statsmodels'

## Loading data

Loading the raw dataset and standardizing the column headers before we move on. X (features) and y (target variable) are defined later, when we prepare the data to fit the model.

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/creditcardmarketing.csv")

In [None]:
def clean_headers(df):
    """Normalize the column labels of ``df`` in place.

    Every label is lower-cased, spaces are replaced by underscores and ``#``
    is replaced by ``n``. The frame's ``columns`` attribute is reassigned;
    nothing is returned.
    """
    df.columns = [
        label.lower().replace(" ", "_").replace("#", "n")
        for label in df.columns
    ]


In [None]:
# Standardize the headers; clean_headers reassigns df.columns in place.
clean_headers(df)

## Exploring data

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Show every column when a frame is displayed (same setting as
# `pd.options.display.max_columns = None` in the imports cell).
pd.set_option('display.max_columns', None)
# Preview the first three rows.
df.head(3)

### Exploring Categoricals

In [None]:
# Keep only the object-dtype (categorical) columns and preview the first rows.
df_cat = df.select_dtypes(include='object')
df_cat.head(5)

#### Let's see the number of distinct values each column has and what % of the total values belongs to each type.

In [None]:
def cat_exploration(df):
    """Print value shares and draw a count plot for every categorical column.

    For each ``object``-dtype column of ``df`` this prints the column name and
    the percentage (rounded to one decimal) of rows taken by each distinct
    value, then shows a bar chart of the raw counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore; only its ``object`` columns are used.

    Returns
    -------
    None
    """
    cat = df.select_dtypes('object')
    # The seaborn style is global state, so set it once rather than per column.
    sns.set_style("darkgrid")
    for c in cat.columns:
        print(c)
        print(cat[c].value_counts(normalize=True).mul(100).round(1))
        fig, ax = plt.subplots(1, 1, figsize=(7, 4))
        # Pass the column as `x=` and draw on the axes created above: recent
        # seaborn releases interpret the first positional argument as `data`.
        sns.countplot(x=cat[c], color='gray', ax=ax)
        plt.show()

In [None]:
# Print the value shares and show a count plot for each object-dtype column.
cat_exploration(df)

Takeaways:
- There isn't any categorical column with too many different value types which might require bucketing.
- The largest imbalance occurs with our target variable: "offer_accepted"

### Exploring numericals

In [None]:
def num_exploration(df):
    """Plot the distribution of every numeric column and summarize them.

    For each numeric column of ``df`` a histogram with a KDE overlay and a box
    plot are shown side by side. Afterwards a pair plot of all numeric columns
    is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore; only its numeric columns are used.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` of the numeric columns, rounded to two decimals.
    """
    num = df.select_dtypes('number')
    # Set the (global) style before the first figure is created so that every
    # figure, including the first one, is drawn with it.
    sns.set_style("dark")
    for c in num.columns:
        print(c)
        fig, axes = plt.subplots(1, 2, figsize=(10, 3))
        # `distplot` is deprecated; a density-scaled histogram with a KDE
        # overlay is its documented replacement.
        sns.histplot(x=num[c], kde=True, stat="density", ax=axes[0], color='gray')
        # Keyword `x=` avoids the Series being interpreted as `data`.
        sns.boxplot(x=num[c], ax=axes[1], color='gray')
        plt.show()
    sns.pairplot(num)
    plt.show()
    return num.describe().round(2)

In [None]:
# Show the per-column plots and the pair plot; the returned summary table is the cell output.
num_exploration(df)

## Check correlations:

We check the correlations in the data, both on the original dataframe and on modified copies in which highly correlated columns have been dropped. We will later see how they affect the model differently.

We create the different dfs

In [None]:
# df1: the original frame without the yearly average balance.
df1 = df.drop(columns=['average_balance'])
# df2: the original frame without the four quarterly balances.
quarterly_balances = ['q1_balance', 'q2_balance', 'q3_balance', 'q4_balance']
df2 = df.drop(columns=quarterly_balances)


We define the three correlation matrices

In [None]:
# Pearson correlation is only defined for numeric columns. Select them
# explicitly: older pandas dropped non-numeric columns silently, while
# pandas >= 2.0 raises when the frame still contains object columns.
corr_matrix = df.select_dtypes('number').corr(method='pearson')
corr_matrix1 = df1.select_dtypes('number').corr(method='pearson')
corr_matrix2 = df2.select_dtypes('number').corr(method='pearson')

We plot them:

In [None]:
# One annotated heatmap per correlation matrix, stacked vertically.
fig, axes = plt.subplots(3, 1, figsize=(10, 27))
ax1, ax2, ax3 = axes

panels = [
    (ax1, corr_matrix, 'corr_matrix: df'),
    (ax2, corr_matrix1, 'corr_matrix1: df1'),
    (ax3, corr_matrix2, 'corr_matrix2: df2'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, cmap="YlGnBu")
    panel_ax.set_title(panel_title)

# Extra padding keeps the titles clear of the neighbouring panels.
fig.tight_layout(pad=3.0)
plt.show()

### Checking null values:

In [None]:
def checking_nulls(df):
    """Report columns with missing values and return a sample of affected rows.

    Prints one line for every column of ``df`` that contains at least one
    null, then returns the first rows (``head()``) of the sub-frame made of
    the rows that have a null in any column.
    """
    for column in df.columns:
        missing = df[column].isna().sum()
        if missing <= 0:
            continue
        print ("The column ", column, " has ", missing, " null values")
    rows_with_nulls = df.loc[df.isnull().any(axis=1)]
    return rows_with_nulls.head()

In [None]:
# Print the columns that contain nulls and display the first affected rows.
checking_nulls(df)

We will deal with null values later.

## Cleaning data

### Dealing with irrelevant columns

customer_number is important for identifying the predictions, but I won't use it as a feature (every value is unique), so it becomes the index.


In [None]:
# Move the identifier out of the feature columns and into the index.
# NOTE: this cell is not idempotent; once the column has become the index,
# running it again raises KeyError because 'customer_number' is no longer a column.
df = df.set_index('customer_number')

### Dealing with null values

- We will replace null values with the mean, as the distribution of the columns that contain null values is roughly normal (see previous graphs)

In [None]:
def replace_nulls_mean(df):
    """Fill missing values of numeric columns with the column mean.

    The frame is modified in place (each affected column is reassigned) and
    also returned, so both ``replace_nulls_mean(df)`` and
    ``df = replace_nulls_mean(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns may contain nulls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with numeric nulls replaced by the mean of
        their column. Non-numeric columns are left untouched because a mean
        is not defined for them.
    """
    for c in df.columns:
        column = df[c]
        if not pd.api.types.is_numeric_dtype(column):
            continue
        if column.isnull().any():
            # Assign the filled column back instead of calling
            # `df[c].fillna(..., inplace=True)`: the chained in-place form acts
            # on an intermediate object and does not update `df` when pandas
            # copy-on-write is active.
            df[c] = column.fillna(column.mean())
    return df

In [None]:
# Impute the nulls with the column mean, then re-run the null report to
# see which columns (if any) still contain missing values.
df = replace_nulls_mean(df)
checking_nulls(df)

## Multicollinearity analysis

In [None]:
# Right-hand-side terms of the patsy formula, one per predictor column.
predictor_columns = [
    'reward', 'mailer_type', 'income_level',
    'n_bank_accounts_open', 'overdraft_protection', 'credit_rating',
    'n_credit_cards_held', 'n_homes_owned', 'household_size',
    'own_your_home', 'average_balance', 'q1_balance', 'q2_balance',
    'q3_balance', 'q4_balance',
]
features = "+".join(predictor_columns)

# Build the response and design matrices as DataFrames from the formula.
formula = 'offer_accepted ~' + features
y_enc, X_enc = dmatrices(formula, df, return_type='dataframe')

In [None]:
# Variance inflation factor of every column of the design matrix.
design_values = X_enc.values
n_terms = X_enc.shape[1]
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(design_values, idx) for idx in range(n_terms)],
    "features": X_enc.columns,
})

vif.round(2)

## Chi Square test

In [None]:
def chi_square_test(df):
    """Print a chi-square test of independence for each pair of categorical columns.

    Every unordered pair of ``object``-dtype columns of ``df`` is cross
    tabulated and passed to ``scipy.stats.chi2_contingency`` (without Yates'
    correction). The statistic, degrees of freedom, p-value and the table of
    expected frequencies are printed.

    Each pair is tested once: the statistic, degrees of freedom and p-value
    are identical for (i, j) and (j, i), so the mirrored pair adds nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns are tested against each other.

    Returns
    -------
    None
    """
    cols = list(df.select_dtypes('object').columns)
    for pos, i in enumerate(cols):
        for j in cols[pos + 1:]:
            data_crosstab = pd.crosstab(df[i], df[j], margins = False)
            chi2_stat, p_val, dof, expected = stats.chi2_contingency(data_crosstab, correction=False)
            print("The chi2_contingency for ", i, " and ", j," is:" )
            print("===Chi2 Stat===")
            print(chi2_stat)
            print("===Degrees of Freedom===")
            print(dof)
            print("===P-Value===")
            print(float(p_val))
            # Fixed-point rendering so very small p-values are not shown in
            # scientific notation.
            print ("{:.60f}".format(float(p_val)))
            # The fourth value returned by chi2_contingency is the table of
            # expected frequencies, not the observed contingency table.
            print("===Expected Frequencies===")
            print(expected)
            print("\n")

In [None]:
# Print the chi-square test results for the object-dtype columns of df.
chi_square_test(df)

## Preparing data to fit the model

In [None]:
# Target: whether the offer was accepted. Features: every other column.
y = df['offer_accepted']
X = df.drop(columns=['offer_accepted'])

Separating numericals from categoricals in X

In [None]:
# Numeric features are used as they are; object-dtype features get encoded next.
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(include='object')

Encoding categoricals

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each column so the remaining dummies are not perfectly collinear.
X_cat_enc = pd.get_dummies(X_cat, drop_first=True)
# Encode the target the same way. get_dummies returns a DataFrame, so y_enc is
# two-dimensional here (presumably a single column for a binary target - TODO confirm).
# NOTE: this rebinds y_enc, which the multicollinearity cell assigned from dmatrices.
y_enc = pd.get_dummies(y, drop_first=True)

In [None]:
# Stack the numeric block and the dummy block side by side into one array
# (column labels are not carried over into the NumPy result).
X_model = np.hstack((X_num, X_cat_enc))
y_model = y_enc

## Create Training and Test Sets and Apply Scaling

In [None]:
# Hold out a test set; random_state=0 makes the shuffle reproducible and the
# split proportions are scikit-learn's defaults.
# NOTE(review): the section title mentions scaling, but no scaler is applied in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, random_state=0)

## Building models

### Logistic regression

In [None]:
# Fit a logistic regression with scikit-learn's default hyperparameters on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Mean accuracy on both splits, reported with two decimals.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)

train_message = 'Accuracy of Logistic regression classifier on training set: {:.2f}'
test_message = 'Accuracy of Logistic regression classifier on test set: {:.2f}'
print(train_message.format(train_accuracy))
print(test_message.format(test_accuracy))

In [None]:
# Keep the test-set accuracy; it is reused as the title of the confusion-matrix plot.
score = logreg.score(X_test, y_test)
score

In [None]:
# Hard class predictions for the test split.
y_pred = logreg.predict(X_test)

In [None]:
# Import under an alias so that assigning the result to `confusion_matrix`
# does not overwrite the scikit-learn function itself; the function stays
# callable as `sk_confusion_matrix` after this cell has run.
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

# Rows are the actual classes, columns the predicted classes.
confusion_matrix = sk_confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
# Heatmap of the confusion matrix, titled with the test-set accuracy.
fig, ax = plt.subplots(figsize=(9, 9))
# The cells hold integer counts, so annotate them as integers instead of
# with three decimals.
sns.heatmap(confusion_matrix, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
ax.set_title(all_sample_title, size=15);

In [None]:
# Per-class precision, recall and F1 on the test split (printed because the report is a plain-text table).
print(classification_report(y_test, y_pred))

In [None]:
# Probability of the positive class for every test sample.
y_score = logreg.predict_proba(X_test)[:, 1]

# Compute the AUC from the same scores that are used to draw the curve.
# Scoring the hard 0/1 predictions instead would reduce the ROC to a single
# operating point and report an area that does not match the plotted curve.
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to 'Log_ROC' in the working directory.
plt.savefig('Log_ROC')
plt.show()