In [650]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve

In [651]:
df = pd.read_csv('./clean_dataset.csv')

In [652]:
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


In [653]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
dtypes: float64(3), int64(9), object(3)
memory usage: 81.0+ KB


In [654]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


In [655]:
df['Industry'].value_counts()

Energy                   146
Materials                 78
Industrials               64
ConsumerDiscretionary     59
ConsumerStaples           54
Healthcare                53
Financials                51
InformationTechnology     41
CommunicationServices     38
Utilities                 38
Real Estate               30
Education                 25
Research                  10
Transport                  3
Name: Industry, dtype: int64

In [656]:
df['Ethnicity'].value_counts()

White     408
Black     138
Asian      59
Latino     57
Other      28
Name: Ethnicity, dtype: int64

In [657]:
df['Approved'].value_counts()

0    383
1    307
Name: Approved, dtype: int64

In [658]:
numeric_columns = ["Gender", "Age", "Debt", "Married","BankCustomer","YearsEmployed","PriorDefault","Employed","CreditScore",'Income','DriversLicense','ZipCode']

In [659]:
X = df[numeric_columns]
y = df["Approved"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

# Create Baseline dummy model 

In [660]:
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train, y_train)
simple_model.fit(X_test, y_test)


LogisticRegression(max_iter=10000, random_state=42)

In [661]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        cv_X = X if X else self.X
        cv_y = y if y else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

        
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [662]:
dummy_model_results = ModelWithCV(
                        model=dummy_model,
                        model_name='dummy',
                        X=X_train, 
                        y=y_train
)

In [663]:
dummy_model_results.print_cv_summary()

CV Results for `dummy` model:
            0.56290 ± 0.00677 accuracy
        


In [664]:
X = df.drop("Approved", axis=1)
y = df["Approved"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

In [665]:
categorical_feature_names = ['Ethnicity','Industry','Citizen']

In [666]:
def encode_and_concat_feature_train(X_train, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train[[feature_name]]
    ohe.fit(single_feature_df)
    
    # call helper function that actually encodes the feature and concats it
    X_train = encode_and_concat_feature(X_train, feature_name, ohe)
    
    return ohe, X_train

In [667]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0], index=X.index)
    
    # drop the old feature from X and concat the new one-hot encoded df
    X = X.drop(feature_name, axis=1)
    X = pd.concat([X, ohe_df], axis=1)
    
    return X

In [668]:
encoders = {}

for categorical_feature in categorical_feature_names:
    ohe, X_train = encode_and_concat_feature_train(X_train, categorical_feature)
    encoders[categorical_feature] = ohe

In [669]:
X_train.head()



Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industrials,InformationTechnology,Materials,Real Estate,Research,Transport,Utilities,ByBirth,ByOtherMeans,Temporary
194,1,34.5,4.04,0,0,8.5,1,1,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,1,17.67,4.46,1,1,0.25,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
226,1,18.08,5.5,1,1,0.5,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
134,1,32.67,5.5,1,1,5.5,1,1,12,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
360,0,30.25,5.5,1,1,5.5,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [670]:
X_train.rename(columns = {'Real Estate':'Real_Estate'}, inplace = True)


In [671]:
X_train.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industrials,InformationTechnology,Materials,Real_Estate,Research,Transport,Utilities,ByBirth,ByOtherMeans,Temporary
194,1,34.5,4.04,0,0,8.5,1,1,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,1,17.67,4.46,1,1,0.25,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
226,1,18.08,5.5,1,1,0.5,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
134,1,32.67,5.5,1,1,5.5,1,1,12,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
360,0,30.25,5.5,1,1,5.5,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [672]:
X_test.rename(columns = {'Real Estate':'Real_Estate'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.rename(columns = {'Real Estate':'Real_Estate'}, inplace = True)


In [673]:
X_test.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income
358,1,32.42,3.0,1,1,Real Estate,White,0.165,0,0,0,1,ByBirth,120,0
207,1,28.67,9.335,1,1,Materials,Black,5.665,1,1,6,0,ByBirth,381,168
230,1,25.17,3.5,1,1,InformationTechnology,White,0.625,1,1,7,0,ByBirth,0,7059
28,1,57.42,8.5,1,1,Education,Black,7.0,1,1,3,0,ByBirth,0,0
325,0,29.5,1.085,0,0,Utilities,White,1.0,0,0,0,0,ByBirth,280,13


# 2nd Model Logistic Regression

In [675]:
simple_model = LogisticRegression(random_state=42, max_iter=10000)
simple_model.fit(X_train, y_train)


LogisticRegression(max_iter=10000, random_state=42)

In [676]:
simple_model_results = ModelWithCV(
                        model=simple_model,
                        model_name='simple',
                        X=X_train, 
                        y=y_train
)

In [677]:
simple_model_results.print_cv_summary()

CV Results for `simple` model:
            0.86048 ± 0.04305 accuracy
        


# 3rd Model Logisitic Regression with Scaling

In [680]:
#3rd Model Logisitic Regression with Scaling
scaler = StandardScaler()

scaler.fit(X_train)


StandardScaler()

In [681]:
def scale_values(X, scaler):
    """
    Given a DataFrame and a fitted scaler, use the scaler to scale all of the features
    """
    scaled_array = scaler.transform(X)
    scaled_df = pd.DataFrame(scaled_array, columns=X.columns, index=X.index)
    return scaled_df

In [682]:
X_train_scaled = scale_values(X_train, scaler)
X_test_scaled = scale_values(X_test,scaler)

ValueError: could not convert string to float: 'Real Estate'

In [683]:
final_model = LogisticRegression(random_state=42, max_iter=10000)
final_model.fit(X_train_scaled, y_train)
final_model.fit(X_test_scaled,y_test)

LogisticRegression(max_iter=10000, random_state=42)

In [684]:
final_model_results = ModelWithCV(
                            final_model,
                            'all_features_scaled',
                            X_train_scaled,
                            y_train
)

In [685]:
final_model_results.print_cv_summary()

CV Results for `all_features_scaled` model:
            0.83918 ± 0.04358 accuracy
        


In [686]:
final_model_test_results = ModelWithCV(
                            final_model,
                            'all_features_scaled',
                            X_test_scaled,
                            y_test
)

In [687]:
final_model_test_results.print_cv_summary()

CV Results for `all_features_scaled` model:
            0.46340 ± 0.10136 accuracy
        
