In [2]:
#Import necessary packages, some of these are used for making linear and logistic regression models,
#so I though I would put it up here

import pandas as pd
import numpy as np
from random import random
from random import seed
seed(1)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import scipy.stats as stats

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)},font_scale=1)
from matplotlib import pyplot as plt

%matplotlib inline
sns.set()

Some times, there is data is a dataset that cannot be interpreted correctly by a model. Generally, this can be attributed to categorical data, which usually come in the form of strings or numbers whose relative values to each other do not matter. For example, in our Credit Card Default Data set, the education column is broken in "1,2,3,4", but the number themselves have no value because they represent something else, "high school, bachelors, masters, and PhD". This means that we need to create dummy variables in the pandas dataframe that the model can interpret.

We would first import our data and put it into a dataframe

In [3]:
data = pd.read_csv('/Users/Jesse/Desktop/insight_team/CreditCardDefault.csv')
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
df = pd.DataFrame(data)

Then, we would take a column, and then turn it into a dummy categorical data. Essentially, what this does is that it makes a separate column for each possible answer previously in that column. So, in education, it would make 'education_1', 'education_2',... etc. for all the possible responses in education. So if education had number 1 through 5, then there would be 5 columns made. Then each column stands for a specific response. So, 'education_1' will only say true if that person's education was 1. 'education_2' will only say true if that person's education was 2. The rest of the columns would then say false. The way the dataframe says true and false is through 1's and 0's. So if that column has a 1 in it, it means that person has that level of education.

In [14]:
#Here is what the code looks like, I'll use education as the example

df['EDUCATION']=pd.Categorical(df['EDUCATION'])
dfEDUCATIONDummies = pd.get_dummies(df['EDUCATION'],prefix='EDUCATION', drop_first=True)
df = pd.concat([df,dfEDUCATIONDummies],axis=1)

Here, we created a dummy variable for the categorical variable: education. So, the first line of code shows us taking the education column from the dataframe and making it categorical. Then we create our own dataframe called dfEDUCATIONDummies and make it a dataframe with the dummy categorical variable. Then we use the pd.concat function to add the dummy variable back to the original dataframe.

Finally, we would delete the old non-dummy education variable from the dataframe because the model cannot interpret it and it would be redundant. we would use the pd.drop function and also create our target variable for the model to train toward.

In [15]:
df.drop(['EDUCATION'],axis=1,inplace=True)

target = data['default payment next month']

In [16]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,EDUCATION_1,EDUCATION_2,EDUCATION_3.1,EDUCATION_4.1,EDUCATION_5.1,EDUCATION_6.1
0,1,20000,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,0,1,0,0,0,0
1,2,120000,2,2,26,-1,2,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,90000,2,2,34,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,50000,2,1,37,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,50000,1,1,57,-1,0,-1,0,0,...,0,0,0,0,0,1,0,0,0,0


If you scroll over, you will see that education is no longer at the beginning of the dataframe, rather EDUCATION_1,	EDUCATION_2, EDUCATION_3, EDUCATION_4, EDUCATION_5, and EDUCATION_6 are at the end of the dataframe. Notice how the columns are only composed of 0's and 1's.