In [3]:
import pandas as pd
import numpy as np

In [1]:
import sys
!{sys.executable} -m pip install xlrd

Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1
You should consider upgrading via the '/opt/homebrew/Cellar/jupyterlab/3.4.3/libexec/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

## Let's talk about a key piece of the data munging process: Missing data.

The wine data we've been working with has been really nice :) No missing values, etc.

Let's move to another interesting data set, and start to make things a little harder.

In [4]:
##we'll import a fresh data set via a URL

data = pd.read_excel('https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls')

In [5]:
#display the first few rows

data.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


Right away what do you notice? Check the Index against the first column. Seems a little duplicative.

In [6]:
#re-import the data and use the first column as an Index, or just re-index.
#data.set_index(keys='Unnamed: 0', inplace=True) either will work
data = pd.read_excel('https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', index_col=0)

In [7]:
#show the column names
data.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21',
       'X22', 'X23', 'Y'],
      dtype='object')

In [8]:
#show the first few rows to confirm you've re-indexed correctly

data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


What else still seems weird?

In [9]:
#assign the column names to the values in the first row. It's confusing to have two sets of column names

data.columns = data.iloc[0]

In [8]:
#now drop the first row and get rid of it. now you should have a nice clean df. just a nice index on the left, and 
#column names that make sense up top.

data.drop(data.index[0], inplace=True)

In [10]:
#take a peek at the data and confirm you've done this correctly
data.head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


Ok, now let's get some value counts. We want to build an understanding of how many null values we're dealing with here. Before we go any further though. Let's see what datatypes we're dealing with.

In [11]:
data.dtypes

ID
LIMIT_BAL                     object
SEX                           object
EDUCATION                     object
MARRIAGE                      object
AGE                           object
PAY_0                         object
PAY_2                         object
PAY_3                         object
PAY_4                         object
PAY_5                         object
PAY_6                         object
BILL_AMT1                     object
BILL_AMT2                     object
BILL_AMT3                     object
BILL_AMT4                     object
BILL_AMT5                     object
BILL_AMT6                     object
PAY_AMT1                      object
PAY_AMT2                      object
PAY_AMT3                      object
PAY_AMT4                      object
PAY_AMT5                      object
PAY_AMT6                      object
default payment next month    object
dtype: object

Everything was read as an Object. Please pause here and take moment to read online about why DataFrames infer the dtype of Object sometimes. This is an awesome article. If you read it carefully, you'll start to understand the various Pandas dtypes.

https://pbpython.com/pandas_dtypes.html

As a quick example, let's try to take some summary stats:

In [12]:
#display summary stats for the dataframe

data.describe()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,30001,30001,30001,30001,30001,30001,30001,30001,30001,30001,...,30001,30001,30001,30001,30001,30001,30001,30001,30001,30001
unique,82,3,8,5,57,12,12,12,12,11,...,21549,21011,20605,7944,7900,7519,6938,6898,6940,3
top,50000,2,2,2,29,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
freq,3365,18112,14030,15964,1605,14737,15730,15764,16455,16947,...,3195,3506,4020,5249,5396,5968,6408,6703,7173,23364


The summary stats you find above are probably not what you're expecting, right?

Seems like pandas is interpreting these variables as categorical instead of continuous. Let's fix that.

In [13]:
data = data.apply(pd.to_numeric)

ValueError: Unable to parse string "LIMIT_BAL" at position 0

In [14]:
#check the dtypes again and see if they've converted nicely

data.describe()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,30001,30001,30001,30001,30001,30001,30001,30001,30001,30001,...,30001,30001,30001,30001,30001,30001,30001,30001,30001,30001
unique,82,3,8,5,57,12,12,12,12,11,...,21549,21011,20605,7944,7900,7519,6938,6898,6940,3
top,50000,2,2,2,29,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
freq,3365,18112,14030,15964,1605,14737,15730,15764,16455,16947,...,3195,3506,4020,5249,5396,5968,6408,6703,7173,23364


Looks like we now have a bunch of nice summary stats. Let's move on.

In [15]:
#display the number of na values for each column

data.isna().sum()

ID
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

Looks like we got lucky again :) Let's make things harder on ourselves and randomly apply NA values to this dataframe.

Let's do this using a mask for the dataframe.

In [16]:
#create a masking array that is a random layout of 75% false values, and 25% true. feel free to do some reading
#on the various ways of doing this.

mask = np.random.random(data.shape) < 0.25

In [17]:
#now, apply that mask to our data. let's create a new masked dataframe, instead of editing the old one:

data_with_nans = data.mask(mask)

In [17]:
#confirm that the percentage of nans we added is ~ 25%

data_with_nans.isna().sum()/len(data)

ID
LIMIT_BAL                     0.252533
SEX                           0.247567
EDUCATION                     0.244833
MARRIAGE                      0.248833
AGE                           0.252033
PAY_0                         0.253300
PAY_2                         0.248533
PAY_3                         0.248667
PAY_4                         0.250667
PAY_5                         0.252267
PAY_6                         0.249500
BILL_AMT1                     0.246533
BILL_AMT2                     0.248133
BILL_AMT3                     0.253167
BILL_AMT4                     0.248133
BILL_AMT5                     0.251233
BILL_AMT6                     0.244733
PAY_AMT1                      0.251800
PAY_AMT2                      0.250367
PAY_AMT3                      0.249367
PAY_AMT4                      0.247800
PAY_AMT5                      0.253233
PAY_AMT6                      0.247667
default payment next month    0.247967
dtype: float64

In [18]:
#show your newly created dataframe with the NaNs

data_with_nans.head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,
1,20000.0,2,2,1,24,2,2,-1,-1,-2,...,,,0,0,689,0,0,0,0,1.0
2,120000.0,2,2,2,26,-1,2,,0,0,...,3272,3455,,0,1000,1000,1000,0,2000,1.0
3,90000.0,2,,2,,,0,0,0,0,...,14331,14948,,,,,1000,1000,5000,0.0
4,50000.0,,2,1,,0,,,,,...,28314,28959,29547,2000,2019,1200,,,1000,0.0


Its worth mentioning here, that you'd almost never add NaNs to your data on purpose. We're just doing here to show what it's like to work with missing data.

In general, there are two basic ways of handling missing data:
 - Drop the rows with missing data. This is generally only the right answer if you wouldn't lose much of the dataset by doing so, and you think the rows containing NaNs are randomly distributed.
 
 
 - Impute some value to the NaNs. In most cases, you might consider imputing the average value of the column to the value that is missing, or depending on the nature of the data, you might impute something else (0, 1, True, False, mode, median, interpolated values from the preceding and following rows, forward filling, back filling, etc)
 
For our data above we know that about 25% of each column is NaN. This does NOT mean that if we dropped all NaNs we'd only drop 25% of rows. Let's figure out what we'd be left with if we dropped every row that doesn't contain full data.

In [19]:
#this is trivial to do in Pandas. Drop the rows with NaN values below.

non_nans = data_with_nans.dropna()

In [20]:
#print a statement that shows how many rows we started with and how many we're left with after dropping NaNs

print('The df had {} rows, but we we dropped all rows containing NaNs, \
it was reduced to {} rows.'.format(len(data_with_nans), len(non_nans)))

The df had 30001 rows, but we we dropped all rows containing NaNs, it was reduced to 36 rows.


It should be very apparent that dropping all rows containing NaNs isn't a good option here. Instead let's move on and replace each NaN with the average value for it's row.

In [21]:
#show the the means of each column:

data_with_nans.mean()

  data_with_nans.mean()


ID
LIMIT_BAL                     167216.259814
BILL_AMT2                      49450.581743
default payment next month         0.221853
dtype: float64

In [22]:
filled_with_mean = data_with_nans.fillna(data_with_nans.mean())

  filled_with_mean = data_with_nans.fillna(data_with_nans.mean())


In [23]:
#Show the first few rows of the new df. Confirm the values that were NaN are now filled with the mean for the column.

filled_with_mean.head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,167216.259814,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,0.221853
1,20000.0,2,2,1,24,2,2,-1,-1,-2,...,,,0,0,689,0,0,0,0,1.0
2,120000.0,2,2,2,26,-1,2,,0,0,...,3272,3455,,0,1000,1000,1000,0,2000,1.0
3,90000.0,2,,2,,,0,0,0,0,...,14331,14948,,,,,1000,1000,5000,0.0
4,50000.0,,2,1,,0,,,,,...,28314,28959,29547,2000,2019,1200,,,1000,0.0


What do you notice about the above? You should notice some shortcomings of the fill method we used. There are tradeoffs between each fill method, but the probem above should be obvious.

What do you notice?

If you don't see it right away, take a look back at the definitions of each variable:
https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

Answer: We've imputed averages to categorical variables. For example, in column two, 1=Male and 2=Female. So, imputing the average for that column isn't particularly helpful. 

## Let's save our dataframe with the NaN values. In the next lab we'll continue working on how to handle missing data.

Instead of saving it as a csv, let's introduce another cool library in Python. Pickle is used to save various objects to disk, for later use. Pickle is a popular way of saving models, data, etc.

In [24]:
#import the pickle library

import pickle

In [25]:
#save your the df you added the nans to as a .pickle file

data_with_nans.to_pickle('credit_data_with_nans.pickle')

In [26]:
#read the pickle file back in as a dataframe, to confirm you've saved it correctly

df = pd.read_pickle('credit_data_with_nans.pickle')