In [42]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Build the working frame in a single pipeline: read the training CSV,
# discard the 'Unnamed: 0' column, then keep only the rows that contain
# no NaN values.
data = (
    pd.read_csv('cs-training.csv')
    .drop('Unnamed: 0', axis=1)
    .dropna()
)

# Preview the first rows of the cleaned frame
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## Initial exploration

### Data dictionary

* **SeriousDlqin2yrs**: Person experienced 90 days past due delinquency or worse

* **RevolvingUtilizationOfUnsecuredLines**: Total balance on credit cards and personal lines of credit (excluding real estate and installment debt such as car loans) divided by the sum of credit limits

* **age**: Age of borrower in years

* **NumberOfTime30-59DaysPastDueNotWorse**: Number of times borrower has been 30-59 days past due but no worse in the last 2 years.

* **DebtRatio**: Monthly debt payments, alimony, and living costs divided by monthly gross income

* **MonthlyIncome**: Monthly income

* **NumberOfOpenCreditLinesAndLoans**: Number of open loans (installment loans such as a car loan or mortgage) and lines of credit (e.g. credit cards)

* **NumberOfTimes90DaysLate**: Number of times borrower has been 90 days or more past due.

* **NumberRealEstateLoansOrLines**: Number of mortgage and real estate loans including home equity lines of credit

* **NumberOfTime60-89DaysPastDueNotWorse**: Number of times borrower has been 60-89 days past due but no worse in the last 2 years.

* **NumberOfDependents**: Number of dependents in family excluding themselves (spouse, children etc.)

In [43]:
# Summarise the dataset: number of rows, number of feature columns, and the
# class balance of the target column `SeriousDlqin2yrs`.

# Number of samples (rows)
n_samples = data.shape[0]

# Number of features: every column except the single target column
n_features = data.shape[1] - 1

# Positive cases (target == 1); summing the boolean mask counts the matches
# without building a filtered copy of the frame
n_positive = int((data.SeriousDlqin2yrs == 1).sum())

# Negative cases (target == 0)
n_negative = int((data.SeriousDlqin2yrs == 0).sum())

# Share of positive cases as a percentage; float() forces true division
# even where `/` on two ints would truncate (Python 2)
positive_rate = 100 * (n_positive / float(n_samples))

# Print the results. A single parenthesised argument makes these lines valid
# under both the Python 2 print statement and the Python 3 print function.
print("Total number of samples: {}".format(n_samples))
print("Number of features: {}".format(n_features))
print("Number of positives: {}".format(n_positive))
print("Number of negatives: {}".format(n_negative))
print("Positive rate: {:.2f}%".format(positive_rate))

Total number of samples: 120269
Number of features: 10
Number of positives: 8357
Number of negatives: 111912
Positive rate: 6.95%


In [44]:
# Split the frame into model inputs (X_all) and the label to predict (y_all).

# Select the target by name rather than by position, so the split stays
# correct if the column order of `data` ever changes
target_col = 'SeriousDlqin2yrs'

# Every other column is a feature; the original column order is preserved
feature_cols = [col for col in data.columns if col != target_col]

# Show the list of columns. A single parenthesised argument makes these lines
# valid under both the Python 2 print statement and the Python 3 print function.
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = data[feature_cols]
y_all = data[target_col]

Feature columns:
['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

Target column: SeriousDlqin2yrs
