In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pipeline as pl

# Name of the target column; it is excluded from `features` and passed to the
# imputation and model-fitting helpers later in the notebook.
OUTCOME_VAR = 'SeriousDlqin2yrs'
# Second argument handed to pl.read_data for every CSV load
# (presumably the position of the index column -- confirm in pipeline.read_data).
INDEX_COL = 0

# Read in data and generate summary statistics

In [2]:
# Load the training set. INDEX_COL (0) is forwarded as the second argument;
# presumably it selects the CSV's first column as the index -- confirm in pipeline.read_data.
df = pl.read_data('cs-training.csv', INDEX_COL)

In [3]:
# Every column label of the training frame, in frame order.
cols = df.columns.tolist()
# Model inputs: all columns apart from the target.
features = [name for name in cols if not name == OUTCOME_VAR]

In [4]:
# Display the feature list to confirm the target column was excluded.
features

['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']

In [5]:
# Preview the first rows to sanity-check column names and values.
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [6]:
# Print summary statistics for the training frame. In the recorded output,
# MonthlyIncome's count (120,269) is below the 150,000 rows, i.e. the column
# has missing values that need to be filled before modelling.
pl.explore_data(df)

### Summary Statistics ###
       SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines            age  \
count     150000.000000                         150000.000000  150000.000000   
mean           0.066840                              6.048438      52.295207   
std            0.249746                            249.755371      14.771866   
min            0.000000                              0.000000       0.000000   
25%            0.000000                              0.029867      41.000000   
50%            0.000000                              0.154181      52.000000   
75%            0.000000                              0.559046      63.000000   
max            1.000000                          50708.000000     109.000000   

       NumberOfTime30-59DaysPastDueNotWorse      DebtRatio   MonthlyIncome  \
count                         150000.000000  150000.000000   120269.000000   
mean                               0.421033     353.005076     6670.221237   
std               

# Fill null values with conditional mean

In [7]:
# Impute missing values with the mean computed within each OUTCOME_VAR group.
# The return value is discarded, so this presumably mutates df in place -- confirm.
# NOTE(review): imputing features from the target leaks label information into
# the training features, and the test set is later filled with
# conditional_mean=False, so the two sets are imputed differently -- confirm
# this is intended.
# NOTE(review): cols_to_fill=False presumably means "fill every column that has
# nulls" -- confirm in pipeline.fill_na_mean.
pl.fill_na_mean(df, cols_to_fill=False, conditional_mean=True, group_col=OUTCOME_VAR)

# Generate histograms

In [8]:
# One histogram per feature; the trailing 10 is presumably the bin count --
# confirm in pipeline.make_hist.
for feature_name in features:
    pl.make_hist(df, feature_name, 10)

Plotting RevolvingUtilizationOfUnsecuredLines
Plotting age
Plotting NumberOfTime30-59DaysPastDueNotWorse
Plotting DebtRatio
Plotting MonthlyIncome
Plotting NumberOfOpenCreditLinesAndLoans
Plotting NumberOfTimes90DaysLate
Plotting NumberRealEstateLoansOrLines
Plotting NumberOfTime60-89DaysPastDueNotWorse
Plotting NumberOfDependents


# Test functions to convert to discrete and binary variables

In [9]:
# Bucket NumberOfDependents into three labelled bins. The next cell reads
# 'NumberOfDependents_disc', so the result is presumably stored under that name
# on df in place (the return value is discarded) -- confirm in pipeline.discretize.
pl.discretize(df, 'NumberOfDependents', bins=3, labels=['low','med','high'])

In [10]:
# Convert the discretized column into binary variables (presumably one
# indicator per label -- confirm in pipeline.binarize).
# NOTE(review): df is reassigned from itself, so re-running this cell on its own
# operates on the already-binarized frame; run from a fresh kernel to reproduce.
df = pl.binarize(df, 'NumberOfDependents_disc')

# Fit logistic regression model with training data

In [20]:
# Fit a logistic regression on the columns in `features`. That list was built
# before the discretize/binarize cells ran, so any columns they added are not
# used by the model.
# NOTE(review): the reported accuracy (~0.934) is about what always predicting 0
# would score, since the mean of SeriousDlqin2yrs in the summary output is
# 0.0668 -- consider an imbalance-aware metric (precision/recall, ROC AUC).
# NOTE(review): the execution counter jumps from In [10] to In [20] at this
# cell; verify the notebook reproduces under Restart Kernel -> Run All.
scores, lgr = pl.log_reg(df, features, OUTCOME_VAR)

Average accuracy for this model:
0.933746690875


# Read in and fill test data

In [21]:
# Load the test set with the same reader and index-column argument as the training set.
test_df = pl.read_data('cs-test.csv', INDEX_COL)

In [22]:
# Fill missing values in the test set with the unconditional mean
# (conditional_mean=False); the return value is discarded, so this presumably
# mutates test_df in place -- confirm.
# NOTE(review): the training set was filled conditional on OUTCOME_VAR, so train
# and test are imputed differently; this call also presumably uses the test
# set's own means rather than the training means -- confirm in pipeline.fill_na_mean.
pl.fill_na_mean(test_df, cols_to_fill=False, conditional_mean=False)

# Predict outcomes for test data

In [23]:
# Score the test set with the fitted model. The return value is discarded; the
# next cell reads 'predictions.csv', which this call presumably writes as a
# side effect -- confirm in pipeline.predict_log_reg.
pl.predict_log_reg(test_df, features, lgr)

In [24]:
# Reload the predictions from disk (presumably the file written by
# pl.predict_log_reg in the previous cell -- confirm).
predicted = pl.read_data('predictions.csv', INDEX_COL)

In [25]:
# Summary statistics of the scored test set. In the recorded output
# SeriousDlqin2yrs has count 0 (all NaN), presumably because the test file
# carries no labels.
pl.explore_data(predicted)

### Summary Statistics ###
       SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines            age  \
count                 0                         101503.000000  101503.000000   
mean                NaN                              5.310000      52.405436   
std                 NaN                            196.156039      14.779756   
min                 NaN                              0.000000      21.000000   
25%                 NaN                              0.030131      41.000000   
50%                 NaN                              0.152586      52.000000   
75%                 NaN                              0.564225      63.000000   
max                 NaN                          21821.000000     104.000000   

       NumberOfTime30-59DaysPastDueNotWorse      DebtRatio   MonthlyIncome  \
count                         101503.000000  101503.000000   101503.000000   
mean                               0.453770     344.475020     6855.035590   
std               

In [26]:
# Count how many test rows received each predicted label.
predicted['predicted'].value_counts()

0    101127
1       376
Name: predicted, dtype: int64

In [28]:
# Share of test rows predicted as label 1. Computed from `predicted` instead of
# hand-copied counts so the figure stays correct whenever the model or data change.
(predicted['predicted'] == 1).mean()

0.0037043240101277794