In [1]:
%matplotlib inline
import math
import pandas as pd
from pipeline import reading, explore, preprocess, features, classify
from sklearn.cross_validation import train_test_split

In [2]:
# Predictor columns selected from the dataset (used for the train/test split
# and passed to the classifiers), one per line in their original order.
fts = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# Target column: the label used in the split and in classification.
label = 'SeriousDlqin2yrs'

# Path of the training CSV (relative, so it depends on the working directory).
filename = 'pipeline/data/cs-training.csv'

In [3]:
# Load the raw dataset from `filename` using the project's `reading.read` helper.
# NOTE(review): presumably returns a pandas DataFrame (later cells index it as
# df[fts] / df[label]) — confirm in pipeline/reading.
df = reading.read(filename)

In [4]:
# Hold out a test set. No test_size is given, so train_test_split applies its
# default split ratio. random_state is pinned so that Restart & Run All
# reproduces the same partition (and therefore the same statistics and model
# scores) on every run; without it the rows are reshuffled differently each time.
xtrain, xtest, ytrain, ytest = train_test_split(df[fts], df[label], random_state=42)

# Re-attach the label to each partition so each one is a self-contained frame.
# .copy() makes the partitions independent objects, so adding the label column
# does not write into a slice of the original frame.
train = xtrain.copy()
train[label] = ytrain
test = xtest.copy()
test[label] = ytest

# From here on `df` refers to the TRAINING partition only; the held-out rows
# stay available as `test`.
df = train

In [5]:
# Print summary statistics for `df` (at this point the training partition):
# the recorded output shows the observation count, the feature list and a
# sample of rows.
explore.statistics_csv(df)
# Generic exploration histograms are currently disabled:
# explore.plots_csv(df)

Observations:
112500

11 features:
    1) RevolvingUtilizationOfUnsecuredLines
    2) age
    3) NumberOfTime30-59DaysPastDueNotWorse
    4) DebtRatio
    5) MonthlyIncome
    6) NumberOfOpenCreditLinesAndLoans
    7) NumberOfTimes90DaysLate
    8) NumberRealEstateLoansOrLines
    9) NumberOfTime60-89DaysPastDueNotWorse
    10) NumberOfDependents
    11) SeriousDlqin2yrs


Sample:
        RevolvingUtilizationOfUnsecuredLines  age  \
115676                              0.001719   53   
115318                              0.001905   88   
132731                              0.068388   53   
20881                               0.060674   63   
130428                              0.176931   55   

        NumberOfTime30-59DaysPastDueNotWorse    DebtRatio  MonthlyIncome  \
115676                                     0     0.312033        10437.0   
115318                                     0     0.000000            NaN   
132731                                     0     1.397557         180

In [6]:
# Fill missing values, then log-transform income.
# Per its printed output, impute_csv reports for each column whether it has
# nulls and fills them with the mean (here: MonthlyIncome, NumberOfDependents).
# NOTE(review): both calls discard their return value, so they presumably
# modify `df` in place — confirm in pipeline/preprocess.
preprocess.impute_csv(df)
# log(x + 1) rather than log(x), so an income of 0 maps to 0 instead of raising
# a math domain error.
# NOTE(review): the following cell refers to a column 'f(MonthlyIncome)', so the
# transformed values are presumably stored under that new name — confirm.
preprocess.transform_feature(df, 'MonthlyIncome', lambda x: math.log(x + 1))

0) RevolvingUtilizationOfUnsecuredLines has null values: False.
1) age has null values: False.
2) NumberOfTime30-59DaysPastDueNotWorse has null values: False.
3) DebtRatio has null values: False.
4) MonthlyIncome has null values: True.
  Filling nulls with mean.
5) NumberOfOpenCreditLinesAndLoans has null values: False.
6) NumberOfTimes90DaysLate has null values: False.
7) NumberRealEstateLoansOrLines has null values: False.
8) NumberOfTime60-89DaysPastDueNotWorse has null values: False.
9) NumberOfDependents has null values: True.
  Filling nulls with mean.
10) SeriousDlqin2yrs has null values: False.


In [7]:
# Create an income-quartile feature: bin the 'f(MonthlyIncome)' column using
# the quantile cut points 0, 0.25, 0.5, 0.75 and 1.
# NOTE(review): the name of the resulting column is not visible here, and the
# classify call further down only passes df[fts], which contains neither
# 'f(MonthlyIncome)' nor a binned column — confirm whether the engineered
# features are meant to reach the models.
features.binning(df, 'f(MonthlyIncome)', 'quantiles', [0, 0.25, 0.5, 0.75, 1])

In [8]:
# Short codes of the classifiers to evaluate, in the order they are passed on.
# 'LR' is logistic regression (the recorded output prints LogisticRegression
# under the 'LR' heading).
# NOTE(review): the remaining codes presumably stand for k-nearest neighbours,
# decision tree, SVM, random forest, boosting and bagging — confirm in
# pipeline/classify.
models = [
    'LR',
    'KNN',
    'DT',
    'SVM',
    'RF',
    'BOO',
    'BAG',
]

In [9]:
# Run the project's classification routine on the original feature columns and
# the label of `df`, for every model code listed in `models`.
# NOTE(review): the meaning of the 0.5 argument is not visible here — confirm
# in pipeline/classify and consider giving it a named constant.
# NOTE(review): the recorded output ends in KeyboardInterrupt, i.e. this cell
# was interrupted before finishing; the run is expensive on the full training
# partition.
classify.classify(df[fts], df[label], models, 0.5)

LR
LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solve

KeyboardInterrupt: 