In [72]:
%matplotlib inline
import math
import pandas as pd
from pipeline import reading, explore, preprocess, features, classify
from sklearn.cross_validation import train_test_split

In [64]:
fts = ['RevolvingUtilizationOfUnsecuredLines', 
            'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 
            'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 
            'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 
            'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

label = 'SeriousDlqin2yrs'
filename = 'pipeline/data/cs-training.csv'

In [65]:
# read dataset
df = reading.read(filename)

In [66]:
# divide dataset to train and test
xtrain, xtest, ytrain, ytest = train_test_split(df[fts], df[label])
train = xtrain.copy()
train[label] = ytrain
test = xtest.copy()
test[label] = ytest
df = train

In [67]:
# generate statistics and generic exploration histograms
explore.statistics_csv(df)
# explore.plots_csv(df)

Observations:
112500

11 features:
    1) RevolvingUtilizationOfUnsecuredLines
    2) age
    3) NumberOfTime30-59DaysPastDueNotWorse
    4) DebtRatio
    5) MonthlyIncome
    6) NumberOfOpenCreditLinesAndLoans
    7) NumberOfTimes90DaysLate
    8) NumberRealEstateLoansOrLines
    9) NumberOfTime60-89DaysPastDueNotWorse
    10) NumberOfDependents
    11) SeriousDlqin2yrs


Sample:
        RevolvingUtilizationOfUnsecuredLines  age  \
43177                               0.273508   56   
124336                              0.986217   52   
91082                               0.649800   46   
88907                               0.628781   34   
139257                              0.025765   63   

        NumberOfTime30-59DaysPastDueNotWorse    DebtRatio  MonthlyIncome  \
43177                                      0     1.219853         3938.0   
124336                                     4  3177.000000            NaN   
91082                                      0     0.336819         458

In [68]:
# impute null values with mean value and transform income to log(income)
preprocess.impute_csv(df)
preprocess.transform_feature(df, 'MonthlyIncome', lambda x: math.log(x + 1))

0) RevolvingUtilizationOfUnsecuredLines has null values: False.
1) age has null values: False.
2) NumberOfTime30-59DaysPastDueNotWorse has null values: False.
3) DebtRatio has null values: False.
4) MonthlyIncome has null values: True.
  Filling nulls with mean.
5) NumberOfOpenCreditLinesAndLoans has null values: False.
6) NumberOfTimes90DaysLate has null values: False.
7) NumberRealEstateLoansOrLines has null values: False.
8) NumberOfTime60-89DaysPastDueNotWorse has null values: False.
9) NumberOfDependents has null values: True.
  Filling nulls with mean.
10) SeriousDlqin2yrs has null values: False.


In [69]:
# create a feature of income quartile
features.binning(df, 'f(MonthlyIncome)', 'quantiles', [0, 0.25, 0.5, 0.75, 1])

In [70]:
models = ['LR', 'KNN', 'DT', 'SVM', 'RF', 'BOO', 'BAG']

In [73]:
classify.classify(df[fts], df[label], models, 0.5)

TypeError: classify() takes 3 positional arguments but 4 were given