In [1]:
%matplotlib inline
import math
import pandas as pd
from pipeline import reading, explore, preprocess, features, classify
from sklearn.cross_validation import train_test_split

In [2]:
fts = ['RevolvingUtilizationOfUnsecuredLines', 
            'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 
            'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 
            'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 
            'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

label = 'SeriousDlqin2yrs'
filename = 'pipeline/data/cs-training.csv'

In [3]:
# read dataset
df = reading.read(filename)

In [4]:
# divide dataset to train and test
xtrain, xtest, ytrain, ytest = train_test_split(df[fts], df[label])
train = xtrain.copy()
train[label] = ytrain
test = xtest.copy()
test[label] = ytest
df = train

In [5]:
# generate statistics and generic exploration histograms
# explore.statistics_csv(df)
# explore.plots_csv(df)

In [6]:
# impute null values with mean value and transform income to log(income)
preprocess.impute_csv(df)
preprocess.transform_feature(df, 'MonthlyIncome', lambda x: math.log(x + 1))

0) RevolvingUtilizationOfUnsecuredLines has null values: False.
1) age has null values: False.
2) NumberOfTime30-59DaysPastDueNotWorse has null values: False.
3) DebtRatio has null values: False.
4) MonthlyIncome has null values: True.
  Filling nulls with mean.
5) NumberOfOpenCreditLinesAndLoans has null values: False.
6) NumberOfTimes90DaysLate has null values: False.
7) NumberRealEstateLoansOrLines has null values: False.
8) NumberOfTime60-89DaysPastDueNotWorse has null values: False.
9) NumberOfDependents has null values: True.
  Filling nulls with mean.
10) SeriousDlqin2yrs has null values: False.


In [7]:
# create a feature of income quartile
features.binning(df, 'f(MonthlyIncome)', 'quantiles', [0, 0.25, 0.5, 0.75, 1])

In [10]:
# models = ['LR', 'KNN', 'DT', 'SVM', 'RF', 'BOO', 'BAG']
models = ['SVM', 'RF', 'BOO', 'BAG']

In [12]:
results = classify.classify(df[fts], df[label], models, 0.05)

SVM
LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
precision: 0.5714285714285714
recall: 0.01845748187211602
f1: 0.03575989782886335
area_under_curve: 0.5087283358462233
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
precision: 0.5929203539823009
recall: 0.044166117336849046
f1: 0.08220858895705521
area_under_curve: 0.5209869332335486
RF
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
   

KeyError: 'BOO'

In [13]:
results

NameError: name 'results' is not defined