In [1]:
%matplotlib inline
import math
import pandas as pd
from pipeline import reading, explore, preprocess, features, classify
from sklearn.cross_validation import train_test_split

In [2]:
# Path of the training CSV, relative to the notebook's working directory.
filename = 'pipeline/data/cs-training.csv'

# Name of the target column.
label = 'SeriousDlqin2yrs'

# Predictor columns used for the train/test split and for model fitting,
# listed one per line so additions and removals diff cleanly.
fts = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

In [3]:
# Load the dataset stored at `filename` into `df` using the project-local
# `reading` helper.
# NOTE(review): presumably returns a pandas DataFrame (it is indexed with
# column lists further down) — confirm against pipeline.reading.
df = reading.read(filename)

In [4]:
# Split the dataset into train and test partitions.
# A fixed random_state makes the shuffle, and therefore every downstream
# metric, reproducible across kernel restarts.
# NOTE(review): sklearn.cross_validation (used by the import cell) was removed
# in scikit-learn 0.20; import train_test_split from sklearn.model_selection
# when upgrading.
xtrain, xtest, ytrain, ytest = train_test_split(df[fts], df[label], random_state=0)

# Re-attach the label column so each partition is a self-contained frame.
train = xtrain.copy()
train[label] = ytrain
test = xtest.copy()
test[label] = ytest

# From here on `df` is the training partition only; `test` is held out.
# NOTE(review): rebinding `df` makes this cell non-idempotent — running it a
# second time splits the already-reduced training frame again. Re-run the
# loading cell first if this cell has to be repeated.
# NOTE(review): the preprocessing cells that follow operate on `df` only, so
# `test` keeps its raw values — confirm before evaluating on it.
df = train

In [5]:
# (disabled) generate statistics and generic exploration histograms; uncomment the calls below to run them
# explore.statistics_csv(df)
# explore.plots_csv(df)

In [6]:
# Fill missing values, then log-transform monthly income.
# The helper prints, per column, whether nulls were found and reports filling
# them with the mean. Its return value is not used, so it apparently mutates
# `df` in place — TODO confirm against pipeline.preprocess.
preprocess.impute_csv(df)

# log1p(x) is log(1 + x) computed without the precision loss that the explicit
# `x + 1` incurs for values near zero; an income of 0 maps to 0.
# NOTE(review): the next cell reads 'f(MonthlyIncome)', so the helper presumably
# writes the transformed values to that column — verify.
preprocess.transform_feature(df, 'MonthlyIncome', lambda income: math.log1p(income))

0) RevolvingUtilizationOfUnsecuredLines has null values: False.
1) age has null values: False.
2) NumberOfTime30-59DaysPastDueNotWorse has null values: False.
3) DebtRatio has null values: False.
4) MonthlyIncome has null values: True.
  Filling nulls with mean.
5) NumberOfOpenCreditLinesAndLoans has null values: False.
6) NumberOfTimes90DaysLate has null values: False.
7) NumberRealEstateLoansOrLines has null values: False.
8) NumberOfTime60-89DaysPastDueNotWorse has null values: False.
9) NumberOfDependents has null values: True.
  Filling nulls with mean.
10) SeriousDlqin2yrs has null values: False.


In [7]:
# Derive an income-quartile feature by binning the transformed income column
# at the 0 / 25 / 50 / 75 / 100 % quantile boundaries.
income_column = 'f(MonthlyIncome)'
quartile_cuts = [0, 0.25, 0.5, 0.75, 1]
features.binning(df, income_column, 'quantiles', quartile_cuts)

In [8]:
# Model keys handed to classify.classify. Only the two fast ones are enabled;
# alternative selections tried earlier:
#   ['LR', 'KNN', 'DT', 'SVM', 'RF', 'BOO', 'BAG']
#   ['SVM', 'RF', 'BOO', 'BAG']
models = [
    'LR',   # fitted as LogisticRegression in the run below
    'KNN',  # fitted as KNeighborsClassifier in the run below
]

In [9]:
# Run the model search on the training partition.
# The positional arguments 3, 0.05 and 'auc' are passed through unchanged.
# NOTE(review): the log prints "Iteration 0..2" per parameter set, so 3 looks
# like an iteration/fold count and 'auc' the scoring metric; the meaning of
# 0.05 is not visible here — confirm against pipeline.classify.
# NOTE(review): `models` is rebound from the list of model keys to the second
# return value, so re-running this cell without first re-running the
# model-selection cell passes a different object in.
# NOTE(review): `fts` holds only the original columns, so columns added by the
# transform/binning cells are not part of the training matrix — confirm intent.
train_features = df[fts]
train_labels = df[label]
results, models = classify.classify(train_features, train_labels, models, 3, 0.05, 'auc')

LR
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Iteration 0.
Iteration 1.
Iteration 2.
	New best model.
Finished: {'C': 0.1, 'penalty': 'l1'} model.

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Iteration 0.
Iteration 1.
Iteration 2.
Finished: {'C': 0.1, 'penalty': 'l2'} model.

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Iteration 0.
Iter

  result = result.union(other)


Iteration 1.
Iteration 2.
	New best model.
Finished: {'weights': 'uniform', 'algorithm': 'auto', 'n_neighbors': 5} model.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')
Iteration 0.
Iteration 1.
Iteration 2.
Finished: {'weights': 'distance', 'algorithm': 'auto', 'n_neighbors': 5} model.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
Iteration 0.
Iteration 1.
Iteration 2.
	New best model.
Finished: {'weights': 'uniform', 'algorithm': 'auto', 'n_neighbors': 10} model.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')
Iteration 0.
Iteration 1.
Iteration 2.
Finished: {'weights': 'distance', 'algorithm': 'auto', 'n_neighbors': 10} model.

Fin

  result = result.union(other)


In [10]:
# Display the summary table returned by classify.classify (one row per model
# key, with time / avg_auc / params columns).
# NOTE(review): in the saved output every value in this table is empty —
# check that classify actually populates it.
results


Unnamed: 0,time,avg_auc,params
LR,,,
KNN,,,
