In [38]:
!ls

bank-classification.csv  experiments.ipynb  README.md  sample_submission.csv


In [64]:
!head sample_submission.csv

id,y
2,0.114428578297
4,0.114428578297
5,0.114428578297
7,0.114428578297
10,0.114428578297
12,0.114428578297
14,0.114428578297
16,0.114428578297
18,0.114428578297


In [202]:
import pandas as pd
df = pd.read_csv('bank-classification.csv')
df = df.sample(frac=1, replace=False, random_state=42)
print(df[:5])

          id  birth_date          job  marital    education  default housing  \
32884  32885  1952-07-15   technician  married  high.school       no      no   
3169    3170  1953-06-30      unknown  married      unknown  unknown     yes   
32206  32207  1976-09-12  blue-collar  married     basic.9y       no      no   
9403    9404  1972-04-10       admin.  married  high.school       no      no   
14020  14021  1981-05-25    housemaid  married  high.school       no     yes   

      loan contact_date    contact  campaign  pdays  previous     poutcome  \
32884  yes   2009-05-04   cellular         1    999         1      failure   
3169    no   2008-05-08  telephone         2    999         0  nonexistent   
32206   no   2009-05-08   cellular         1    999         1      failure   
9403    no   2008-06-27  telephone         4    999         0  nonexistent   
14020   no   2008-07-25   cellular         2    999         0  nonexistent   

             y  
32884  unknown  
3169   unknown  

In [207]:
train_df = df.loc[df['y'] != 'unknown']
train_df['y'] = train_df['y'].map({'no': 0, 'yes': 1})
test_df = df.loc[df['y'] == 'unknown']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [209]:
train_df.get(['id', 'y']).groupby(['y']).count()

Unnamed: 0_level_0,id
y,Unnamed: 1_level_1
0,18419
1,2380


In [210]:
train_df.get(['id', 'job']).groupby(['job']).count()

Unnamed: 0_level_0,id
job,Unnamed: 1_level_1
admin.,5284
blue-collar,4708
entrepreneur,745
housemaid,522
management,1485
retired,874
self-employed,699
services,1995
student,444
technician,3365


In [50]:
train_df.get(['id', 'education']).groupby(['education']).count()

Unnamed: 0_level_0,id
education,Unnamed: 1_level_1
basic.4y,2154
basic.6y,1170
basic.9y,3016
high.school,4781
illiterate,7
professional.course,2592
university.degree,6192
unknown,887


In [51]:
train_df.get(['id', 'marital']).groupby(['marital']).count()

Unnamed: 0_level_0,id
marital,Unnamed: 1_level_1
divorced,2347
married,12512
single,5896
unknown,44


In [296]:
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder


def get_year(date):
    year_str = date[:4]
    year = int(year_str)
    return year

def get_categorical(column):
    le = LabelEncoder()
    le.fit(column)
    labeled_column = le.transform(column)
    return to_categorical(labeled_column).T

def get_input_data(df):
    ages = np.array([[get_year(contact) - get_year(birth) for contact, birth in
                     zip(df['contact_date'], df['birth_date'])]])
    
    jobs = get_categorical(df['job'])
    education = get_categorical(df['education'])
    marital = get_categorical(df['marital'])
    loan = get_categorical(df['loan'])
    default = get_categorical(df['default'])
    housing = get_categorical(df['housing'])
    contact_years = [get_year(contact) for contact in df['contact_date']]
    contact = get_categorical(contact_years)
    poutcome = get_categorical(df['poutcome'])
    campaign = np.array([df['campaign']])
    previous = np.array([df['previous']])
    contact_type = get_categorical(df['contact'])
        
    return np.concatenate([ages, jobs, education, marital, loan, default, housing, contact, poutcome,
                           campaign, previous, contact_type], axis=0).T, np.array(df['y'])

In [297]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier

#classifier = RandomForestClassifier(class_weight='balanced')
classifier = AdaBoostClassifier(base_estimator=base)
X, y = get_input_data(train_df)
print(X, y)

[[ 33.   0.   1. ...,   1.   1.   0.]
 [ 36.   1.   0. ...,   0.   0.   1.]
 [ 58.   0.   0. ...,   0.   1.   0.]
 ..., 
 [ 58.   0.   0. ...,   0.   1.   0.]
 [ 37.   0.   0. ...,   0.   0.   1.]
 [ 29.   1.   0. ...,   0.   1.   0.]] [0 0 0 ..., 0 0 0]


In [280]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [298]:
classifier.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [299]:
from sklearn.metrics import roc_auc_score

print(classifier.score(X_val, y_val))

y_pred = classifier.predict_proba(X_val)[::, 1]
print(y_pred)
print(y_val)
print(roc_auc_score(y_val, y_pred))

0.849519230769
[ 0.05359697  0.0307094   0.070911   ...,  0.01184437  0.02760193
  0.07859799]
[0 0 0 ..., 0 0 0]
0.70191333725


In [294]:
from pandas import DataFrame

def return_submission(classifier, path, test_df):
    X_test, _ = get_input_data(test_df)
    classes = classifier.classes_
    predictions = classifier.predict_proba(X_test)
    results_pd = DataFrame({'id': test_df['id'], 'y': predictions[::, 1]})
    results_pd = results_pd.sort_values('id')
    results_pd.to_csv(path, index=False)

return_submission(classifier, 'my_submission.csv', test_df)

In [295]:
!head my_submission.csv

id,y
2,0.11032912546221481
4,0.16127864972142605
5,0.16133373870247456
7,0.3172618292260777
10,0.5121092606583669
12,0.5121092606583669
14,0.10295645195520924
16,0.1587737655716813
18,0.0170387173745394
