In [2]:
import pandas as pd
import numpy as np
import pickle

import matplotlib as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import log_loss, accuracy_score



In [3]:
c_train = pd.read_csv("clean_train.csv")
c_test = pd.read_csv("clean_test.csv")

In [4]:
def label_encode(category):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(category)
    return integer_encoded

In [5]:
def one_hot(data):
    data = data.reshape(len(data), 1)
    onehot_encoder = OneHotEncoder(sparse=False)
    data = onehot_encoder.fit_transform(data)
    return data

In [6]:
c_train['SexuponOutcome'] = c_train.IsAltered + c_train.Gender
c_test['SexuponOutcome'] = c_test.IsAltered + c_test.Gender

In [7]:
# Select features you want in model
wanted_features = ['AnimalType', 'SexuponOutcome', 'HasName', 'Year',
                  'Month', 'Day', 'Hour', 'DayofWeek',
                  'BreedType', 'CleanAge']

In [8]:
test = c_test[wanted_features]

In [9]:
# Encode all categorical data
test.AnimalType = label_encode(test.AnimalType)
#test.Gender = label_encode(test.Gender)
test.HasName = label_encode(test.HasName)
#test.IsAltered = label_encode(test.IsAltered)
test.BreedType = label_encode(test.BreedType)
test.SexuponOutcome = label_encode(test.SexuponOutcome)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [10]:
test.CleanAge = np.log(test.CleanAge+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
# Seperate labels from features.  Delete OutcomeSubtype
labels = c_train.OutcomeType
features = c_train[wanted_features]

In [12]:
features.head()

Unnamed: 0,AnimalType,SexuponOutcome,HasName,Year,Month,Day,Hour,DayofWeek,BreedType,CleanAge
0,Dog,AlteredMale,True,2014,2,12,18,2,Mixed Breed,1.0
1,Cat,AlteredFemale,True,2013,10,13,12,6,Mixed Breed,1.0
2,Dog,AlteredMale,True,2015,1,31,12,5,Mixed Breed,2.0
3,Cat,IntactMale,False,2014,7,11,19,4,Mixed Breed,0.057692
4,Dog,AlteredMale,False,2013,11,15,12,4,Full Breed,2.0


In [13]:
labels = label_encode(labels)

In [14]:
features.AnimalType = label_encode(features.AnimalType)
#features.Gender = label_encode(features.Gender)
features.HasName = label_encode(features.HasName)
#features.IsAltered = label_encode(features.IsAltered)
features.BreedType = label_encode(features.BreedType)
features.SexuponOutcome = label_encode(features.SexuponOutcome)
features.CleanAge = np.log(features.CleanAge + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [18]:
# Run model if it exists
try:
    classifier_f = open("bag_log_reg.pickle", "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()
    lrclf = classifier.best_estimator_
    print(classifier.best_params_)
    print(lrclf)
    
except OSError:
    logreg = LogisticRegression(random_state = 1123)
    skb = SelectKBest(f_classif)

    pipeline = Pipeline(steps=[("SKB",skb),
                                   ("LR", logreg)])

    params = {'SKB__k':[8],
              'LR__C': [10]}

    split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

    gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')

    gs.fit(features,labels)
    lrclf=gs.best_estimator_
    print(gs.best_params_)
    print(gs.best_score_)
    print(lrclf)

    save_classifier = open("bag_log_reg.pickle","wb")
    pickle.dump(gs, save_classifier)
    save_classifier.close()

Loaded saved pickle
{'LR__C': 10, 'SKB__k': 8}
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=8, score_func=<function f_classif at 0x000001E735044A60>)), ('LR', LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [16]:
classifier_f = open("ba_log_reg.pickle", "rb")

FileNotFoundError: [Errno 2] No such file or directory: 'ba_log_reg.pickle'

In [70]:
#print(classifier.best_estimator_.named_steps['SKB'].get_support())
print(classifier.best_estimator_.named_steps['LR'].coef_)

[[ -2.41656881e-01  -1.21501281e+00   7.86560500e-01  -5.52105409e-04
    1.43339094e-01   1.32109171e-01  -1.48153667e-01  -1.07357871e+00]
 [ -3.77037842e-01   5.62389194e-01  -2.03506306e-01  -1.87372245e-03
   -1.31677389e-01   2.26124909e-02   6.27904312e-02  -3.07182344e-01]
 [ -8.44070956e-02   5.39012570e-01  -1.07754509e+00  -1.99095205e-03
   -9.89555753e-03  -7.38913216e-02   2.40291817e-01   1.21473149e+00]
 [  1.50205858e+00   1.29925760e-01   2.08042556e+00  -3.15470151e-03
    5.26583388e-02  -6.13907502e-02   1.70084003e-01   9.93103668e-01]
 [ -4.58192365e-01   6.39040096e-01  -9.09309178e-01   9.36946186e-04
   -1.45016381e-01  -7.16371158e-02   5.85664773e-04  -1.75310116e-01]]


In [20]:
try:
    classifier_f = open("rf.pickle", "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()
    
    rfclf = classifier.best_estimator_
    print(classifier.best_params_)
    print(rfclf)
except OSError:
    rf = RandomForestClassifier(random_state=1122)
    skb = SelectKBest(f_classif)

    pipeline = Pipeline(steps=[("SKB",skb),
                               ("RF", rf)])

    params = {'SKB__k':[9],
              'RF__n_estimators': [500],
              'RF__min_samples_leaf': [10],
             }

    split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

    gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')

    gs.fit(features,labels)
    rfclf=gs.best_estimator_
    print(gs.best_params_)
    print(gs.best_score_)
    print(rfclf)
    
    save_classifier = open("rf.pickle","wb")
    pickle.dump(gs, save_classifier)
    save_classifier.close()

{'RF__min_samples_leaf': 10, 'RF__n_estimators': 500, 'SKB__k': 9}
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=9, score_func=<function f_classif at 0x000001E735044A60>)), ('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=...n_jobs=1,
            oob_score=False, random_state=1122, verbose=0,
            warm_start=False))])


In [67]:
#print(classifier.best_estimator_.named_steps['SKB'].get_support())
print(classifier.best_estimator_.named_steps['RF'].feature_importances_ )

[ 0.05166747  0.29776381  0.10191834  0.02387635  0.05247086  0.1473534
  0.06065115  0.00744333  0.25685528]


In [21]:
try:
    classifier_f = open("xgb.pickle", "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()
    
    xgclf = classifier.best_estimator_
    print(classifier.best_params_)
    print(xgclf)
except OSError:
    xgbc = XGBClassifier(random_state = 1123)
    skb = SelectKBest(f_classif)

    pipeline = Pipeline(steps=[("SKB",skb),
                               ("XGB", xgbc)])

    params = {'SKB__k':[10],
              'XGB__max_depth': [7],
              'XGB__min_child_weight':[4],
              #'XGB__gamma':[0.1]
             }

    split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

    gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')

    gs.fit(features,labels)
    xgclf=gs.best_estimator_
    print(gs.best_params_)
    print(gs.best_score_)
    print(xgclf)
    
    save_classifier = open("xgb.pickle","wb")
    pickle.dump(gs, save_classifier)
    save_classifier.close()

{'SKB__k': 10, 'XGB__max_depth': 7, 'XGB__min_child_weight': 4}
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=10, score_func=<function f_classif at 0x000001E735044A60>)), ('XGB', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=4, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=1123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])


In [22]:
print(classifier.best_estimator_.named_steps['XGB'].feature_importances_ )

[ 0.04797134  0.05231729  0.03224629  0.06382908  0.13432638  0.19633083
  0.17075275  0.11203264  0.01542647  0.17476694]


In [23]:
results = classifier.cv_results_
print(results)

{'mean_fit_time': array([ 8.83131413]), 'std_fit_time': array([ 0.27982472]), 'mean_score_time': array([ 0.28456826]), 'std_score_time': array([ 0.09573818]), 'param_SKB__k': masked_array(data = [10],
             mask = [False],
       fill_value = ?)
, 'param_XGB__max_depth': masked_array(data = [7],
             mask = [False],
       fill_value = ?)
, 'param_XGB__min_child_weight': masked_array(data = [4],
             mask = [False],
       fill_value = ?)
, 'params': [{'SKB__k': 10, 'XGB__max_depth': 7, 'XGB__min_child_weight': 4}], 'split0_test_score': array([-0.76066293]), 'split1_test_score': array([-0.74394925]), 'split2_test_score': array([-0.74782004]), 'split3_test_score': array([-0.7546434]), 'split4_test_score': array([-0.76771505]), 'split5_test_score': array([-0.74557594]), 'split6_test_score': array([-0.74298598]), 'split7_test_score': array([-0.76266297]), 'split8_test_score': array([-0.75722809]), 'split9_test_score': array([-0.75736355]), 'mean_test_score': array([

In [73]:
predict1 = lrclf.predict_proba(test)
predict2 = xgclf.predict_proba(test)
predict3 = rfclf.predict_proba(test)
final_prediction = (predict3 + predict2)/2

In [74]:
output = pd.DataFrame(final_prediction,columns=['Adoption','Died','Euthanasia','Return_to_owner','Transfer'])
output.index.names = ['ID']
output.index += 1
output['ID'] = output.index
output.to_csv('predictions.csv', columns = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer'], index = False)