In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import log_loss, accuracy_score

In [3]:
c_train = pd.read_csv("clean_train.csv")
c_test = pd.read_csv("clean_test.csv")

In [4]:
def label_encode(category):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(category)
    return integer_encoded

In [5]:
def one_hot(data):
    data = data.reshape(len(data), 1)
    onehot_encoder = OneHotEncoder(sparse=False)
    data = onehot_encoder.fit_transform(data)
    return data

In [6]:
c_train['SexuponOutcome'] = c_train.IsAltered + c_train.Gender
c_test['SexuponOutcome'] = c_test.IsAltered + c_test.Gender

In [7]:
# Select features you want in model
wanted_features = ['AnimalType', 'SexuponOutcome', 'HasName', 'Year',
                  'Month', 'Day', 'Hour', 'DayofWeek',
                  'BreedType', 'CleanAge']

In [8]:
test = c_test[wanted_features]

In [9]:
# Encode all categorical data
test.AnimalType = label_encode(test.AnimalType)
#test.Gender = label_encode(test.Gender)
test.HasName = label_encode(test.HasName)
#test.IsAltered = label_encode(test.IsAltered)
test.BreedType = label_encode(test.BreedType)
test.SexuponOutcome = label_encode(test.SexuponOutcome)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [10]:
test.CleanAge = np.log(test.CleanAge+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
# Seperate labels from features.  Delete OutcomeSubtype
labels = c_train.OutcomeType
features = c_train[wanted_features]

In [12]:
features.head()

Unnamed: 0,AnimalType,SexuponOutcome,HasName,Year,Month,Day,Hour,DayofWeek,BreedType,CleanAge
0,Dog,AlteredMale,True,2014,2,12,18,2,Mixed Breed,1.0
1,Cat,AlteredFemale,True,2013,10,13,12,6,Mixed Breed,1.0
2,Dog,AlteredMale,True,2015,1,31,12,5,Mixed Breed,2.0
3,Cat,IntactMale,False,2014,7,11,19,4,Mixed Breed,0.057692
4,Dog,AlteredMale,False,2013,11,15,12,4,Full Breed,2.0


In [13]:
labels = label_encode(labels)

In [14]:
features.AnimalType = label_encode(features.AnimalType)
#features.Gender = label_encode(features.Gender)
features.HasName = label_encode(features.HasName)
#features.IsAltered = label_encode(features.IsAltered)
features.BreedType = label_encode(features.BreedType)
features.SexuponOutcome = label_encode(features.SexuponOutcome)
features.CleanAge = np.log(features.CleanAge + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [15]:
# Split data for validation
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [16]:
y_test = one_hot(y_test)

In [19]:
n_est = 100

logreg = LogisticRegression(C = 10, random_state = 1123)
bag = BaggingClassifier(base_estimator = logreg, n_estimators = n_est, random_state = 1123)
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb),
                           ("BC", bag)])

params = {'SKB__k':[8],
         #'LR__C': [10]
         }

split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')
    
gs.fit(features,labels)
lrclf=gs.best_estimator_
print(gs.best_params_)
print(gs.best_score_)
print(lrclf)

{'SKB__k': 8}
-0.903381994704
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=8, score_func=<function f_classif at 0x00000212A7CA6598>)), ('BC', BaggingClassifier(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2...stimators=100, n_jobs=1, oob_score=False,
         random_state=1123, verbose=0, warm_start=False))])


In [24]:
rf = RandomForestClassifier(random_state=1122)
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb),
                           ("RF", rf)])

params = {'SKB__k':[9],
          'RF__n_estimators': [500],
          'RF__min_samples_leaf': [10],
         }

split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')
    
gs.fit(features,labels)
rfclf=gs.best_estimator_
print(gs.best_params_)
print(gs.best_score_)
print(rfclf)

{'RF__min_samples_leaf': 10, 'RF__n_estimators': 500, 'SKB__k': 9}
-0.773469857033
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=9, score_func=<function f_classif at 0x00000212A7CA6598>)), ('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=...n_jobs=1,
            oob_score=False, random_state=1122, verbose=0,
            warm_start=False))])


In [20]:
xgbc = XGBClassifier(random_state = 1123)
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb),
                           ("XGB", xgbc)])

params = {'SKB__k':[10],
          'XGB__max_depth': [7],
          'XGB__min_child_weight':[4],
          #'XGB__gamma':[0.1]
         }

split = StratifiedShuffleSplit(labels, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv = split, scoring = 'neg_log_loss')
    
gs.fit(features,labels)
xgclf=gs.best_estimator_
print(gs.best_params_)
print(gs.best_score_)
print(xgclf)

{'SKB__k': 10, 'XGB__max_depth': 7, 'XGB__min_child_weight': 4}
-0.754060719518
Pipeline(memory=None,
     steps=[('SKB', SelectKBest(k=10, score_func=<function f_classif at 0x00000212A7CA6598>)), ('XGB', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=4, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=1123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])


In [27]:
predict1 = lrclf.predict_proba(test)
predict2 = xgclf.predict_proba(test)
predict3 = rfclf.predict_proba(test)
final_prediction = (predict3 + predict2)/2

In [28]:
output = pd.DataFrame(final_prediction,columns=['Adoption','Died','Euthanasia','Return_to_owner','Transfer'])
output.index.names = ['ID']
output.index += 1
output['ID'] = output.index
output.to_csv('predictions.csv', columns = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer'], index = False)