In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
import seaborn as sns
import sklearn
import xgboost as xgb
from datetime import datetime
from sklearn import linear_model, feature_extraction, preprocessing, cross_validation, grid_search, metrics, svm, tree, ensemble, neighbors

import utils

%matplotlib inline

In [2]:
# Read the training and test CSVs; each file uses a different id column as index.
train_data = pd.read_csv("train.csv", index_col='AnimalID')
test_data = pd.read_csv("test.csv", index_col='ID')

# Stack both frames so later vocabulary extraction sees every row.
all_data = pd.concat([train_data, test_data])

# Report (rows, columns) for each source frame.
for frame in (train_data, test_data):
    print(frame.shape)

(26729, 9)
(11456, 7)


In [3]:
# Vocabulary of colour tokens: every distinct 'Color' value is split on '/'
# and any ' Mix' substring is removed from the resulting pieces.
full_color = {
    piece.replace(' Mix', '')
    for value in all_data['Color'].unique()
    for piece in value.split('/')
}

In [4]:
# Vocabulary of breed tokens: every distinct 'Breed' value is split on '/'
# and any ' Mix' substring is removed from the resulting pieces.
full_breed = {
    piece.replace(' Mix', '')
    for value in all_data['Breed'].unique()
    for piece in value.split('/')
}

In [5]:
# Rebuild the combined frame from the raw inputs so this cell can be re-run
# on its own, then apply the utils.prepare_* steps in a fixed order.
all_data = pd.concat([train_data, test_data])
all_df = (
    all_data
    .pipe(utils.prepare_age)
    .pipe(utils.prepare_name)
    .pipe(utils.prepare_date)
    .pipe(utils.prepare_breed, full_breed)
    .pipe(utils.prepare_color, full_color)
)

# Preview the first three prepared rows.
all_df.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Year,Month,WeekDay,...,Color Gray Tiger,Color Liver Tick,Color Blue Tabby,Color Cream,Color Blue,Color Orange Tabby,Color Tortie Point,Color Fawn,Color Calico Point,Color Brown
A671945,"(330, 730)",Dog,2014-02-12 18:22:00,1,,Return_to_owner,Neutered Male,2014,2,3,...,0,0,0,0,0,0,0,0,0,1
A656520,"(330, 730)",Cat,2013-10-13 12:44:00,1,Suffering,Euthanasia,Spayed Female,2013,10,7,...,0,0,0,0,0,0,0,0,0,0
A686464,"(330, 730)",Dog,2015-01-31 12:28:00,1,Foster,Adoption,Neutered Male,2015,1,6,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Keep only the rows that carry an outcome label, then list the distinct labels.
has_outcome = all_df['OutcomeType'].notnull()
train = all_df[has_outcome]
train['OutcomeType'].unique()

In [None]:
def plot_age_counts(frame, outcome):
    """Bar-plot the number of rows per age group for a single outcome type.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the 'OutcomeType' and 'AgeuponOutcome' columns.
    outcome : str
        The 'OutcomeType' value whose rows are counted.

    Returns
    -------
    The matplotlib Axes the bars were drawn on.
    """
    age = frame[['OutcomeType', 'AgeuponOutcome']]
    selected = age[age['OutcomeType'] == outcome]
    # Group on both keys so the bar labels keep the (outcome, age group) form.
    counts = selected.groupby(['OutcomeType', 'AgeuponOutcome']).size()

    fig, ax = plt.subplots()
    counts.plot.bar(ax=ax)
    ax.set_title('Rows per age group: %s' % outcome)
    ax.set_ylabel('Count')
    return ax


# Figure defaults must be set before the figures are created.
plt.rcParams['font.size'] = 12.0
plt.rcParams['figure.figsize'] = 20, 10

# One figure per outcome type, in the same order the notebook originally
# plotted them (previously five copy-pasted cells differing only in the label).
for outcome in ['Euthanasia', 'Adoption', 'Transfer', 'Return_to_owner', 'Died']:
    plot_age_counts(train, outcome)
    plt.show()

In [None]:
# Overall class balance: number of labelled rows per outcome type.
plt.rcParams.update({'font.size': 12.0, 'figure.figsize': (20, 10)})

age = train[['OutcomeType', 'AgeuponOutcome']]
grouped = age.groupby(['OutcomeType'])
outcome_counts = grouped.size()
outcome_counts.plot.bar()

In [6]:
# Columns handed to utils.flatten, and columns stripped before modelling.
categorical_features = ['AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Year', 'Month', 'WeekDay']
remove_features = ['DateTime', 'OutcomeSubtype', 'OutcomeType']

all_X = utils.flatten(all_df, categorical_features)

# Rows with an outcome label are the training rows; the rest are to be predicted.
has_outcome = all_X['OutcomeType'].notnull()
train_X = all_X[has_outcome]
test_X = all_X[~has_outcome]

# Integer-encode the target while the 'OutcomeType' column is still present.
le = preprocessing.LabelEncoder()
train_y = le.fit_transform(train_X['OutcomeType'].astype('category'))

# Strip the non-feature columns from both parts (train first, then test).
train_X, test_X = (utils.remove(part, remove_features) for part in (train_X, test_X))

for part in (train_X, test_X):
    print(part.shape)

train_X[:3]

(26729, 338)
(11456, 338)


Unnamed: 0,Name,BreedMix,BreedCount,Breed Sphynx,Breed Dachshund Longhair,Breed Bichon Frise,Breed Old English Sheepdog,Breed Jindo,Breed Whippet,Breed Manchester Terrier,...,10,11,12,1_right,2_right,3_right,4_right,5_right,6_right,7_right
A671945,1,1,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A656520,1,1,1,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A686464,1,1,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Hold out 40% of the labelled rows for evaluation.  The fixed random_state
# makes the split, and every score computed from it, reproducible across
# kernel restarts; without it each run evaluates on a different sample.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer versions the same function lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_X, train_y, test_size=0.4, random_state=42)

# Degree-2 polynomial expansion.  It is only constructed here and is NOT
# applied to the hold-out split; a later cell uses it for the final fit.
poly = preprocessing.PolynomialFeatures(2)

In [12]:
# Candidate estimators with their hyper-parameter grids.  Previously both
# `parameters` and `model` were assigned and then immediately overwritten,
# which left the XGBoost grid as silent dead code; the selection is now explicit.
candidates = {
    'xgboost': (
        xgb.XGBClassifier(nthread=3),
        {
            'n_estimators': [100],
            'objective': ['multi:softprob'],
            'reg_alpha': [10],
            'reg_lambda': [10],
            'learning_rate': [0.3],
            'max_depth': [6],
        },
    ),
    # Empty grid: the search evaluates the estimator's own settings only.
    'knn': (
        neighbors.KNeighborsClassifier(n_jobs=3, n_neighbors=100),
        {},
    ),
}
active_candidate = 'knn'  # same estimator/grid the cell ended up using before
model, parameters = candidates[active_candidate]

print("# Tuning hyper-parameters", X_train.shape)
print()

# Cross-validated search on the development split only.
clf = grid_search.GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# NOTE(review): `grid_scores_` belongs to the legacy sklearn.grid_search API
# imported by this notebook; newer scikit-learn exposes `cv_results_` instead.
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# Score the refitted best estimator on the held-out rows.
y_true, y_pred = y_test, clf.predict(X_test)
print(metrics.classification_report(y_true, y_pred))
print()

# Tuning hyper-parameters (16037, 338)

Best parameters set found on development set:

{}

Grid scores on development set:

0.621 (+/-0.007) for {}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.58      0.91      0.71      4277
          1       0.00      0.00      0.00        84
          2       0.00      0.00      0.00       632
          3       0.47      0.29      0.36      1953
          4       0.77      0.58      0.66      3746

avg / total       0.59      0.62      0.58     10692




  'precision', 'predicted', average, warn_for)


In [None]:
# Final model, fitted on every labelled row.
# NOTE(review): this differs from the configuration scored in the grid-search
# cell (different estimator, polynomial features), so the hold-out report
# does not describe this model.
model = xgb.XGBClassifier(objective='multi:softprob', learning_rate=0.3, n_estimators=400)

# Fit the expansion on the training matrix and only *transform* the test
# matrix with it.  The results are bound to new names so that re-running this
# cell does not expand an already-expanded matrix.
# NOTE(review): degree 2 on 338 input columns yields 57,630 output columns;
# check that the dense result fits in memory before running.
train_X_poly = poly.fit_transform(train_X)
test_X_poly = poly.transform(test_X)

model.fit(train_X_poly, train_y)

# Per-class probabilities for the unlabelled rows (one column per class).
predict = model.predict_proba(test_X_poly)

In [None]:
# Preview the predicted class probabilities for the first five test rows.
predict[:5, :]

In [None]:
# Start from the sample submission so the id column and layout are preserved.
output = pd.read_csv("sample_submission.csv")

# Probability columns follow the LabelEncoder's class order, i.e. the outcome
# names sorted alphabetically.  'Died' used to be hard-coded to 0, which threw
# away the model's estimate for that class and left rows that no longer sum
# to 1; it now receives its own probability column like every other class.
class_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
for position, column in enumerate(class_columns):
    output[column] = predict[:, position]

output.to_csv("actual_submission.csv", index=False)