In [1]:
import numpy as np
import pandas as pd
import sklearn
from datetime import datetime
from sklearn import linear_model, feature_extraction, preprocessing, cross_validation, grid_search, metrics, svm, tree, ensemble

In [2]:
train_data = pd.read_csv("train.csv", index_col='AnimalID')
test_data = pd.read_csv("test.csv", index_col='ID')
print(train_data.shape)
print(test_data.shape)

(26729, 9)
(11456, 7)


In [3]:
def flatten(df, fields):
    for field in fields:
        df = df.join(pd.get_dummies(df[field]), lsuffix='_left', rsuffix='_right')
    return remove(df, fields)

def remove(df, fields):
    return df.drop(fields, axis=1)

def convert_date(dt):
    d = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
    return d.year, d.month, d.isoweekday()

In [86]:
all_data = pd.concat([train_data, test_data])

In [54]:
ll = list(map(lambda x: x.split('/'), all_data['Color'].unique()))
l = [item for sublist in ll for item in sublist]
full_color = set(map(lambda x: x.replace(' Mix', ''), l))

In [55]:
ll = list(map(lambda x: x.split('/'), all_data['Breed'].unique()))
l = [item for sublist in ll for item in sublist]
full_breed = set(map(lambda x: x.replace(' Mix', ''), l))

In [108]:
def prepare_breed(df: pd.DataFrame):
    df['BreedMix'] = df['Breed'].str.contains('Mix').apply(lambda x: int(x))
    df['Breed'] = df['Breed'].apply(lambda x: x.replace(' Mix', '').split('/'))
    df['BreedCount'] = df['Breed'].apply(lambda x: len(x))
    tmp = pd.DataFrame()
    for breed in full_breed:
        tmp[breed] = df['Breed'].apply(lambda x: int(breed in x))   
    df = remove(df, ['Breed'])  
    return pd.concat([df, tmp], axis=1)

In [109]:
def to_days(x):
    s = x.split(' ')
    r = 0
    if s[1] in ('year', 'years'):
        r = int(s[0]) * 365
    elif s[1] in ('month', 'months'):
        r = int(s[0]) * 30
    elif s[1] in ('week', 'weeks'):
        r = int(s[0]) * 7
    else:
        r = int(s[0])

    return r


def prepare_age(df: pd.DataFrame):
    df['AgeuponOutcome'].fillna('0 months', inplace=True)
    df['AgeuponOutcome'] = df['AgeuponOutcome'].apply(to_days)
    df['AgeuponOutcome'] = pd.cut(df['AgeuponOutcome'], bins=[0, 90, 180, 365, 730, 1460, 2190, 10000])
    return df

In [110]:
def has_name(x):
    return 0 if x == 'Unknown' else 1


def prepare_name(df: pd.DataFrame):
    df['Name'].fillna('Unknown', inplace=True)
    df['Name'] = df['Name'].apply(has_name)
    return df

In [111]:
def prepare_date(df: pd.DataFrame):
    df["Year"], df["Month"], df["WeekDay"] = zip(*df["DateTime"].map(convert_date))
    return df

In [112]:
all_data = pd.concat([train_data, test_data])
all_df = prepare_age(all_data)
all_df = prepare_name(all_df)
all_df = prepare_date(all_df)
all_df = prepare_breed(all_df)
all_df[:3]

Unnamed: 0,AgeuponOutcome,AnimalType,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Year,Month,...,Rex,Yorkshire,American Bulldog,Spanish Mastiff,Scottish Terrier,English Pointer,Port Water Dog,West Highland,Miniature Pinscher,Italian Greyhound
A671945,"(180, 365]",Dog,Brown/White,2014-02-12 18:22:00,1,,Return_to_owner,Neutered Male,2014,2,...,0,0,0,0,0,0,0,0,0,0
A656520,"(180, 365]",Cat,Cream Tabby,2013-10-13 12:44:00,1,Suffering,Euthanasia,Spayed Female,2013,10,...,0,0,0,0,0,0,0,0,0,0
A686464,"(365, 730]",Dog,Blue/White,2015-01-31 12:28:00,1,Foster,Adoption,Neutered Male,2015,1,...,0,0,0,0,0,0,0,0,0,0


In [113]:
categorical_features = ['AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Year', 'Month', 'WeekDay']
remove_features = ['DateTime', 'OutcomeSubtype', 'OutcomeType', 'Color']
all_X = flatten(all_df, categorical_features)

train_X = all_X[all_X['OutcomeType'].notnull()]
test_X = all_X[all_X['OutcomeType'].isnull()]

le = preprocessing.LabelEncoder()
train_y = le.fit_transform(train_X['OutcomeType'].astype('category'))

train_X = remove(train_X, remove_features)
test_X = remove(test_X, remove_features)


print(train_X.shape)
print(test_X.shape)
train_X[:3]

(26729, 277)
(11456, 277)


Unnamed: 0,Name,BreedMix,BreedCount,Bernese Mountain Dog,Munchkin Shorthair,Boerboel,German Wirehaired Pointer,Angora,Canaan Dog,Samoyed,...,10,11,12,1_right,2_right,3_right,4_right,5_right,6_right,7_right
A671945,1,1,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A656520,1,1,1,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A686464,1,1,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [114]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_X, train_y, test_size=0.3, random_state=248)

In [115]:
parameters = {'n_estimators': [400, 500, 550]}
model = ensemble.RandomForestClassifier(n_jobs=3)

print("# Tuning hyper-parameters")
print()

clf = grid_search.GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(metrics.classification_report(y_true, y_pred))
print()

# Tuning hyper-parameters

Best parameters set found on development set:

{'n_estimators': 550}

Grid scores on development set:

0.636 (+/-0.003) for {'n_estimators': 400}
0.635 (+/-0.008) for {'n_estimators': 500}
0.637 (+/-0.007) for {'n_estimators': 550}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.67      0.79      0.72      3278
          1       0.38      0.05      0.09        60
          2       0.44      0.14      0.21       449
          3       0.48      0.43      0.45      1448
          4       0.71      0.67      0.69      2784

avg / total       0.63      0.64      0.63      8019




In [116]:
clf.best_estimator_.score(X_test, y_test)

0.64347175458286565

In [94]:
# predict = le.inverse_transform(clf.best_estimator_.predict_proba(test_X))
predict = clf.best_estimator_.predict_proba(test_X)

In [95]:
predict[:5]

array([[ 0.104     ,  0.        ,  0.04      ,  0.404     ,  0.452     ],
       [ 0.754     ,  0.        ,  0.004     ,  0.132     ,  0.11      ],
       [ 0.42986667,  0.        ,  0.002     ,  0.034     ,  0.53413333],
       [ 0.242     ,  0.002     ,  0.036     ,  0.2       ,  0.52      ],
       [ 0.47      ,  0.        ,  0.006     ,  0.268     ,  0.256     ]])

In [68]:
output = pd.read_csv("sample_submission.csv")
output['Adoption'], output['Died'], output['Euthanasia'], output['Return_to_owner'], output['Transfer'] = predict[:,0], predict[:,1], predict[:,2], predict[:,3], predict[:,4]
output.to_csv("actual_submission.csv", index=False)