In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import sklearn
import pandas as pd

import shelter
from shelter.config import data_dir

# Configure the root logger to emit WARN and above, and create a logger for
# this notebook.
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.pipeline import make_pipeline
import category_encoders

# Machine Learning Model

In this hackathon we'll try to predict the outcome of animals (adoption, etc.) at the Austin Animal Center using intake data (breed, age, etc.).
We'll use the data from [this Kaggle competition](https://www.kaggle.com/c/shelter-animal-outcomes).
At the end of the hackathon you should be able to send your own submission to Kaggle!

To start, read the documentation on [Kaggle](https://www.kaggle.com/c/shelter-animal-outcomes) and download the [data](https://www.kaggle.com/c/shelter-animal-outcomes/data).
Unzip the data in the folder `data/`.
There should be (at least) three files: `sample_submission.csv`, `train.csv` and `test.csv`.

Load the data with the functions from our own `shelter` package:

In [None]:
# Load the Kaggle train/test CSVs from the project's configured data folder.
# `data_dir` is the value imported from `shelter.config`; it is deliberately
# not overridden with a machine-specific absolute path, so the notebook runs
# unchanged on any checkout.
train = shelter.data.load_data(os.path.join(data_dir, 'train.csv'))
test = shelter.data.load_data(os.path.join(data_dir, 'test.csv'))

train.head()

In [None]:
# Bar chart of how often each outcome type occurs in the training data.
outcome_counts = train['outcome_type'].value_counts()
ax = outcome_counts.plot.bar(rot=45)
ax.set_ylabel('# animals')
ax.set_title('Occurrence of outcome types')

Now that you've got the data, try to create a model that is able to predict the `outcome_type` given the intake data.
Our final metric is the `f1-score` over all classes.

> #### Tips
> 
* First create a baseline model that randomly predicts a class given the class occurrences.
* `sklearn` doesn't work with string values, you probably want to look at [`pd.get_dummies()`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html), `sklearn`'s [`LabelEncoder`](http://scikit-learn.org/stable/modules/preprocessing_targets.html) or [`OneHotEncoder`](http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features).
* Try to create a model that predicts only one outcome type (e.g. `Adoption`) before focussing on all outcomes.
* `sklearn` has many models for [supervised learning](http://scikit-learn.org/stable/supervised_learning.html), try to find one that fits the problem.
* Look at [Kaggle Kernels](https://www.kaggle.com/c/shelter-animal-outcomes/kernels) for inspiration.
* You will get better performance with some [feature engineering](https://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/).
* Once you've got your first model working, generate predictions for `test.csv` and submit them on Kaggle.

# Prepare train-test split

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train['outcome_type'],
    test_size=0.20,
    random_state=42,
)


# Baseline Model: stratified dummy classification

In [None]:
# Baseline: sample predictions from the class distribution of the training
# labels. DummyClassifier ignores the input features, so a constant
# placeholder column of the right length is all it needs.
X = np.zeros((len(X_train), 1))
y = y_train

# Seed the sampler so the baseline scores are reproducible across runs.
prediction = DummyClassifier(strategy="stratified", random_state=42)
prediction.fit(X, y)
y_pred = prediction.predict(np.zeros((len(X_test), 1)))

In [None]:
# Macro-averaged F1 score of the current predictions on the hold-out rows.
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

In [None]:
# Accuracy of the current predictions on the hold-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Simple Model:
### Using animal type as a single predictor

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One-hot encode the single predictor on the training rows.
X = pd.get_dummies(X_train[['animal_type']])
y = y_train

# Seed the forest so the scores below are reproducible across runs.
prediction = RandomForestClassifier(random_state=42)
prediction.fit(X, y)

# Encode the hold-out rows separately, then align them to the training
# columns: a category missing from one side would otherwise give the two
# matrices different columns and make predict() fail or mis-map features.
X_holdout = pd.get_dummies(X_test[['animal_type']]).reindex(columns=X.columns, fill_value=0)
y_pred = prediction.predict(X_holdout)

In [None]:
# Macro-averaged F1 score of the current predictions on the hold-out rows.
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

In [None]:
# Accuracy of the current predictions on the hold-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Second Model:
### Clean dataset

In [None]:
import shelter

In [None]:
# Reload the raw CSVs. The `data_dir` already in scope is reused instead of a
# user-specific absolute path, which only resolves on a single machine.
train = shelter.data.load_data(os.path.join(data_dir, 'train.csv'))
test = shelter.data.load_data(os.path.join(data_dir, 'test.csv'))

In [None]:
# Run the labelled data and the Kaggle scoring data through the same
# `shelter.data.add_features` helper.
train_df, score_df = (shelter.data.add_features(frame) for frame in (train, test))

In [None]:
# Preview the first five rows of `train`.
train.head(n=5)

In [None]:
# Identify the N_TOP most frequent breed values; value_counts() sorts by
# frequency, so the first N_TOP index entries are the most common ones.
N_TOP = 20

breed_counts = train_df['breed'].value_counts()
top_breeds = breed_counts.index[:N_TOP]

# Boolean mask: True for rows whose breed is one of the top breeds.
is_top = train_df['breed'].isin(top_breeds)

In [None]:
# Collapse every breed outside the top list into a single 'Other' bucket.
# Assign through one `.loc` call on the frame itself: the chained form
# `train_df['breed'].loc[...] = ...` writes to an intermediate Series, which
# triggers SettingWithCopyWarning and is not guaranteed to update `train_df`
# (under pandas copy-on-write it never does).
train_df.loc[~is_top, 'breed'] = 'Other'

# Model with added features

In [None]:
# Columns handed to the model as predictors.
pred_var = [
    'breed',
    'is_dog',
    'has_name',
    'sex',
    'neutered',
    'hair_type',
    'days_upon_outcome',
]

In [None]:
# Split the selected predictors and the target 80/20 with a fixed seed.
features = train_df[pred_var]
target = train_df['outcome_type']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=42
)

In [None]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

In [None]:
# Fit the one-hot encoder on the training rows only, then encode them.
categorical_cols = ['is_dog', 'has_name', 'sex', 'neutered', 'hair_type', 'breed']
encoder = category_encoders.OneHotEncoder(cols=categorical_cols, handle_unknown='ignore')
enc = encoder.fit(X_train, y_train)
X_train_numeric = enc.transform(X_train)

# Drop every encoded column whose name ends in '_0'
# (presumably the reference level of each encoded variable — confirm against
# the installed category_encoders version).
reference_var = X_train_numeric.columns.str.endswith('_0')
X_train_numeric = X_train_numeric.loc[:, ~reference_var]

In [None]:
# Encode the hold-out rows with the encoder that was fitted on the training rows.
X_test_numeric = enc.transform(X_test)

# Drop the columns whose name ends in '_0', as is done for the training matrix.
reference_var = X_test_numeric.columns.str.endswith('_0')
X_test_numeric = X_test_numeric.loc[:, ~reference_var]

# Preview last: a bare expression only renders when it is the final statement
# of a cell, and here it shows the matrix that is actually passed to the model.
X_test_numeric.head()

In [None]:
# Report the number of missing values per encoded training column.
# A real loop-variable name is used on purpose: `_` is IPython's "last
# output" shortcut, and rebinding it in a loop breaks that shortcut for the
# rest of the session.
null_counts = X_train_numeric.isnull().sum()
for column, n_null in null_counts.items():
    print("The number of null values in:{} == {}".format(column, n_null))

In [None]:
# Replace missing `days_upon_outcome` values with the sentinel 9999 in both
# the training and the hold-out matrix.
for frame in (X_train_numeric, X_test_numeric):
    frame['days_upon_outcome'] = frame['days_upon_outcome'].fillna(9999)

In [None]:
# Single-step pipeline. make_pipeline names the step after the lower-cased
# class name ('randomforestclassifier'), which is the prefix the grid-search
# parameter keys use. The fixed random_state makes the fitted forests, and
# therefore the search result, reproducible across runs.
pipe = make_pipeline(RandomForestClassifier(random_state=42))

In [None]:
# Hyper-parameter grid; keys follow '<pipeline step>__<parameter>'.
# `min_impurity_split` was deprecated in scikit-learn 0.19 and removed in 1.0,
# where it makes the search fail with an invalid-parameter error.
# `min_impurity_decrease` is the supported replacement. It thresholds the
# weighted impurity *decrease* of a split rather than a node's impurity, so
# the candidate values are smaller and include 0.0 (the library default).
param_grid = {
    "randomforestclassifier__n_estimators": [10, 20, 30, 40],
    "randomforestclassifier__max_depth": [None, 6, 8, 10, 4],
    "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20, 15],
    "randomforestclassifier__min_impurity_decrease": [0.0, 0.01, 0.05, 0.1],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` using 5-fold cross-validation.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [None]:
# Run the cross-validated search on the encoded training rows.
grid.fit(X_train_numeric, y=y_train)

In [None]:
# Show the parameter combination the search selected.
best_params = grid.best_params_
print("Best parameters: {}".format(best_params))

In [None]:
# Score the fitted search object on the hold-out rows.
holdout_score = grid.score(X_test_numeric, y_test)
print("Test set score: {:.2f}".format(holdout_score))

In [None]:
# Macro-averaged F1 score of the tuned model on the hold-out rows.
y_pred_grid = grid.predict(X_test_numeric)
f1_score(y_test, y_pred_grid, average='macro')

In [None]:
# Accuracy of the tuned model on the hold-out rows.
# Predict with `grid` here: the `y_pred` left over from earlier cells holds
# the predictions of the single-feature model, not of the tuned one, so
# scoring it would report the wrong model's accuracy.
accuracy_score(y_test, grid.predict(X_test_numeric))

In [None]:
# KAGGLE SUBMISSION

# Build the scoring matrix from test.csv with the same steps that were
# applied to the training data.
test = shelter.data.load_data(os.path.join(data_dir, 'test.csv'))
score_df = shelter.data.add_features(test)

# Reuse `top_breeds` as computed on the training data so both data sets share
# the same breed categories. The assignment goes through a single `.loc` call
# on the frame: the chained `score_df['breed'].loc[...] = ...` form writes to
# an intermediate Series and is not guaranteed to update `score_df`.
is_top = score_df['breed'].isin(top_breeds)
score_df.loc[~is_top, 'breed'] = 'Other'

# Same predictor columns, in the same order, as used for fitting.
score_df = score_df[pred_var]

# Encode with the encoder fitted on the training rows and flag the columns
# whose name ends in '_0' for removal.
score_numeric = enc.transform(score_df)
reference_var = score_numeric.columns.str.endswith('_0')

In [None]:
# Drop the columns whose name ends in '_0'. The mask is computed from the
# frame's current columns so this cell can be re-run safely: a mask built in
# another cell no longer matches the column count once columns were dropped.
score_numeric = score_numeric.loc[:, ~score_numeric.columns.str.endswith('_0')]

In [None]:
# Replace missing `days_upon_outcome` values in the scoring matrix with the
# sentinel 9999.
days = score_numeric['days_upon_outcome']
score_numeric['days_upon_outcome'] = days.fillna(9999)

In [None]:
# Display the column index of the scoring matrix
# (presumably to compare it by eye with the training matrix's columns).
score_numeric.columns

In [None]:
# Display the column index of the encoded training matrix.
X_train_numeric.columns

In [None]:
# Class probabilities for every row of the scoring matrix.
prediction = grid.predict_proba(X=score_numeric)

In [None]:
# One probability column per outcome class, labelled with the class names
# reported by the fitted search object.
submission = pd.DataFrame(prediction, columns=grid.classes_)

# Attach the ids positionally. Copying the raw values avoids index alignment,
# which would silently insert NaN ids wherever the index of `test` does not
# match the fresh 0..n-1 index of `submission`; a row-count mismatch now
# raises instead of passing unnoticed.
# NOTE(review): assumes add_features() preserves the row order of `test` — confirm.
submission['ID'] = test['id'].values

In [None]:
# Select the submission columns in a fixed order, with the id first.
submission_columns = ['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
reordered = submission[submission_columns]

In [None]:
# Write the submission to 'kaggle.csv' without the DataFrame index.
reordered.to_csv(path_or_buf='kaggle.csv', index=False)

In [None]:
# Read the written submission back and preview it as a sanity check.
# The name `kaggle` was never defined, so the former `kaggle.to_csv()` call
# raised NameError and stopped a top-to-bottom run of the notebook.
pd.read_csv('kaggle.csv').head()

In [None]:
# Work on an explicit copy of the selected columns: new columns are added to
# `train_clean` further down, and assigning into a slice of `train` triggers
# SettingWithCopyWarning.
train_clean = train[['outcome_type', 'animal_type', 'sex_upon_outcome', 'age_upon_outcome', 'breed', 'color']].copy()
train_clean.head()

In [None]:
# Split `age_upon_outcome` at the first space into two parts
# (presumably "<number> <unit>" — confirm against the data).
age_parts = train_clean['age_upon_outcome'].str.split(' ', n=1, expand=True)

# Build the new columns from the expanded frame. Tuple-unpacking the `.str`
# accessor of a split result and passing `n` positionally are both rejected by
# current pandas. assign() returns a new frame, so nothing is written into a
# slice of another DataFrame.
train_clean = train_clean.assign(age=age_parts[0], unit=age_parts[1])
train_clean.head()

In [None]:
# Interactive help lookup (IPython `?` syntax) for DummyClassifier.
?DummyClassifier