In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score
from sklearn.model_selection import GridSearchCV

%matplotlib inline
plt.rc('figure', figsize=(20.0, 10.0))

In [None]:
# Root folder of the competition data; the train/ and test/ CSV files are read relative to it.
INPUT_DIR = "../input"

In [None]:
# Show what is available in the input folder before reading anything from it.
print(os.listdir(INPUT_DIR))
# Labelled rows: the AdoptionSpeed column of this frame is split off as the target later on.
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train', 'train.csv'))
# Rows that the final submission is predicted for.
X_test = pd.read_csv(os.path.join(INPUT_DIR, 'test', 'test.csv'))

## Data description (copied from [competition description](https://www.kaggle.com/c/petfinder-adoption-prediction/data))

<i>
In this competition you will predict the speed at which a pet is adopted, based on the pet’s listing on PetFinder. Sometimes a profile represents a group of pets. In this case, the speed of adoption is determined by the speed at which all of the pets are adopted. The data included text, tabular, and image data. See below for details. 
This is a Kernels-only competition. At the end of the competition, test data will be replaced in their entirety with new data of approximately the same size, and your kernels will be rerun on the new data.

### File descriptions
- train.csv - Tabular/text data for the training set
- test.csv - Tabular/text data for the test set
- sample_submission.csv - A sample submission file in the correct format
- breed_labels.csv - Contains Type, and BreedName for each BreedID. Type 1 is dog, 2 is cat.
- color_labels.csv - Contains ColorName for each ColorID
- state_labels.csv - Contains StateName for each StateID

### Data Fields
- PetID - Unique hash ID of pet profile
- AdoptionSpeed - Categorical speed of adoption. Lower is faster. This is the value to predict. See below section for more info.
- Type - Type of animal (1 = Dog, 2 = Cat)
- Name - Name of pet (Empty if not named)
- Age - Age of pet when listed, in months
- Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
- Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
- Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
- Color1 - Color 1 of pet (Refer to ColorLabels dictionary)
- Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
- Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
- MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
- FurLength - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)
- Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
- Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
- Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
- Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
- Quantity - Number of pets represented in profile
- Fee - Adoption fee (0 = Free)
- State - State location in Malaysia (Refer to StateLabels dictionary)
- RescuerID - Unique hash ID of rescuer
- VideoAmt - Total uploaded videos for this pet
- PhotoAmt - Total uploaded photos for this pet
- Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.
- AdoptionSpeed - Contestants are required to predict this value. The value is determined by how quickly, if at all, a pet is adopted. The values are determined in the following way: 
    0 - Pet was adopted on the same day as it was listed. 
    1 - Pet was adopted between 1 and 7 days (1st week) after being listed. 
    2 - Pet was adopted between 8 and 30 days (1st month) after being listed. 
    3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed. 
    4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

### Images

For pets that have photos, they will be named in the format of PetID-ImageNumber.jpg. Image 1 is the profile (default) photo set for the pet. For privacy purposes, faces, phone numbers and emails have been masked.

### Image Metadata
We have run the images through Google's Vision API, providing analysis on Face Annotation, Label Annotation, Text Annotation and Image Properties. You may optionally utilize this supplementary information for your image analysis.

File name format is PetID-ImageNumber.json.

Some properties will not exist in JSON file if not present, i.e. Face Annotation. Text Annotation has been simplified to just 1 entry of the entire text description (instead of the detailed JSON result broken down by individual characters and words). Phone numbers and emails are already anonymized in Text Annotation.

Google Vision API reference: https://cloud.google.com/vision/docs/reference/rest/v1/images/annotate

### Sentiment Data
We have run each pet profile's description through Google's Natural Language API, providing analysis on sentiment and key entities. You may optionally utilize this supplementary information for your pet description analysis. There are some descriptions that the API could not analyze. As such, there are fewer sentiment files than there are rows in the dataset.

File name format is PetID.json.

Google Natural Language API reference: https://cloud.google.com/natural-language/docs/basics

What will change in the 2nd stage of the competition?
In the second stage of the competition, we will re-run your selected Kernels. The following files will be swapped with new data:

test.zip including test.csv and sample_submission.csv
test_images.zip
test_metadata.zip
test_sentiment.zip

In stage 2, all data will be replaced with approximately the same amount of different data. The stage 1 test data will not be available when kernels are rerun in stage 2.
</i>

## Preprocessing

### What we'll do here for each column
- `PetID`: Keep for reference, drop for prediction
- `Type`: One-hot encode into `isCat` or `isDog` field
- `Name`: Create a field for if name exists or not, drop *Name* column
- `Age`: Leave as is
- `Breed1`: Keep `N` most frequent categories, one-hot encode
- `Breed2`: Keep `N` most frequent categories, one-hot encode
- `Gender`: One-hot encode
- `Color1`, `Color2`, `Color3`: One-hot encode *Color1*, drop others
- `MaturitySize`: One-hot encode, accounting for zero
- `FurLength`: One-hot encode, accounting for zero
- `Vaccinated`: One-hot encode, accounting for 3 (not sure)
- `Dewormed`: One-hot encode, accounting for 3 (not sure)
- `Sterilized`: One-hot encode, accounting for 3 (not sure)
- `Health`: One-hot encode, accounting for 0 (not specified)
- `Quantity`: Leave as is
- `Fee`: Leave as is
- `State`: Keep `N` most frequent categories, one-hot encode
- `VideoAmt`, `PhotoAmt`: Leave as is
- `Description`: Leave as is

### Quick look at the distributions

In [None]:
# One histogram per numeric column of the training frame.
train_df.hist()
# Re-pack the subplot grid so titles and tick labels do not overlap.
plt.tight_layout()

### Define transforms
We'll use [scikit-learn pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) to define our data preprocessing transforms. We'll use a few custom transformers for the purpose:
- `DataFrameColumnMapper`: Maps DataFrame columns to a new column (similar to `DataFrameMapper` from `sklearn-pandas`)
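- `CategoricalToOneHotEncoder`: One-hot encodes categorical columns
- `CategoricalTruncator`: Keeps the `N` most frequent values of a column and maps all other values to `Other`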
- `DataFrameColumnDropper`: Drops given columns
- `DataFrameToValuesTransformer`: Maps DataFrame to NumPy array

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameColumnMapper(BaseEstimator, TransformerMixin):
    """Apply an element-wise function to one DataFrame column.

    Parameters
    ----------
    column_name : str
        Column whose values are passed, one at a time, to ``mapping_func``.
    mapping_func : callable
        Function applied to every value of ``column_name``.
    new_column_name : str, optional
        Name of the column that receives the mapped values. ``None`` (default)
        means the source column is overwritten.
    drop_original : bool, default True
        When the result is written to a different column, also remove the
        source column from the output.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(self, column_name, mapping_func, new_column_name=None, drop_original=True):
        # Constructor arguments are stored untouched (scikit-learn estimator convention).
        # The ``None`` default of ``new_column_name`` is resolved in ``transform`` so that
        # a later ``set_params(column_name=...)`` cannot leave a stale derived name behind.
        self.column_name = column_name
        self.mapping_func = mapping_func
        self.new_column_name = new_column_name
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the mapped column; ``X`` itself is not modified."""
        target_name = self.new_column_name if self.new_column_name is not None else self.column_name
        mapped = X[self.column_name].apply(self.mapping_func)
        # ``assign`` already returns a new frame, so no explicit copy is needed.
        Y = X.assign(**{target_name: mapped})
        if target_name != self.column_name and self.drop_original:
            Y = Y.drop(self.column_name, axis=1)
        return Y

In [None]:
class CategoricalToOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode DataFrame columns using the categories seen during ``fit``.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. ``None`` (default) selects every non-numeric column
        of the frame passed to ``fit``.

    Attributes set by ``fit``
    -------------------------
    columns_ : list of str
        The columns that are actually encoded.
    mappings : dict
        Per column, ``{'int_to_cat': {code: category}, 'cat_to_int': {category: code}}``.

    For every encoded column ``c`` and every fitted category ``k`` the output
    contains a 0/1 column named ``"c_k"``; the original column is removed.
    Values that were not seen during ``fit`` (including missing values) get
    zeros in all indicator columns instead of raising ``KeyError``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Record the distinct categories of each encoded column."""
        # Resolve the column list into a fitted attribute rather than overwriting the
        # constructor parameter, so the estimator can still be cloned / re-fitted.
        if self.columns is None:
            columns = list(X.select_dtypes(exclude='number').columns)
        else:
            columns = list(self.columns)

        # Remember which category belongs to which integer code; the same assignment
        # must be reused when transforming validation and test data.
        mappings = {}
        for col in columns:
            _, uniques = X.loc[:, col].factorize()  # uniques: distinct non-missing values
            int_to_cat = dict(enumerate(uniques))
            cat_to_int = {category: code for code, category in int_to_cat.items()}
            mappings[col] = {'int_to_cat': int_to_cat, 'cat_to_int': cat_to_int}

        self.columns_ = columns
        self.mappings = mappings
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column replaced by indicator columns."""
        Y = X.copy()
        for col in self.columns_:
            cat_to_int = self.mappings[col]['cat_to_int']
            # -1 never equals a fitted code, so unseen values yield all-zero indicators.
            codes = Y[col].map(lambda value: cat_to_int.get(value, -1))
            one_hot_columns = {
                '{}_{}'.format(col, category): (codes == code).astype('int64')
                for category, code in cat_to_int.items()
            }
            Y = Y.assign(**one_hot_columns).drop(col, axis=1)
        return Y

In [None]:
class CategoricalTruncator(BaseEstimator, TransformerMixin):
    """Keep the most frequent values of one column and replace the rest with ``'Other'``.

    ``fit`` stores the ``n_values_to_keep`` most frequent values of ``column_name``
    in ``self.values``; ``transform`` maps every value outside that list to the
    string ``'Other'``.
    """

    def __init__(self, column_name, n_values_to_keep=5):
        self.column_name = column_name
        self.n_values_to_keep = n_values_to_keep
        self.values = None

    def fit(self, X, y=None):
        """Remember which values to keep so later data sets are truncated identically."""
        frequencies = X[self.column_name].value_counts()  # ordered most frequent first
        self.values = list(frequencies[:self.n_values_to_keep].keys())
        return self

    def transform(self, X):
        """Return a new DataFrame in which rare values of the column read ``'Other'``."""
        def keep_or_other(value):
            if value in self.values:
                return value
            return 'Other'

        truncated = X.transform({self.column_name: keep_or_other})
        return X.assign(**{self.column_name: truncated})

In [None]:
class DataFrameColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the given columns from a DataFrame."""

    def __init__(self, column_names):
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without ``column_names``; ``X`` is left unchanged."""
        remaining = X.copy()
        return remaining.drop(columns=self.column_names)

In [None]:
class DataFrameToValuesTransformer(BaseEstimator, TransformerMixin):
    """Convert a DataFrame to a NumPy array with a fixed column order.

    The column order seen by ``fit`` is stored in ``attributes_`` and re-applied
    in ``transform``, so validation and test frames produce arrays whose columns
    line up with the training array.
    """

    def __init__(self):
        self.attributes_ = None

    def fit(self, X, y=None):
        """Record the column names of ``X`` in their current order."""
        self.attributes_ = list(X.columns)
        return self

    def transform(self, X):
        """Select the fitted columns, in fitted order, and return their values."""
        ordered = X.loc[:, self.attributes_]
        return ordered.values

### Split training data into training and validation set

In [None]:
from sklearn.model_selection import train_test_split

def to_features_and_labels(df):
    """Split a labelled frame into features and target.

    Returns a ``(X, y)`` tuple where ``y`` is the ``AdoptionSpeed`` column as a
    NumPy array and ``X`` is a new DataFrame holding all remaining columns.
    ``df`` itself is not modified.
    """
    label_column = 'AdoptionSpeed'
    labels = df[label_column].values
    features = df.drop(columns=[label_column])
    return features, labels

X_train_val, y_train_val = to_features_and_labels(train_df) # All data with labels, to be split into train and val

# Split the available training data into training set and validation set (used for estimating the generalization error)
# 20% of the rows are held out; stratify keeps the label distribution the same in both parts
# and random_state pins the shuffle so the split is identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=42,
                                                  stratify=y_train_val)
X_train.head()

In [None]:
# Sanity check: feature and label row counts must match within each split.
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)

### Define preprocessing pipeline

In [None]:
# Column dtypes and non-null counts of the raw training features.
X_train.info()

In [None]:
def has_field_transformer(column_name, new_column_name=None, is_missing_func=pd.notna) -> TransformerMixin:
    """Build a transformer that turns a column into a 0/1 flag.

    Each value of ``column_name`` is passed to ``is_missing_func`` and the result
    is converted to ``int``. With the default ``pd.notna`` the flag is 1 when a
    value is present and 0 when it is missing (so, despite its name, the
    parameter is a "value is present" test by default).

    The flag is written to ``new_column_name`` (default: ``column_name``); when
    the two names differ the source column is dropped.
    """
    target_name = new_column_name if new_column_name is not None else column_name
    return DataFrameColumnMapper(column_name=column_name,
                                 # Built-in int(): the ``np.int`` alias no longer exists in NumPy >= 1.24.
                                 mapping_func=lambda value: int(is_missing_func(value)),
                                 drop_original=True,
                                 new_column_name=target_name)

def value_matches_transformer(column_name, new_column_name=None, matches=pd.notna) -> TransformerMixin:
    """Build a transformer that adds a 0/1 flag derived from a column.

    Each value of ``column_name`` is passed to the predicate ``matches`` and the
    result is converted to ``int``. The flag is written to ``new_column_name``
    (default: ``column_name``). Unlike ``has_field_transformer`` the source
    column is kept in the output.
    """
    target_name = new_column_name if new_column_name is not None else column_name
    return DataFrameColumnMapper(column_name=column_name,
                                 # Built-in int(): the ``np.int`` alias no longer exists in NumPy >= 1.24.
                                 mapping_func=lambda value: int(matches(value)),
                                 drop_original=False,
                                 new_column_name=target_name)

def map_categories(column_name, mapping_dict) -> TransformerMixin:
    """Build a transformer that replaces each value of a column via ``mapping_dict``.

    The column is overwritten in place (same name). A value that is not a key
    of ``mapping_dict`` is looked up with ``mapping_dict[value]`` and therefore
    fails during ``transform``.
    """
    def lookup(value):
        return mapping_dict[value]

    return DataFrameColumnMapper(column_name=column_name, mapping_func=lookup)

def onehot_encode(columns) -> TransformerMixin:
    """Build a ``CategoricalToOneHotEncoder`` for the given column names."""
    encoder = CategoricalToOneHotEncoder(columns=columns)
    return encoder

def truncate_categorical(column_name, n_values_to_keep=10):
    """Build a ``CategoricalTruncator`` that keeps the ``n_values_to_keep`` most frequent values."""
    truncator = CategoricalTruncator(
        column_name=column_name,
        n_values_to_keep=n_values_to_keep,
    )
    return truncator

# Columns that the preprocessing pipeline replaces with one 0/1 indicator column per category.
# NOTE(review): MaturitySize is not listed here, so it stays a plain numeric code — confirm this is intended.
ONEHOT_ENCODED_COLUMNS = ["Type", "Breed1", "Breed2", "Gender", "Color1", "Health",
                          "FurLength", "Vaccinated", "Dewormed", "Sterilized", "State"]

"""
def has_photo() -> TransformerMixin:
    return DataFrameColumnMapper(column_name="PhotoAmt",
                                                mapping_func=lambda value: np.int(value > 0),
                                                drop_original=False,
                                                new_column_name="hasPhoto")
"""

def build_preprocessing_pipeline() -> Pipeline:
    """Assemble the DataFrame-to-DataFrame feature engineering pipeline.

    Steps, in order: derive ``hasName`` (replacing ``Name``) and ``isFree``
    (keeping ``Fee``), map the ``Type`` and ``Gender`` codes to readable names,
    truncate ``Breed1``/``Breed2``/``State`` to their most frequent values,
    one-hot encode ``ONEHOT_ENCODED_COLUMNS`` and finally drop the columns that
    are not used as features.
    """
    type_names = {1: 'dog', 2: 'cat'}
    gender_names = {1: 'male', 2: 'female', 3: 'mixed'}
    # 'Type_dog' is dropped together with the unused raw columns
    # (presumably because 'Type_cat' already carries the same information).
    unused_columns = ['PetID', 'Description', 'RescuerID', 'Color2', 'Color3', 'Type_dog']

    steps = [
        ('add_has_name', has_field_transformer(column_name="Name", new_column_name="hasName")),
        ('add_is_free', value_matches_transformer(column_name="Fee", new_column_name="isFree",
                                                  matches=lambda value: value < 1)),
        ('map_type_to_species', map_categories(column_name="Type", mapping_dict=type_names)),
        ('map_gender_to_names', map_categories(column_name="Gender", mapping_dict=gender_names)),
        ('truncate_breed1', truncate_categorical(column_name="Breed1", n_values_to_keep=10)),
        ('truncate_breed2', truncate_categorical(column_name="Breed2", n_values_to_keep=10)),
        ('truncate_state', truncate_categorical(column_name="State", n_values_to_keep=5)),
        ('onehot_encode', onehot_encode(columns=ONEHOT_ENCODED_COLUMNS)),
        ('drop_unused_columns', DataFrameColumnDropper(column_names=unused_columns)),
    ]
    return Pipeline(steps)

# Fit the preprocessing steps on the training split only, then apply the fitted
# steps unchanged to the validation split so both are encoded the same way.
preprocessing_pipeline = build_preprocessing_pipeline()
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_val_preprocessed = preprocessing_pipeline.transform(X_val)

X_train_preprocessed.head(20)

### Print the columns:

In [None]:
# Iterating a DataFrame yields its column names, so this counts the columns.
print("Number of features:", len(list(X_train_preprocessed)))
print("")

# print("Columns:", [(column_name, str(X_train_preprocessed[column_name].dtype))
#                    for column_name in list(X_train_preprocessed)])
print("Numerical columns:", list(X_train_preprocessed.select_dtypes(include="number")))
print("")

# Anything listed here still needs encoding before it can be fed to a classifier.
print("Non-numerical columns:", list(X_train_preprocessed.select_dtypes(exclude="number")))


### Check that only numerical fields exist in the preprocessed DataFrame

In [None]:
# Dtype and non-null count per column of the preprocessed training frame.
X_train_preprocessed.info()

## Run classifier

First define helper factory functions for building pipelines:

In [None]:
def build_preparation_pipeline():
    """Build the pipeline that converts a preprocessed DataFrame to a standardized array.

    The frame is first turned into a NumPy array with a fixed column order and
    then passed through ``StandardScaler``.
    """
    steps = [
        ('to_numpy', DataFrameToValuesTransformer()),
        ('scaler', StandardScaler()),
    ]
    return Pipeline(steps)

def build_full_pipeline(classifier=None):
    """Chain preprocessing, preparation and a classifier into one pipeline.

    ``classifier`` may be left as ``None``; the ``'classifier'`` step is then
    expected to be filled in by a grid search over that step.
    """
    steps = [
        ('preprocessing', build_preprocessing_pipeline()),
        ('preparation', build_preparation_pipeline()),
        ('classifier', classifier),
    ]
    return Pipeline(steps)

### Analyze feature importance
Train ``RandomForestClassifier`` and list ``feature_importances_``:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the importance ranking is the same after every kernel restart.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pipeline = build_full_pipeline(classifier=rf_classifier)
rf_pipeline.fit(X_train, y_train)

# Read the fitted state back from the pipeline steps: the forest's importances and the
# column names recorded by the 'to_numpy' step, which are in the same order as the array columns.
feature_importances = rf_pipeline.named_steps['classifier'].feature_importances_
feature_names = rf_pipeline.named_steps['preparation'].named_steps['to_numpy'].attributes_

# (name, importance) pairs, most important first.
feature_importances_with_names = sorted(zip(feature_names, feature_importances),
                                        key=lambda pair: pair[1], reverse=True)

N_MOST_IMPORTANT_TO_SHOW = 50
print("Feature importances (top {}):".format(N_MOST_IMPORTANT_TO_SHOW))
for feature_name, feature_importance in feature_importances_with_names[:N_MOST_IMPORTANT_TO_SHOW]:
    print("{} -> {}".format(feature_name, feature_importance))


## [Random forest classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
from sklearn.model_selection import cross_val_score

# Seeded forest so the fold scores are reproducible.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pipeline = build_full_pipeline(classifier=rf_classifier)
# AdoptionSpeed is ordinal, so score with quadratic weighted kappa (the competition metric):
# predictions far from the true class are penalised more than near misses.
cross_val_score(rf_pipeline, X_train, y_train, cv=5,
                scoring=make_scorer(cohen_kappa_score, weights='quadratic'))

The cross-validated performance on the training set is clearly quite bad. Let us do some grid search to see how much we can improve.
First define helper functions:

In [None]:
def build_grid_search(pipeline, param_grid, kappa_weights='quadratic'):
    """Create a 5-fold ``GridSearchCV`` that tracks accuracy and Cohen's kappa.

    Parameters
    ----------
    pipeline : estimator
        Pipeline (or any estimator) to tune.
    param_grid : dict or list of dict
        Parameter grid passed on to ``GridSearchCV``.
    kappa_weights : {'quadratic', 'linear', None}, default 'quadratic'
        Weighting passed to ``cohen_kappa_score``. The default matches the
        competition metric (quadratic weighted kappa) and respects the ordinal
        nature of ``AdoptionSpeed``; pass ``None`` for the unweighted kappa.

    The best parameter combination is chosen by, and the final model refitted
    on, the ``'cohen_kappa'`` score. Train scores are kept in ``cv_results_``.
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'cohen_kappa': make_scorer(cohen_kappa_score, weights=kappa_weights),
    }
    return GridSearchCV(pipeline, param_grid, cv=5, return_train_score=True,
                        refit='cohen_kappa', scoring=scoring, verbose=2)

def pretty_cv_results(cv_results,
                      sort_by='rank_test_cohen_kappa',
                      sort_ascending=True,
                      n_rows=5):
    """Condense ``GridSearchCV.cv_results_`` into a short, sorted table.

    Only the parameter columns (``param_*``), the mean train/test scores and
    the rank columns are kept. The rows are sorted by ``sort_by`` and the first
    ``n_rows`` rows are returned as a DataFrame.
    """
    results = pd.DataFrame(cv_results)
    wanted_prefixes = ('param_', 'mean_train', 'mean_test_', 'rank')
    selected = [name for name in results.columns if name.startswith(wanted_prefixes)]
    ordered = results.loc[:, selected].sort_values(by=sort_by, ascending=sort_ascending)
    return ordered.head(n_rows)

def run_grid_search(grid_search, X=None, y=None):
    """Fit a grid search and return a condensed table of its results.

    Parameters
    ----------
    grid_search : GridSearchCV
        Search object, e.g. from ``build_grid_search``.
    X, y : optional
        Data to fit on. When omitted, the notebook-level ``X_train`` and
        ``y_train`` are used, which is what existing callers rely on.

    Prints ``best_score_`` — the mean cross-validated score of the metric used
    for refitting, not an accuracy and not a test-set score — and returns the
    top rows of ``cv_results_`` via ``pretty_cv_results``.
    """
    features = X_train if X is None else X
    labels = y_train if y is None else y
    grid_search.fit(features, labels)
    print('Best mean cross-validated score (refit metric) is:', grid_search.best_score_)
    return pretty_cv_results(grid_search.cv_results_)

Then run grid search for a given parameter grid:

In [None]:
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

param_grid = [
    {
        # Seeded so repeated runs of the search give the same ranking.
        'classifier': [RandomForestClassifier(n_estimators=100, random_state=42)],
        'classifier__n_estimators': [30, 100, 300],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is rejected by
        # scikit-learn >= 1.3, while 'sqrt' is accepted by old and new versions alike.
        'classifier__max_features': ['sqrt', 'log2', None]
    }
]

grid_search = build_grid_search(build_full_pipeline(), param_grid=param_grid)
run_grid_search(grid_search=grid_search)

### Investigate confusion matrix

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold predictions of the tuned pipeline: every training row is predicted
# by a model that did not see it during fitting.
best_estimator = grid_search.best_estimator_
y_pred = cross_val_predict(best_estimator, X=X_train, y=y_train, cv=5)

# Rows are true classes, columns are predicted classes.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cnf_matrix)

In [None]:
# From https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts; rows are true labels, columns predicted labels
    classes   -- tick labels, one per class
    normalize -- when True, every row is divided by its sum before printing/plotting
    title     -- plot title
    cmap      -- matplotlib colormap used for the cells

    Draws on the current matplotlib figure and returns nothing.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > half_max else "black"
            plt.text(col, row, format(cell_value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Start a fresh figure and plot the raw (un-normalized) counts for the five AdoptionSpeed classes 0-4.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=range(0, 5),
                      title='Confusion matrix, without normalization')

## [Gaussian process classifier](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html)

In [None]:
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.gaussian_process import GaussianProcessClassifier

# Compare two kernel families (RBF and Matern), each scaled by a constant factor.
# NOTE(review): Gaussian-process classifiers tend to be expensive on large training sets —
# confirm the runtime of this search is acceptable for the kernel time limit.
param_grid = [
    { 
        'classifier': [ GaussianProcessClassifier() ], 
        'classifier__kernel': [1.0*RBF(1.0), 1.0*Matern(1.0)]
    }
]

gp_grid_search = build_grid_search(pipeline=build_full_pipeline(), param_grid=param_grid)
gp_cv_results = run_grid_search(grid_search=gp_grid_search)

### [Gradient boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = [
    {
        'classifier': [ GradientBoostingClassifier(random_state=42) ],
        # The loss is left at its default (the multinomial deviance / log-loss): the old
        # spelling 'deviance' is rejected by scikit-learn >= 1.3 and the new spelling
        # 'log_loss' by older versions, whereas the default works everywhere.
        'classifier__n_estimators': [50, 100, 500],
        # 'sqrt' is what 'auto' meant for this classifier; 'auto' is rejected by scikit-learn >= 1.3.
        'classifier__max_features': ['sqrt', None, 'log2'],
        'classifier__max_depth': [3, 5],
        'classifier__min_samples_leaf': [1],
        'classifier__min_samples_split': [2]
    }
]

gb_grid_search = build_grid_search(pipeline=build_full_pipeline(), param_grid=param_grid)
gb_cv_results = run_grid_search(grid_search=gb_grid_search)

## Submission

Create a voting classifier from the best estimators and check its generalization performance (Cohen's kappa) on the held-out data `X_val`.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import cohen_kappa_score

# (name, fitted grid search) pairs whose best pipelines take part in the vote.
voting_estimators = [
    ('rf', grid_search),
    # ('logistic', log_grid_search),  # TODO
    # ('svc', svm_grid_search),
    ('gp', gp_grid_search),
    # ('ada', ada_grid_search),
    ('gb', gb_grid_search),
]

# Use a distinct loop variable so the RF search object `grid_search` is not confused with it.
estimators_with_names = [(name, search.best_estimator_) for name, search in voting_estimators]

# Soft voting averages the predicted class probabilities of the member pipelines.
voting_classifier = VotingClassifier(estimators=estimators_with_names, voting='soft')

voting_classifier.fit(X_train, y_train)

y_val_pred = voting_classifier.predict(X_val)

# Quadratic weighted kappa: the competition metric, which penalises predictions
# more the further they are from the true (ordinal) AdoptionSpeed class.
cohen_kappa_score(y_val, y_val_pred, weights='quadratic')

### Train voting classifier with all data available

In [None]:
# Refit on training and validation rows together so the submitted model uses every labelled example.
voting_classifier.fit(X_train_val, y_train_val)

In [None]:
def get_predictions(estimator, X):
    """Predict with ``estimator`` and return the result in submission layout.

    ``X`` must contain a ``PetID`` column. The returned DataFrame has one row
    per row of ``X`` and exactly two columns, ``PetID`` and ``AdoptionSpeed``,
    in that order.
    """
    predicted = estimator.predict(X)
    pet_ids = X.loc[:, 'PetID']
    records = []
    for pet_id, adoption_speed in zip(pet_ids, predicted):
        records.append({'PetID': pet_id, 'AdoptionSpeed': adoption_speed})
    submission = pd.DataFrame.from_dict(records)
    return submission.reindex(columns=['PetID', 'AdoptionSpeed'])

# Predict AdoptionSpeed for the test rows with the ensemble.
predictions = get_predictions(voting_classifier, X=X_test)

In [None]:
def write_submission(predictions):
    """Write ``predictions`` to ``submission.csv`` in the working directory.

    The DataFrame index is not written. The destination path is printed once
    the file has been written.
    """
    dest_file = os.path.join('.', 'submission.csv')
    predictions.to_csv(dest_file, index=False)
    print("Wrote to {}".format(dest_file))
    
# Persist the test-set predictions as the submission file.
write_submission(predictions)