# Data Dictionary

PetID - Unique hash ID of pet profile

AdoptionSpeed - Categorical speed of adoption. Lower is faster. This is the value to predict. See below section for more info.

Type - Type of animal (1 = Dog, 2 = Cat)

Name - Name of pet (Empty if not named)

Age - Age of pet when listed, in months

Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)

Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)

Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)

Color1 - Color 1 of pet (Refer to ColorLabels dictionary)

Color2 - Color 2 of pet (Refer to ColorLabels dictionary)

Color3 - Color 3 of pet (Refer to ColorLabels dictionary)

MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)

FurLength - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)

Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)

Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)

Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)

Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)

Quantity - Number of pets represented in profile

Fee - Adoption fee (0 = Free)

State - State location in Malaysia (Refer to StateLabels dictionary)

RescuerID - Unique hash ID of rescuer

VideoAmt - Total uploaded videos for this pet

PhotoAmt - Total uploaded photos for this pet

Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.

AdoptionSpeed

Contestants are required to predict this value. The value is determined by how quickly, if at all, a pet is adopted. The values are determined in the following way: 0 - Pet was adopted on the same day as it was listed. 1 - Pet was adopted between 1 and 7 days (1st week) after being listed. 2 - Pet was adopted between 8 and 30 days (1st month) after being listed. 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed. 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).


# Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_style('whitegrid')

# Problem definition

Apply classification models to predict customers default payments

# Load the data

In [2]:
#source: https://www.kaggle.com/c/cebd-1260-spring-2019-classification/data
df_train = pd.read_csv('train.csv')
df_test_true = pd.read_csv('test.csv')

df_state_lbl = pd.read_csv('state_labels.csv')
df_breed_lbl = pd.read_csv('breed_labels.csv')
df_color_lbl = pd.read_csv('color_labels.csv')

#Ref.: https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
# Create Gender dataframe
# gender = {
#         'GenderID': ['1', '2', '3'],
#         'Gender': ['Male', 'Female', 'Mixed']} 
# df_gender = pd.DataFrame(gender, columns = ['GenderID', 'Gender'])



In [3]:
print(df_state_lbl.columns)
print('')
print(df_breed_lbl.columns)
print('')
print(df_color_lbl.columns)
print('')
# print(df_gender.columns)

Index(['StateID', 'StateName'], dtype='object')

Index(['BreedID', 'Type', 'BreedName'], dtype='object')

Index(['ColorID', 'ColorName'], dtype='object')



In [4]:
# df_breed_lbl_1 = df_breed_lbl.copy()
# df_breed_lbl_2 = df_breed_lbl.copy()

# df_color_lbl_1 = df_color_lbl.copy()
# df_color_lbl_2 = df_color_lbl.copy()
# df_color_lbl_3 = df_color_lbl.copy()

In [5]:
df = df_train.copy()
print(df.columns)
df.T

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
Type,1,2,2,1,2,1,1,1,2,1,...,1,1,2,2,2,1,1,2,1,2
Name,â¥â¥â¥ Lily â¥â¥â¥,Cookie,Favour Speedy Abundance And Courage,,Abandoned Kitty,Duke,Lila,Doggie2_Selangor Area,,Brother,...,Amos,Minnie,Meow Zai,SOS ð Owner Leaving Msia,Levi,Dawn,,Isabella,Brisco,
Age,36,3,7,3,1,3,2,8,1,6,...,3,1,2,24,6,1,5,2,1,1
Breed1,307,266,250,307,266,218,307,307,243,307,...,239,307,299,266,285,307,307,265,307,266
Breed2,0,0,252,0,0,0,0,0,245,0,...,307,0,0,0,265,0,0,0,0,0
Gender,2,1,1,1,1,1,2,2,2,1,...,1,2,1,2,1,2,2,2,1,2
Color1,2,6,1,2,1,3,1,6,1,2,...,1,1,4,2,6,2,5,3,2,1
Color2,7,7,2,0,6,5,7,0,2,7,...,2,2,6,3,7,5,0,6,7,2
Color3,0,0,0,0,7,0,0,0,7,0,...,0,0,0,7,0,7,0,7,0,0
MaturitySize,2,2,2,3,1,2,1,2,1,1,...,2,2,2,2,2,2,2,2,2,1


In [6]:
# check for NaNs
df.isnull().sum(axis = 0)

Type               0
Name             842
Age                0
Breed1             0
Breed2             0
Gender             0
Color1             0
Color2             0
Color3             0
MaturitySize       0
FurLength          0
Vaccinated         0
Dewormed           0
Sterilized         0
Health             0
Quantity           0
Fee                0
State              0
RescuerID          0
VideoAmt           0
Description        8
PetID              0
PhotoAmt           0
AdoptionSpeed      0
dtype: int64

# Feature Engineering 

#### Remove columns 

In [7]:
# TODO: remove a confusing column
del df['PetID']
del df['Description']
del df['RescuerID']
del df['Name']

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,4
Type,1.0,2.0,2.0,1.0,2.0
Age,36.0,3.0,7.0,3.0,1.0
Breed1,307.0,266.0,250.0,307.0,266.0
Breed2,0.0,0.0,252.0,0.0,0.0
Gender,2.0,1.0,1.0,1.0,1.0
Color1,2.0,6.0,1.0,2.0,1.0
Color2,7.0,7.0,2.0,0.0,6.0
Color3,0.0,0.0,0.0,0.0,7.0
MaturitySize,2.0,2.0,2.0,3.0,1.0
FurLength,2.0,1.0,1.0,1.0,1.0


In [9]:
print(df.columns)

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


#### Rename columns 

In [10]:
df.columns = ['Type', 'Age', 'Breed1', 'Breed2', 'GenderID', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'StateID', 'VideoAmt',
       'PhotoAmt', 'AdoptionSpeed']

#### Join Dataframes

In [11]:
# Ref.: https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/

# df = pd.merge(df, df_state_lbl, on='StateID', how='outer')
# del df['StateID']

# df = pd.merge(df, df_gender, left_on='GenderID', right_on='GenderID')
# del df['GenderID']
# print(df.head().T)
# print(df.isnull().sum(axis = 0))

#### Get dummies 

In [12]:
# Create a loop to transform the categorical columns to numerical
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS/blob/master/week6/classification-credit-card-sklearn-extended.ipynb

for col in [
    'Type', 'GenderID', 'MaturitySize','FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health','AdoptionSpeed']:
    
    df_dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, df_dummies], axis=1)
    # Remove the original columns
    del df[col]
    
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
Age,36.0,3.0,7.0,3.0,1.0,3.0,2.0,8.0,1.0,6.0,...,3.0,1.0,2.0,24.0,6.0,1.0,5.0,2.0,1.0,1.0
Breed1,307.0,266.0,250.0,307.0,266.0,218.0,307.0,307.0,243.0,307.0,...,239.0,307.0,299.0,266.0,285.0,307.0,307.0,265.0,307.0,266.0
Breed2,0.0,0.0,252.0,0.0,0.0,0.0,0.0,0.0,245.0,0.0,...,307.0,0.0,0.0,0.0,265.0,0.0,0.0,0.0,0.0,0.0
Color1,2.0,6.0,1.0,2.0,1.0,3.0,1.0,6.0,1.0,2.0,...,1.0,1.0,4.0,2.0,6.0,2.0,5.0,3.0,2.0,1.0
Color2,7.0,7.0,2.0,0.0,6.0,5.0,7.0,0.0,2.0,7.0,...,2.0,2.0,6.0,3.0,7.0,5.0,0.0,6.0,7.0,2.0
Color3,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,7.0,0.0,7.0,0.0,7.0,0.0,0.0
Quantity,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
Fee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0
StateID,41326.0,41327.0,41327.0,41327.0,41401.0,41326.0,41326.0,41336.0,41326.0,41327.0,...,41326.0,41326.0,41401.0,41326.0,41326.0,41326.0,41330.0,41401.0,41326.0,41326.0
VideoAmt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df.columns

Index(['Age', 'Breed1', 'Breed2', 'Color1', 'Color2', 'Color3', 'Quantity',
       'Fee', 'StateID', 'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2',
       'GenderID_1', 'GenderID_2', 'GenderID_3', 'MaturitySize_1',
       'MaturitySize_2', 'MaturitySize_3', 'MaturitySize_4', 'FurLength_1',
       'FurLength_2', 'FurLength_3', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health_1', 'Health_2',
       'Health_3', 'AdoptionSpeed_0', 'AdoptionSpeed_1', 'AdoptionSpeed_2',
       'AdoptionSpeed_3', 'AdoptionSpeed_4'],
      dtype='object')

In [14]:
# feature engineering
X_columns = ['Age', 'Breed1', 'Breed2', 'Color1', 'Color2', 'Color3', 'Quantity',
       'Fee', 'StateID', 'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2',
       'GenderID_1', 'GenderID_2', 'GenderID_3', 'MaturitySize_1',
       'MaturitySize_2', 'MaturitySize_3', 'MaturitySize_4', 'FurLength_1',
       'FurLength_2', 'FurLength_3', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health_1', 'Health_2',
       'Health_3']
y_column = ['AdoptionSpeed_0', 'AdoptionSpeed_1', 'AdoptionSpeed_2',
       'AdoptionSpeed_3', 'AdoptionSpeed_4']

In [15]:
# list(X_columns)

In [16]:
# Ref.: https://stackoverflow.com/questions/43956335/convert-float64-column-to-int64-in-pandas
# df['PhotoAmt'] = df['PhotoAmt'].astype(np.int64)
# df.dtypes

# Model Training

In [17]:
# split the data

threshold = 0.8
X = df[X_columns]
y = df[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (8000, 35)
y_train (8000, 5)
X_test (2000, 35)
y_test (2000, 5)


# Model Training / Evaluation - Using Split

In [18]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())    
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes


ValueError: Found input variables with inconsistent numbers of samples: [8000, 40000]

# Model Training / Evaluation - Cross Validation

In [None]:
k = 10
results = {}
for m in models:
    print('MODEL', m[0])
    results[m[0]] = {'precision':[], 'recall':[]}
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        model = m[1]
        model.fit(X_train, y_train.ravel())
        y_pred = model.predict(X_test)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        results[m[0]]['precision'].append(precision)
        results[m[0]]['recall'].append(recall)
for metric in ['precision', 'recall']:
    values = []
    labels = []
    for model, result_values in results.items():
        for m, v in result_values.items():
            if m == metric:
                labels.append(model)
                values.append(v)
    plt.figure(figsize=(12,6))
    plt.title(metric)
    plt.boxplot(values)
    plt.xticks(range(1, len(labels)+1), labels, rotation='horizontal')
    plt.show()

# Tuning the Thresholds

In [None]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
for i in range(1,10):
    print(i)
    y_pred = model.predict_proba(X_test)[:,1]
    y_pred = [1 if x > i/10.0 else 0 for x in y_pred]
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)

# Prepare submission

In [None]:
df_prediction = df_test[X_columns].fillna(0.0)
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]



In [None]:
df_test[
    ['PetID', 'AdoptionSpeed']].to_csv('submission_classification.csv', index=False)