### Initial Setup

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [2]:
# Read in data
raw_data = pd.read_csv('train_data_engineered.csv')
test_data = pd.read_csv('test_data_engineered.csv')

In [3]:
outcome_dict = {'Euthanasia':0, 'Return_to_owner':1, 'Died':2, 'Adoption':3, 'Transfer':4}

In [4]:
raw_data.OutcomeType = raw_data.OutcomeType.apply(lambda x: outcome_dict[x])

In [None]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

In [5]:
# Seperate out label and data
train_labels = raw_data.OutcomeType
train_data = raw_data.drop('OutcomeType', 1).drop('OutcomeSubtype', 1)

### EDA

In [None]:
data.Name.value_counts()[:20]

In [None]:
data['DateTime'].value_counts()[:20]

In [None]:
data.OutcomeSubtype.value_counts()

In [None]:
data.AnimalType.value_counts()

In [None]:
data.SexuponOutcome.value_counts()

In [None]:
data.AgeuponOutcome.value_counts()

In [None]:
data.Breed.value_counts()

In [None]:
data.Color.value_counts()

In [None]:
data['Age_num'] = data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)

In [None]:
data['AgeuponOutcome'].str[2:].str.strip().value_counts()

In [None]:
data['Age_str'] = data['AgeuponOutcome'].str[2:].str.strip()

In [None]:
#Create an age factor column based on each possible unit of measure
data['Age_factor'] = 0
data['Age_factor'][data['Age_str'] == 'years'] = 365
data['Age_factor'][data['Age_str'] == 'months'] = 30
data['Age_factor'][data['Age_str'] == 'year'] = 365
data['Age_factor'][data['Age_str'] == 'weeks'] = 7
data['Age_factor'][data['Age_str'] == 'month'] = 30
data['Age_factor'][data['Age_str'] == 'days'] = 1
data['Age_factor'][data['Age_str'] == 'week'] = 7
data['Age_factor'][data['Age_str'] == 'day'] = 1

In [None]:
data.head()

In [None]:
#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
data['Age_num'] = data['Age_num'].astype(float)
data['Age_days'] = data['Age_num'].mul(data['Age_factor'], axis='index')
data.drop('OutcomeAge', 1, inplace = True)
data.drop('Age_num', 1, inplace = True)
data.drop('Age_str', 1, inplace = True)
data.drop('Age_factor', 1, inplace = True)

#Split out sex data into fixed/neutered and gender
split_data = data['SexuponOutcome'].str.split().apply(pd.Series)
split_data.columns = ['fixed', 'gender']
data.merge(split_data, how='inner', left_index=True, right_index=True)

In [None]:
data.head()

### Data manipulation: Convert string to numeric categories


In [None]:
# Replace animal type to numeric: Dog = 1, Cat = 0
data.replace(to_replace='Dog', value=1, inplace=True)
data.replace(to_replace='Cat', value=0, inplace=True)

test_data.replace(to_replace='Dog', value=1, inplace=True)
test_data.replace(to_replace='Cat', value=0, inplace=True)


In [None]:
# Replace sex upon outcome: intact = 1, otherwise = 0, unknown = 99 (temp)
data.replace(to_replace='Neutered Male', value=0, inplace=True)
data.replace(to_replace='Spayed Female', value=0, inplace=True)
data.replace(to_replace='Intact Male', value=1, inplace=True)
data.replace(to_replace='Intact Female', value=1, inplace=True)
data.replace(to_replace='Unknown', value=99, inplace=True)
data.replace(to_replace='nan', value=99, inplace=True)

test_data.replace(to_replace='Neutered Male', value=0, inplace=True)
test_data.replace(to_replace='Spayed Female', value=0, inplace=True)
test_data.replace(to_replace='Intact Male', value=1, inplace=True)
test_data.replace(to_replace='Intact Female', value=1, inplace=True)
test_data.replace(to_replace='Unknown', value=99, inplace=True)
test_data.replace(to_replace='nan', value=99, inplace=True)


In [None]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Return_to_owner', value=1, inplace=True)
labels.replace(to_replace='Adoption', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Transfer', value=4, inplace=True)
labels.replace(to_replace='Died', value=5, inplace=True)

### Determine features that are applicable

In [None]:
# Determine a feature set
feature_names = ['AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']
features = train_data.loc[:, feature_names]

In [None]:
# Just for test, to be deleted
sub_features = features.loc[:,['AnimalType','SexuponOutcome']]

### Apply machine learning techniques

In [6]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(train_data, train_labels, test_size=0.3, random_state=0)

In [7]:
# Logictic regression
mdl = LogisticRegression()
mdl.fit(train_data, train_labels)
prediction = mdl.predict(dev_data)

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)

Prediction accuracy: 0.563661304402


In [8]:
pred_prob = mdl.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]]) for x in range(0, len(pred_prob))])

0.9875959429700355

In [None]:
# Format an output file
#out = np.zeros((prediction.shape[0], 5))

#for i in range(out.shape[0]):
#    out[i, prediction[i]] = 1

# Convert to panda dataframe
out = pd.DataFrame(data=prediction, index=test_data.ID, columns=['Return_to_owner','Adoption','Euthanasia','Transfer','Died'])

In [None]:
out.head()

In [None]:
out.to_csv('results.csv')

### SVM

In [9]:
from sklearn.svm import SVC

In [10]:
#Run the model
svm = SVC(probability=True)
svm.fit(train_data, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
svm.score(dev_data, dev_labels)

0.57089412644968196

In [None]:
def mc_log_loss(pred_prob, dev_labels):
    return -np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(len(pred_prob))])

In [12]:
pred_prob = svm.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

3.2275961847615164

In [None]:
dev_labels.iloc[0]

In [None]:
C_range

In [None]:
C_range = np.logspace(-2, 10, 1)
gamma_range = np.logspace(-9, 3, 1)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(probability=True), param_grid=param_grid, cv=5, scoring='neg_log_loss')
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

## Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

In [14]:
GNB = GaussianNB()
GNB.fit(train_data, train_labels)
GNB.score(dev_data, dev_labels)

0.12033919441326849

In [15]:
pred_prob = GNB.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

15.564549261208979

In [None]:
#are there any parameters to really tweak for this one?

## Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
max_features = [1, 2, 'log2', None]

max_depth = [1, 2]
min_samples_split = [2, 3]
min_samples_leaf = range(1, 3)

param_grid = dict(max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf= \
                 min_samples_leaf)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [13]:
param_grid= {'max_features':['log2'], 'max_depth':[2], 'min_samples_leaf':[2], 'min_samples_split':[2]}
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid=param_grid, cv=5, scoring='neg_log_loss')
#RF = RandomForestClassifier(n_jobs=-1, max_features='log2', min_samples_split=2, max_depth=2, min_samples_leaf=2)
grid.fit(train_data, train_labels)
#RF.score(dev_data, dev_labels)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': ['log2'], 'min_samples_split': [2], 'max_depth': [2], 'min_samples_leaf': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [17]:
RF = RandomForestClassifier(n_jobs=-1, max_features='log2', min_samples_split=2, max_depth=2, min_samples_leaf=2)
RF.fit(train_data, train_labels)
RF.score(dev_data, dev_labels)

0.52163611422870682

In [19]:
pred_prob = RF.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

2.8667026358939465

In [18]:
pred_prob = grid.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

NameError: name 'grid' is not defined

In [16]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'max_features': 'log2', 'min_samples_split': 2, 'max_depth': 2, 'min_samples_leaf': 2} with a score of -1.22


## KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
range(5000,5001, 100)

In [18]:
weights = ['uniform', 'distance']
n_neighbors = range(5000,5001, 100)
n_jobs = [-1]
p = [1, 2]

param_grid = dict(weights = weights, n_neighbors = n_neighbors, n_jobs=n_jobs, p = p)
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5, scoring='neg_log_loss')
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

KeyboardInterrupt: 

In [21]:
KNN = KNeighborsClassifier(n_neighbors=1000 ,n_jobs=-1, p=1 ,weights='distance' )
KNN.fit(train_data, train_labels)
KNN.score(dev_data, dev_labels)

0.55742611298166855

In [22]:
pred_prob = KNN.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

3.8589037092812508

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Realized we can only scale the age, the rest are categorical

### Normalizing age

In [None]:
data_normed = data.copy()
data_normed['Age_days'] = (data['Age_days'] - np.mean(data['Age_days']))/np.std(data['Age_days'])
train_data2, dev_data2, train_labels2, dev_labels2 = train_test_split(data_normed, labels, test_size=0.3, random_state=0)

In [None]:
#Random Forest Retest
RF.fit(train_data2, train_labels2)
RF.score(dev_data2, dev_labels2)

In [None]:
pred_prob = RF.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#Naive Bayes Retest
GNB.fit(train_data2, train_labels2)
GNB.score(dev_data2, dev_labels2)

In [None]:
pred_prob = GNB.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#SVM Retest
svm.fit(train_data2, train_labels2)
svm.score(dev_data2, dev_labels2)

In [None]:
pred_prob = svm.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#K Nearest Neighbors Retest
KNN.fit(train_data2, train_labels2)
KNN.score(dev_data2, dev_labels2)

In [None]:
pred_prob = KNN.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

### Popularity

In [None]:
data_pop = data.copy()
data_pop['Name_Pop'] = [np.sum(data_pop['Name'] == name) for name in data_pop['Name']]
data_pop.drop(['Name'], inplace=True, axis=1)
train_data3, dev_data3, train_labels3, dev_labels3 = train_test_split(data_pop, labels, test_size=0.3, random_state=0)

In [None]:
#Random Forest Retest
RF.fit(train_data3, train_labels3)
RF.score(dev_data3, dev_labels3)

In [None]:
pred_prob = RF.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#Naive Bayes Retest
GNB.fit(train_data3, train_labels3)
GNB.score(dev_data3, dev_labels3)

In [None]:
pred_prob = GNB.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#SVM Retest
svm.fit(train_data3, train_labels3)
svm.score(dev_data3, dev_labels3)

In [None]:
pred_prob = svm.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#K Nearest Neighbors Retest
KNN.fit(train_data3, train_labels3)
KNN.score(dev_data3, dev_labels3)

In [None]:
pred_prob = KNN.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

### Recent Outcomes

In [None]:
import datetime

In [None]:
data_freq = data.copy()
data_freq['DateTime'] = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in raw_data['DateTime']]
data_freq['LastMonth'] = data_freq['DateTime'] - pd.Timedelta(days=30)

In [None]:
data_freq.dtypes

In [None]:
last_30 = []
for i in range(len(data_freq['DateTime'])):
    date = data_freq.loc[i, 'DateTime']
    date_lastmonth = date - pd.Timedelta(days=30)
    last_30.append(data_freq[(data_freq['DateTime'] <= date) & (data_freq['DateTime'] > date_lastmonth)].shape[0])

In [None]:
data_freq['last30'] = last_30

In [None]:
data_freq.head()

In [None]:
data_freq.drop(['DateTime', 'LastMonth'], inplace=True, axis=1)

In [None]:
train_data4, dev_data4, train_labels4, dev_labels4 = train_test_split(data_freq, labels, test_size=0.3, random_state=0)

In [None]:
#Random Forest Retest
RF.fit(train_data4, train_labels4)
RF.score(dev_data4, dev_labels4)

In [None]:
pred_prob = RF.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#Naive Bayes Retest
GNB.fit(train_data4, train_labels4)
GNB.score(dev_data4, dev_labels4)

In [None]:
pred_prob = GNB.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#SVM Retest
svm.fit(train_data4, train_labels4)
svm.score(dev_data4, dev_labels4)

In [None]:
pred_prob = svm.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
#K Nearest Neighbors Retest
KNN.fit(train_data4, train_labels4)
KNN.score(dev_data4, dev_labels4)

In [None]:
pred_prob = KNN.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
GBC = GradientBoostingClassifier()
#loss = ['deviance', 'exponential']

n_estimators = np.arange(50,251,50)
#learning_rate = np.logspace(-4, 1, 5)
random_state = [42]
criterion = ['friedman_mse', 'mse', 'mae']


min_samples_split = np.arange(2, 5, 1)
#min_samples_leaf = np.arange(2, 4, 1)
max_depth = np.arange(1, 5, 1)

In [None]:
np.logspace(-4, 3, 5)

In [None]:
np.arange(50,251,50)

In [None]:
GBC.fit(train_data, train_labels)
GBC.score(dev_data, dev_labels)

In [None]:
pred_prob = GBC.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
GBC.fit(train_data2, train_labels2)
GBC.score(dev_data2, dev_labels2)

In [None]:
pred_prob = GBC.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
GBC.fit(train_data3, train_labels3)
GBC.score(dev_data3, dev_labels3)

In [None]:
pred_prob = GBC.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
GBC.fit(train_data4, train_labels4)
GBC.score(dev_data4, dev_labels4)

In [None]:
pred_prob = GBC.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

In [None]:
param_grid = dict(learning_rate=[0.2],
                  #n_estimators=np.arange(20, 151, 10), 
                  n_estimators= [100],
                  max_depth = np.arange(5,15,3),  
                  subsample = [0.8],
                  min_samples_split = np.arange(200,1001,200), 
                  min_samples_leaf = [50],
                  random_state=[10]
                 )
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=5)
grid.fit(train_data, train_labels)

In [None]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [None]:
grid.grid_scores_, grid.best_params_, grid.best_score_

In [None]:
train_data.head()