### Initial Setup

In [4]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [152]:
# Read in data
raw_data = pd.read_csv('train_data_engineered.csv')
test_data = pd.read_csv('test_data_engineered.csv')

In [153]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

(26729, 18)
  AnimalID  Name             DateTime  OutcomeType  OutcomeSubtype  \
0  A671945  2911  2014-02-12 18:22:00            4               0   
1  A656520  2266  2013-10-13 12:44:00            3              16   
2  A686464  5501  2015-01-31 12:28:00            1               7   
3  A683430     0  2014-07-11 19:09:00            5              13   
4  A667013     0  2013-11-15 12:52:00            5              13   

   AnimalType  Breed  Color  Age_days  Year  Month  Day  Hour  Minute    Mix  \
0           1   1482    146     365.0  2014      2   12    18      22   True   
1           0    775    184     365.0  2013     10   13    12      44   True   
2           1   1293     97     730.0  2015      1   31    12      28   True   
3           0    775     47      21.0  2014      7   11    19       9   True   
4           1   1101    311     730.0  2013     11   15    12      52  False   

   First_Breed  Second_Breed has_name  
0          181             0    False  
1     

In [154]:
# Seperate out label and data
labels = raw_data.OutcomeType
data = raw_data.drop('OutcomeType', 1).drop('OutcomeSubtype', 1).drop('AnimalID', 1).drop('DateTime', 1)

### EDA

In [None]:
data.Name.value_counts()[:20]

In [None]:
data['DateTime'].value_counts()[:20]

In [None]:
data.OutcomeSubtype.value_counts()

In [None]:
data.AnimalType.value_counts()

In [None]:
data.SexuponOutcome.value_counts()

In [None]:
data.AgeuponOutcome.value_counts()

In [None]:
data.Breed.value_counts()

In [None]:
data.Color.value_counts()

In [None]:
data['Age_num'] = data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)

In [None]:
data['AgeuponOutcome'].str[2:].str.strip().value_counts()

In [None]:
data['Age_str'] = data['AgeuponOutcome'].str[2:].str.strip()

In [None]:
#Create an age factor column based on each possible unit of measure
data['Age_factor'] = 0
data['Age_factor'][data['Age_str'] == 'years'] = 365
data['Age_factor'][data['Age_str'] == 'months'] = 30
data['Age_factor'][data['Age_str'] == 'year'] = 365
data['Age_factor'][data['Age_str'] == 'weeks'] = 7
data['Age_factor'][data['Age_str'] == 'month'] = 30
data['Age_factor'][data['Age_str'] == 'days'] = 1
data['Age_factor'][data['Age_str'] == 'week'] = 7
data['Age_factor'][data['Age_str'] == 'day'] = 1

In [None]:
data.head()

In [None]:
#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
data['Age_num'] = data['Age_num'].astype(float)
data['Age_days'] = data['Age_num'].mul(data['Age_factor'], axis='index')
data.drop('OutcomeAge', 1, inplace = True)
data.drop('Age_num', 1, inplace = True)
data.drop('Age_str', 1, inplace = True)
data.drop('Age_factor', 1, inplace = True)

#Split out sex data into fixed/neutered and gender
split_data = data['SexuponOutcome'].str.split().apply(pd.Series)
split_data.columns = ['fixed', 'gender']
data.merge(split_data, how='inner', left_index=True, right_index=True)

In [None]:
data.head()

### Data manipulation: Convert string to numeric categories


In [None]:
# Replace animal type to numeric: Dog = 1, Cat = 0
data.replace(to_replace='Dog', value=1, inplace=True)
data.replace(to_replace='Cat', value=0, inplace=True)

test_data.replace(to_replace='Dog', value=1, inplace=True)
test_data.replace(to_replace='Cat', value=0, inplace=True)


In [None]:
# Replace sex upon outcome: intact = 1, otherwise = 0, unknown = 99 (temp)
data.replace(to_replace='Neutered Male', value=0, inplace=True)
data.replace(to_replace='Spayed Female', value=0, inplace=True)
data.replace(to_replace='Intact Male', value=1, inplace=True)
data.replace(to_replace='Intact Female', value=1, inplace=True)
data.replace(to_replace='Unknown', value=99, inplace=True)
data.replace(to_replace='nan', value=99, inplace=True)

test_data.replace(to_replace='Neutered Male', value=0, inplace=True)
test_data.replace(to_replace='Spayed Female', value=0, inplace=True)
test_data.replace(to_replace='Intact Male', value=1, inplace=True)
test_data.replace(to_replace='Intact Female', value=1, inplace=True)
test_data.replace(to_replace='Unknown', value=99, inplace=True)
test_data.replace(to_replace='nan', value=99, inplace=True)


In [None]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Return_to_owner', value=1, inplace=True)
labels.replace(to_replace='Adoption', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Transfer', value=4, inplace=True)
labels.replace(to_replace='Died', value=5, inplace=True)

### Determine features that are applicable

In [None]:
# Determine a feature set
feature_names = ['AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']
features = train_data.loc[:, feature_names]

In [None]:
# Just for test, to be deleted
sub_features = features.loc[:,['AnimalType','SexuponOutcome']]

### Apply machine learning techniques

In [10]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

In [None]:
# Logictic regression
mdl = LogisticRegression()
mdl.fit(train_data, train_labels)
prediction = mdl.predict(dev_data)

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)

In [None]:
mdl.score(dev_data, dev_labels)

In [None]:
# Format an output file
#out = np.zeros((prediction.shape[0], 5))

#for i in range(out.shape[0]):
#    out[i, prediction[i]] = 1

# Convert to panda dataframe
out = pd.DataFrame(data=prediction, index=test_data.ID, columns=['Return_to_owner','Adoption','Euthanasia','Transfer','Died'])

In [None]:
out.head()

In [None]:
out.to_csv('results.csv')

### SVM

In [8]:
from sklearn.svm import SVC

In [15]:
#Run the model
svm = SVC(probability=True)
svm.fit(train_data, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
svm.score(dev_data, dev_labels)

0.46526998378850232

In [39]:
pred_prob = svm.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.1644216954071387

In [37]:
dev_labels.iloc[0]

4

In [10]:
C_range = np.logspace(-2, 10, 4)
gamma_range = np.logspace(-9, 3, 4)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(probability=True), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

KeyboardInterrupt: 

## Naive Bayes

In [40]:
from sklearn.naive_bayes import GaussianNB

In [41]:
GNB = GaussianNB()
GNB.fit(train_data, train_labels)
GNB.score(dev_data, dev_labels)

0.51614914577877535

In [42]:
pred_prob = GNB.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.4706834349392806

In [None]:
#are there any parameters to really tweak for this one?

## Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
max_features = [1, 2, 'log2', None]

max_depth = [1, 2]
min_samples_split = [2, 3]
min_samples_leaf = range(1, 3)

param_grid = dict(max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf= \
                 min_samples_leaf)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'max_features': 'log2', 'min_samples_split': 2, 'max_depth': 2, 'min_samples_leaf': 2} with a score of 0.55


In [47]:
RF = RandomForestClassifier(n_jobs=-1, max_features='log2', min_samples_split=2, max_depth=2, min_samples_leaf=2)
RF.fit(train_data, train_labels)
RF.score(dev_data, dev_labels)

0.55268736750218228

In [48]:
pred_prob = RF.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.066369435316501

## KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [74]:
range(5000,5001, 100)

[5000]

In [63]:
weights = ['uniform', 'distance']
n_neighbors = range(5000,5001, 100)
n_jobs = [-1]
p = [1, 2]

param_grid = dict(weights = weights, n_neighbors = n_neighbors, n_jobs=n_jobs, p = p)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'n_neighbors': 100, 'n_jobs': -1, 'weights': 'distance', 'p': 1} with a score of 0.57


In [87]:
KNN = KNeighborsClassifier(n_neighbors=1000 ,n_jobs=-1, p=1 ,weights='distance' )
KNN.fit(train_data, train_labels)
KNN.score(dev_data, dev_labels)

0.53410649706946001

In [88]:
pred_prob = KNN.predict_proba(dev_data)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.0237339413755753

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Realized we can only scale the age, the rest are categorical

### Normalizing age

In [66]:
data_normed = data.copy()
data_normed['Age_days'] = (data['Age_days'] - np.mean(data['Age_days']))/np.std(data['Age_days'])
train_data2, dev_data2, train_labels2, dev_labels2 = train_test_split(data_normed, labels, test_size=0.3, random_state=0)

In [67]:
#Random Forest Retest
RF.fit(train_data2, train_labels2)
RF.score(dev_data2, dev_labels2)

0.54333458037161741

In [68]:
pred_prob = RF.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.0804074151884175

In [69]:
#Naive Bayes Retest
GNB.fit(train_data2, train_labels2)
GNB.score(dev_data2, dev_labels2)

0.51627384960718292

In [71]:
pred_prob = GNB.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.4686220480014389

In [72]:
#SVM Retest
svm.fit(train_data2, train_labels2)
svm.score(dev_data2, dev_labels2)

0.4939518643222347

In [73]:
pred_prob = svm.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.1493682215372991

In [89]:
#K Nearest Neighbors Retest
KNN.fit(train_data2, train_labels2)
KNN.score(dev_data2, dev_labels2)

0.51440329218106995

In [90]:
pred_prob = KNN.predict_proba(dev_data2)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels2.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.0906187721211043

### Popularity

In [118]:
data_pop = data.copy()
data_pop['Name_Pop'] = [np.sum(data_pop['Name'] == name) for name in data_pop['Name']]
data_pop.drop(['Name'], inplace=True, axis=1)
train_data3, dev_data3, train_labels3, dev_labels3 = train_test_split(data_pop, labels, test_size=0.3, random_state=0)

In [120]:
#Random Forest Retest
RF.fit(train_data3, train_labels3)
RF.score(dev_data3, dev_labels3)

0.5440828033420626

In [121]:
pred_prob = RF.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.064936331270371

In [182]:
#Naive Bayes Retest
GNB.fit(train_data3, train_labels3)
GNB.score(dev_data3, dev_labels3)

0.49070956478363886

In [183]:
pred_prob = GNB.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.6876313829157921

In [181]:
#SVM Retest
svm.fit(train_data3, train_labels3)
svm.score(dev_data3, dev_labels3)

KeyboardInterrupt: 

In [126]:
pred_prob = svm.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.1592314140596256

In [127]:
#K Nearest Neighbors Retest
KNN.fit(train_data3, train_labels3)
KNN.score(dev_data3, dev_labels3)

0.55568025938396304

In [128]:
pred_prob = KNN.predict_proba(dev_data3)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels3.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.3765845904244516

### Recent Outcomes

In [132]:
import datetime

In [140]:
data_freq = data.copy()
data_freq['DateTime'] = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in raw_data['DateTime']]
data_freq['LastMonth'] = data_freq['DateTime'] - pd.Timedelta(days=30)

In [148]:
data_freq.dtypes

Name                     int64
AnimalType               int64
Breed                    int64
Color                    int64
Age_days               float64
Year                     int64
Month                    int64
Day                      int64
Hour                     int64
Minute                   int64
Mix                       bool
First_Breed              int64
Second_Breed             int64
has_name                  bool
DateTime        datetime64[ns]
LastMonth       datetime64[ns]
dtype: object

In [166]:
last_30 = []
for i in range(len(data_freq['DateTime'])):
    date = data_freq.loc[i, 'DateTime']
    date_lastmonth = date - pd.Timedelta(days=30)
    last_30.append(data_freq[(data_freq['DateTime'] <= date) & (data_freq['DateTime'] > date_lastmonth)].shape[0])

In [167]:
data_freq['last30'] = last_30

In [170]:
data_freq.head()

Unnamed: 0,Name,AnimalType,Breed,Color,Age_days,Year,Month,Day,Hour,Minute,Mix,First_Breed,Second_Breed,has_name,last30
0,2911,1,1482,146,365.0,2014,2,12,18,22,True,181,0,False,690
1,2266,0,775,184,365.0,2013,10,13,12,44,True,80,0,False,396
2,5501,1,1293,97,730.0,2015,1,31,12,28,True,159,0,False,731
3,0,0,775,47,21.0,2014,7,11,19,9,True,80,0,True,1139
4,0,1,1101,311,730.0,2013,11,15,12,52,False,127,87,True,839


In [169]:
data_freq.drop(['DateTime', 'LastMonth'], inplace=True, axis=1)

In [171]:
train_data4, dev_data4, train_labels4, dev_labels4 = train_test_split(data_freq, labels, test_size=0.3, random_state=0)

In [172]:
#Random Forest Retest
RF.fit(train_data4, train_labels4)
RF.score(dev_data4, dev_labels4)

0.52088789125826163

In [173]:
pred_prob = RF.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.0851915291386103

In [174]:
#Naive Bayes Retest
GNB.fit(train_data4, train_labels4)
GNB.score(dev_data4, dev_labels4)

0.51577503429355276

In [175]:
pred_prob = GNB.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.4741685914283771

In [176]:
#SVM Retest
svm.fit(train_data4, train_labels4)
svm.score(dev_data4, dev_labels4)

0.43771043771043772

In [177]:
pred_prob = svm.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.2040019988047443

In [184]:
#K Nearest Neighbors Retest
KNN.fit(train_data4, train_labels4)
KNN.score(dev_data4, dev_labels4)

0.53011597456041903

In [185]:
pred_prob = KNN.predict_proba(dev_data4)
pred_prob = pred_prob + 0.0000000001
-np.mean([np.log(pred_prob[x][dev_labels4.iloc[x]-1]) for x in range(0, len(pred_prob))])

1.0374284353942389