### Initial Setup

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [2]:
# Read the train and test CSVs from the current working directory.
# The file names suggest feature engineering was done upstream — TODO confirm where.
raw_data = pd.read_csv('train_data_engineered.csv')
test_data = pd.read_csv('test_data_engineered.csv')

In [3]:
# Examine the data a little bit: dimensions first, then the first few rows.
# Parenthesised single-argument print runs on Python 2 and 3 alike and matches
# the print(...) calls used further down in this notebook.
print(raw_data.shape)
print(raw_data.head())

(26729, 18)
  AnimalID  Name             DateTime  OutcomeType  OutcomeSubtype  \
0  A671945  2911  2014-02-12 18:22:00            4               0   
1  A656520  2266  2013-10-13 12:44:00            3              16   
2  A686464  5501  2015-01-31 12:28:00            1               7   
3  A683430     0  2014-07-11 19:09:00            5              13   
4  A667013     0  2013-11-15 12:52:00            5              13   

   AnimalType  Breed  Color  Age_days  Year  Month  Day  Hour  Minute    Mix  \
0           1   1482    146     365.0  2014      2   12    18      22   True   
1           0    775    184     365.0  2013     10   13    12      44   True   
2           1   1293     97     730.0  2015      1   31    12      28   True   
3           0    775     47      21.0  2014      7   11    19       9   True   
4           1   1101    311     730.0  2013     11   15    12      52  False   

   First_Breed  Second_Breed has_name  
0          181             0    False  
1     

In [4]:
# Separate out label and data
labels = raw_data.OutcomeType
# Remove the target, its subtype, the animal ID and the raw timestamp in a single
# call; axis is passed by keyword because the positional form is no longer
# accepted by recent pandas releases.
data = raw_data.drop(['OutcomeType', 'OutcomeSubtype', 'AnimalID', 'DateTime'], axis=1)

### EDA

In [None]:
# Twenty most frequent values in the Name column
data['Name'].value_counts().head(20)

In [None]:
# Twenty most frequent timestamps. Read from raw_data: the DateTime column was
# dropped from `data` when the features were separated from the labels.
raw_data['DateTime'].value_counts().head(20)

In [None]:
# Frequency of each outcome subtype. Read from raw_data: OutcomeSubtype was
# dropped from `data` when the features were separated from the labels.
raw_data['OutcomeSubtype'].value_counts()

In [None]:
# Frequency of each AnimalType code
data['AnimalType'].value_counts()

In [None]:
# Frequency of each SexuponOutcome value.
# NOTE(review): SexuponOutcome is not among the columns printed by raw_data.head()
# for the engineered CSV — this cell looks like it predates that file; confirm before running.
data.SexuponOutcome.value_counts()

In [None]:
# Frequency of each AgeuponOutcome value.
# NOTE(review): AgeuponOutcome is not among the columns printed by raw_data.head()
# for the engineered CSV — this cell looks like it predates that file; confirm before running.
data.AgeuponOutcome.value_counts()

In [None]:
# Frequency of each Breed code
data['Breed'].value_counts()

In [None]:
# Frequency of each Color code
data['Color'].value_counts()

In [None]:
# Numeric part of the age: the first two characters, whitespace-stripped.
# pd.to_numeric(errors='coerce') turns missing or unparseable entries into NaN;
# the previous astype(int, raise_on_error=False) relied on a keyword that has
# been removed from pandas.
# NOTE(review): AgeuponOutcome is not among the columns printed by raw_data.head()
# for the engineered CSV — confirm this cell is still needed.
age_prefix = data['AgeuponOutcome'].str[:2].str.strip()
data['Age_num'] = pd.to_numeric(age_prefix, errors='coerce')

In [None]:
# Frequency of the text that follows the first two characters of the age string
data['AgeuponOutcome'].str.slice(2).str.strip().value_counts()

In [None]:
# Keep the text after the first two characters of the age string as its own column
data['Age_str'] = data['AgeuponOutcome'].str.slice(2).str.strip()

In [None]:
# Create an age factor column: the number of days represented by one unit of
# each age label (singular and plural spellings).
age_unit_days = {
    'year': 365, 'years': 365,
    'month': 30, 'months': 30,
    'week': 7, 'weeks': 7,
    'day': 1, 'days': 1,
}
# A single vectorised map replaces the chained-indexing assignments
# (data['Age_factor'][mask] = ...), which pandas may apply to a temporary copy
# and therefore silently lose. Units not in the table, and missing units,
# keep the original default factor of 0.
data['Age_factor'] = data['Age_str'].map(age_unit_days).fillna(0).astype(int)

In [None]:
# Preview the first rows to check the Age_num / Age_str / Age_factor columns created above
data.head()

In [None]:
# Multiply the number in the age by its unit factor to get a comparable numeric
# column (age in days), then drop the derivative helper columns.
data['Age_days'] = data['Age_num'].astype(float) * data['Age_factor']

helper_columns = ['Age_num', 'Age_str', 'Age_factor']
# NOTE(review): 'OutcomeAge' is never created anywhere in this notebook, so an
# unconditional drop would raise KeyError; it is removed only when present.
# Confirm whether 'AgeuponOutcome' was the column actually meant.
if 'OutcomeAge' in data.columns:
    helper_columns.append('OutcomeAge')
data = data.drop(helper_columns, axis=1)

# Split out sex data into fixed/neutered status and gender.
# assumes every value has at most two whitespace-separated tokens — TODO confirm
split_data = data['SexuponOutcome'].str.split(expand=True)
split_data.columns = ['fixed', 'gender']
# merge() returns a new frame; assign it back, otherwise the new columns are lost
data = data.merge(split_data, how='inner', left_index=True, right_index=True)

In [None]:
# Preview the frame after the age and sex transformations in the preceding cell
data.head()

### Data manipulation: Convert string to numeric categories


In [None]:
# Encode animal type as a number in both frames: Dog -> 1, Cat -> 0.
# The replacement is applied to every column of each frame, in place.
animal_type_codes = {'Dog': 1, 'Cat': 0}
for frame in (data, test_data):
    frame.replace(to_replace=animal_type_codes, inplace=True)


In [None]:
# Encode sex upon outcome in both frames: intact -> 1, neutered/spayed -> 0,
# unknown -> 99 (temporary placeholder).
# The replacement is applied to every column of each frame, in place.
# NOTE(review): the key 'nan' only matches the literal string, not real missing
# values (NaN) — confirm whether those should become 99 as well.
sex_codes = {
    'Neutered Male': 0,
    'Spayed Female': 0,
    'Intact Male': 1,
    'Intact Female': 1,
    'Unknown': 99,
    'nan': 99,
}
for frame in (data, test_data):
    frame.replace(to_replace=sex_codes, inplace=True)


In [None]:
# Encode the outcome label in place:
# Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
outcome_codes = {
    'Return_to_owner': 1,
    'Adoption': 2,
    'Euthanasia': 3,
    'Transfer': 4,
    'Died': 5,
}
labels.replace(to_replace=outcome_codes, inplace=True)

### Determine features that are applicable

In [None]:
# Determine a feature set
feature_names = ['AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']
# Select from `data`: `train_data` does not exist yet at this point, it is only
# created later by train_test_split.
# NOTE(review): 'SexuponOutcome' and 'AgeuponOutcome' are not among the columns
# printed by raw_data.head() for the engineered CSV — confirm the intended list.
features = data.loc[:, feature_names]

In [None]:
# Two-column subset used only for quick experiments (marked for deletion by the author)
sub_feature_names = ['AnimalType', 'SexuponOutcome']
sub_features = features[sub_feature_names]

### Apply machine learning techniques

In [7]:
# Hold out 30% of the rows as a development set; the seed is fixed so the split repeats
(train_data, dev_data,
 train_labels, dev_labels) = train_test_split(
    data, labels, test_size=0.3, random_state=0)

In [None]:
# Logistic regression with default settings, fitted on the training split
mdl = LogisticRegression()
mdl.fit(train_data, train_labels)
prediction = mdl.predict(dev_data)

# Prediction accuracy: fraction of development rows whose predicted label matches.
# A single formatted argument keeps the printed text the same on Python 2 and 3.
print('Prediction accuracy: %s' % np.mean(prediction == dev_labels))

In [None]:
# Accuracy of the fitted logistic regression on the development set;
# should agree with the value printed in the previous cell
mdl.score(dev_data, dev_labels)

In [None]:
# Format an output file: one row per test animal, one 0/1 indicator column per outcome.
outcome_names = ['Return_to_owner', 'Adoption', 'Euthanasia', 'Transfer', 'Died']

# `prediction` holds development-set predictions (1-D, wrong length for the test
# index), so predict afresh on the test set using the columns the model was fitted on.
# assumes test_data contains every feature column of train_data — TODO confirm
test_prediction = mdl.predict(test_data[train_data.columns])

# One indicator column per class known to the model, in mdl.classes_ (sorted) order.
# assumes the sorted label codes line up with outcome_names as listed in the
# label-encoding comment (1 = Return_to_owner ... 5 = Died) — TODO confirm
assert len(mdl.classes_) == len(outcome_names), 'expected one column per outcome class'
one_hot = (test_prediction[:, np.newaxis] == mdl.classes_).astype(int)

# Convert to pandas dataframe
out = pd.DataFrame(data=one_hot, index=test_data.ID, columns=outcome_names)

In [None]:
# Preview the first rows of the submission frame
out.head()

In [None]:
# Write the submission frame to results.csv in the working directory
# (the index is written as the first column by default)
out.to_csv('results.csv')

### SVM

In [5]:
from sklearn.svm import SVC

In [8]:
# Fit a support-vector classifier with default settings; fit() returns the
# estimator itself, so it can be bound directly and echoed as the cell output
svm = SVC().fit(train_data, train_labels)
svm

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Accuracy of the default SVC on the development set
svm.score(dev_data, dev_labels)

0.46526998378850232

In [10]:
# Coarse log-spaced grid over the SVC parameters C and gamma (4 values each),
# scored with 5 stratified shuffle splits on the training data.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# summary line below has never actually been printed.
C_range = np.logspace(-2, 10, num=4)
gamma_range = np.logspace(-9, 3, num=4)
param_grid = {'gamma': gamma_range, 'C': C_range}

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

KeyboardInterrupt: 

## Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

In [12]:
# Gaussian naive Bayes with default settings: fit on the training split,
# then report accuracy on the development set
GNB = GaussianNB().fit(train_data, train_labels)
GNB.score(dev_data, dev_labels)

0.51614914577877535

In [None]:
# Are there any parameters really worth tweaking for this one?

## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Candidate values for each random-forest hyper-parameter in the search
max_features = [1, 2, 'log2', None]
max_depth = [1, 2]
min_samples_split = [2, 3]
min_samples_leaf = [1, 2]

param_grid = {
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# Seed the forest as well as the splitter: without random_state every run grows
# different trees, so the reported best parameters could not be reproduced.
forest = RandomForestClassifier(n_jobs=-1, random_state=0)
grid = GridSearchCV(forest, param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'max_features': 2, 'min_samples_split': 3, 'max_depth': 2, 'min_samples_leaf': 1} with a score of 0.56


## KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Dimensions of the test set as (rows, columns)
test_data.shape

(11456, 16)

In [None]:
# Candidate settings for the k-nearest-neighbours search.
# NOTE(review): the upper bound on n_neighbors is the row count of test_data,
# while the model is fitted on folds of train_data — confirm this is intended.
weights, p, n_jobs = ['uniform', 'distance'], [1, 2], [-1]
n_neighbors = range(50, len(test_data), 500)

param_grid = {
    'weights': weights,
    'n_neighbors': n_neighbors,
    'n_jobs': n_jobs,
    'p': p,
}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler; it is not applied to any data in the lines shown here
scaler = StandardScaler()
# Realized we can only scale the age; the rest of the features are categorical