### Initial Setup

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training and test CSV files (paths are relative to the working directory)
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
raw_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [3]:
# Examine the data a little bit: dimensions first, then the first five rows.
# A parenthesised single argument prints identically under the Python 2 print
# statement and the Python 3 print function, so this cell runs on either.
print(raw_data.shape)
print(raw_data.head())

(26729, 10)
  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cre

In [10]:
# Separate the target column (OutcomeType) from the remaining columns.
# .copy() gives `labels` its own storage, so in-place edits made to it later
# cannot write through to raw_data or trigger SettingWithCopyWarning.
labels = raw_data['OutcomeType'].copy()
# The axis is passed by keyword: the positional form drop(label, 1) is rejected
# by pandas >= 2.0, where every argument after `labels` is keyword-only.
data = raw_data.drop('OutcomeType', axis=1)

### Data manipulation: Convert string to numeric categories


In [11]:
# Encode animal type as a number: Dog = 1, Cat = 0.
# Only the AnimalType column is rewritten. A frame-wide replace would also
# change any other cell (for example a Name) that happens to equal 'Dog' or 'Cat'.
animal_type_codes = {'Dog': 1, 'Cat': 0}

# Same encoding for the training frame and the test frame.
# Series.replace leaves unmatched values untouched, so re-running this cell is harmless.
for frame in (data, test_data):
    frame['AnimalType'] = frame['AnimalType'].replace(animal_type_codes)


In [12]:
# Encode sex upon outcome: intact = 1, neutered/spayed = 0, unknown or missing = 99 (temp).
sex_codes = {
    'Neutered Male': 0,
    'Spayed Female': 0,
    'Intact Male': 1,
    'Intact Female': 1,
    'Unknown': 99,
}

# Same encoding for the training frame and the test frame.
for frame in (data, test_data):
    # Work on the SexuponOutcome column only, so an 'Unknown' stored in any
    # other column is left alone.
    encoded = frame['SexuponOutcome'].replace(sex_codes)
    # read_csv loads missing entries as real NaN values, not as the string
    # 'nan', so replace(to_replace='nan') never matched them; fillna() does.
    frame['SexuponOutcome'] = encoded.fillna(99)


In [13]:
# Encode the outcome label as an integer class id:
# Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
outcome_codes = {
    'Return_to_owner': 1,
    'Adoption': 2,
    'Euthanasia': 3,
    'Transfer': 4,
    'Died': 5,
}
# One dictionary-driven pass; the Series is still modified in place.
labels.replace(to_replace=outcome_codes, inplace=True)

In [14]:
# Hold out 30% of the rows as a development set; the fixed random_state keeps
# the split the same on every run.
train_data, dev_data, train_labels, dev_labels = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=0,
)

### Determine features that are applicable

In [15]:
# Candidate predictor columns, and the training rows restricted to them
feature_names = [
    'AnimalType',
    'SexuponOutcome',
    'AgeuponOutcome',
    'Breed',
    'Color',
]
features = train_data[feature_names]

In [16]:
# Temporary two-column subset for a first model (marked "to be deleted" by the author)
sub_features = features[['AnimalType', 'SexuponOutcome']]

### Apply machine learning techniques

In [17]:
# Logistic regression fitted on the two-column feature subset
mdl = LogisticRegression()
mdl.fit(sub_features, train_labels)
prediction = mdl.predict(dev_data.loc[:, ['AnimalType', 'SexuponOutcome']])

# Prediction accuracy: fraction of development rows whose predicted class
# equals the true label.
accuracy = np.mean(prediction == dev_labels)
# A single pre-formatted string prints the same text under the Python 2 print
# statement and the Python 3 print function.
print('Prediction accuracy: {}'.format(accuracy))

Prediction accuracy: 0.573512906846


In [18]:
# Per-class probabilities for every test row, using the same two columns the model was fitted on
prediction = mdl.predict_proba(test_data[['AnimalType', 'SexuponOutcome']])

In [19]:
# Display the probability matrix produced by predict_proba on the test data
prediction

array([[ 0.43788028,  0.0601568 ,  0.08797519,  0.40816452,  0.0058232 ],
       [ 0.25107766,  0.46956622,  0.04946202,  0.22662606,  0.00326805],
       [ 0.04058345,  0.53280522,  0.04787469,  0.36779416,  0.01094248],
       ..., 
       [ 0.07589957,  0.10143432,  0.09171219,  0.70995469,  0.02099922],
       [ 0.25107766,  0.46956622,  0.04946202,  0.22662606,  0.00326805],
       [ 0.43788028,  0.0601568 ,  0.08797519,  0.40816452,  0.0058232 ]])

In [20]:
# Build the output table: one row per test ID, one probability column per outcome.
# predict_proba orders its columns like mdl.classes_, so the headers are looked
# up from the fitted classes instead of being listed by hand. A hand-written
# list only lines up when the fitted classes are exactly 1..5 in that order.
outcome_names = {
    1: 'Return_to_owner',
    2: 'Adoption',
    3: 'Euthanasia',
    4: 'Transfer',
    5: 'Died',
}

# Convert to a pandas DataFrame indexed by the test-set ID column
out = pd.DataFrame(
    data=prediction,
    index=test_data.ID,
    columns=[outcome_names[class_id] for class_id in mdl.classes_],
)

In [21]:
# Preview the first five rows of the output table
out.head(n=5)

Unnamed: 0_level_0,Return_to_owner,Adoption,Euthanasia,Transfer,Died
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.43788,0.060157,0.087975,0.408165,0.005823
2,0.251078,0.469566,0.049462,0.226626,0.003268
3,0.040583,0.532805,0.047875,0.367794,0.010942
4,0.43788,0.060157,0.087975,0.408165,0.005823
5,0.251078,0.469566,0.049462,0.226626,0.003268


In [115]:
# Write the table to disk; the ID index is emitted as the first CSV column
out.to_csv('results.csv', index=True)