### Initial Setup

In [42]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [43]:
# Read in data
data = pd.read_csv('train_data_engineered.csv')
test_data = pd.read_csv('test_data_engineered.csv')

In [None]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

In [46]:
# Seperate out label and data
labels = raw_data.OutcomeType
data = raw_data.drop('OutcomeType', 1).drop('OutcomeSubtype', 1).drop('AnimalID', 1).drop('DateTime', 1)

### EDA

In [None]:
data.Name.value_counts()[:20]

In [None]:
data['DateTime'].value_counts()[:20]

In [None]:
data.OutcomeSubtype.value_counts()

In [None]:
data.AnimalType.value_counts()

In [None]:
data.SexuponOutcome.value_counts()

In [None]:
data.AgeuponOutcome.value_counts()

In [None]:
data.Breed.value_counts()

In [None]:
data.Color.value_counts()

In [None]:
data['Age_num'] = data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)

In [None]:
data['AgeuponOutcome'].str[2:].str.strip().value_counts()

In [None]:
data['Age_str'] = data['AgeuponOutcome'].str[2:].str.strip()

In [None]:
#Create an age factor column based on each possible unit of measure
data['Age_factor'] = 0
data['Age_factor'][data['Age_str'] == 'years'] = 365
data['Age_factor'][data['Age_str'] == 'months'] = 30
data['Age_factor'][data['Age_str'] == 'year'] = 365
data['Age_factor'][data['Age_str'] == 'weeks'] = 7
data['Age_factor'][data['Age_str'] == 'month'] = 30
data['Age_factor'][data['Age_str'] == 'days'] = 1
data['Age_factor'][data['Age_str'] == 'week'] = 7
data['Age_factor'][data['Age_str'] == 'day'] = 1

In [None]:
data.head()

In [None]:
#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
data['Age_num'] = data['Age_num'].astype(float)
data['Age_days'] = data['Age_num'].mul(data['Age_factor'], axis='index')
data.drop('OutcomeAge', 1, inplace = True)
data.drop('Age_num', 1, inplace = True)
data.drop('Age_str', 1, inplace = True)
data.drop('Age_factor', 1, inplace = True)

#Split out sex data into fixed/neutered and gender
split_data = data['SexuponOutcome'].str.split().apply(pd.Series)
split_data.columns = ['fixed', 'gender']
data.merge(split_data, how='inner', left_index=True, right_index=True)

In [None]:
data.head()

### Data manipulation: Convert string to numeric categories


In [None]:
# Replace animal type to numeric: Dog = 1, Cat = 0
data.replace(to_replace='Dog', value=1, inplace=True)
data.replace(to_replace='Cat', value=0, inplace=True)

test_data.replace(to_replace='Dog', value=1, inplace=True)
test_data.replace(to_replace='Cat', value=0, inplace=True)


In [None]:
# Replace sex upon outcome: intact = 1, otherwise = 0, unknown = 99 (temp)
data.replace(to_replace='Neutered Male', value=0, inplace=True)
data.replace(to_replace='Spayed Female', value=0, inplace=True)
data.replace(to_replace='Intact Male', value=1, inplace=True)
data.replace(to_replace='Intact Female', value=1, inplace=True)
data.replace(to_replace='Unknown', value=99, inplace=True)
data.replace(to_replace='nan', value=99, inplace=True)

test_data.replace(to_replace='Neutered Male', value=0, inplace=True)
test_data.replace(to_replace='Spayed Female', value=0, inplace=True)
test_data.replace(to_replace='Intact Male', value=1, inplace=True)
test_data.replace(to_replace='Intact Female', value=1, inplace=True)
test_data.replace(to_replace='Unknown', value=99, inplace=True)
test_data.replace(to_replace='nan', value=99, inplace=True)


In [None]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Return_to_owner', value=1, inplace=True)
labels.replace(to_replace='Adoption', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Transfer', value=4, inplace=True)
labels.replace(to_replace='Died', value=5, inplace=True)

In [45]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

### Determine features that are applicable

In [None]:
# Determine a feature set
feature_names = ['AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']
features = train_data.loc[:, feature_names]

In [None]:
# Just for test, to be deleted
sub_features = features.loc[:,['AnimalType','SexuponOutcome']]

### Apply machine learning techniques

In [None]:
# Logictic regression
mdl = LogisticRegression()
mdl.fit(sub_features, train_labels)
prediction = mdl.predict(dev_data.loc[:,['AnimalType','SexuponOutcome']])

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)

In [None]:
# Make predictions of test data
prediction = mdl.predict_proba(test_data.loc[:,['AnimalType','SexuponOutcome']])

In [None]:
prediction

In [None]:
# Format an output file
#out = np.zeros((prediction.shape[0], 5))

#for i in range(out.shape[0]):
#    out[i, prediction[i]] = 1

# Convert to panda dataframe
out = pd.DataFrame(data=prediction, index=test_data.ID, columns=['Return_to_owner','Adoption','Euthanasia','Transfer','Died'])

In [None]:
out.head()

In [None]:
out.to_csv('results.csv')

### SVM

In [3]:
from sklearn import svm

In [6]:
raw_data.columns

Index([u'AnimalID', u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype',
       u'AnimalType', u'Breed', u'Color', u'Age_days', u'Year', u'Month',
       u'Day', u'Hour', u'Minute', u'Mix', u'First_Breed', u'Second_Breed',
       u'has_name'],
      dtype='object')

In [39]:
features = raw_data.columns.drop('OutcomeType').drop('OutcomeSubtype').drop('AnimalID').drop('DateTime', 1)
pred = ['OutcomeType']

In [13]:
raw_data[features].dtypes

Name              int64
DateTime         object
AnimalType        int64
Breed             int64
Color             int64
Age_days        float64
Year              int64
Month             int64
Day               int64
Hour              int64
Minute            int64
Mix                bool
First_Breed       int64
Second_Breed      int64
has_name           bool
dtype: object

In [47]:
svm_mod.fit(test_data[features], test_data[pred])

KeyError: "['OutcomeType'] not in index"

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [None]:
le.fit(data['AnimalType'])

In [None]:
le.classes_

In [None]:
le.transform(data['AnimalType'])

In [None]:
le.fit(data['Breed'])

In [None]:
le.classes_

In [None]:
le.transform(data['Breed'])

In [None]:
data.columns

In [None]:
trans_data.head()

In [None]:
trans_data = pd.DataFrame(columns = data.columns)
for label in data.columns:
    print label
    if label != 'Age_days':
        le.fit(data[label])
        trans_data[label] = le.transform(data[label])
    else:
        trans_data[label] = data[label]

In [None]:
trans_data.head()

In [None]:
trans_data.drop('AnimalID', inplace=True, axis=1)

In [None]:
trans_data.shape

In [None]:
trans_data = trans_data.astype(float)

In [None]:
trans_data.dtypes

In [None]:
np.any(np.isfinite(trans_data))

In [None]:
np.any(np.isnan(trans_data))

In [None]:
trans_data = trans_data.dropna()

In [None]:
trans_data.shape

In [None]:
labels

In [None]:
trans_data

In [None]:
#we dropped NAs form the data, need to drop them from the labels by index
trans_labels = trans_data.merge(pd.DataFrame(labels), how='inner', left_index=True, right_index = True)['OutcomeType']

In [None]:
#Run the model
svm_mod = svm.SVC()
svm_mod.fit(trans_data[predictors], trans_labels)