# Shelter Animal Outcomes Data Training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import copy

## Load train data

In [2]:
train_file = "train.csv"
with open( train_file, "rb" ) as header_file:
    headers = header_file.readline().strip().split(',')
new_headers = headers[3:4] + headers[5:]
print headers
train_df = pd.read_csv( train_file, sep = ',', usecols = new_headers )
print len( train_df )
train_df.head( 5 )

['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']
26729


Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


### Preprocessing and imputation

### AgeuponOutcome: years, months, days to months

In [3]:
# empty data count
ds_age = train_df[ 'AgeuponOutcome' ]
vals = ds_age.value_counts().sum()
print len( train_df ) - vals
print ds_age.isnull().sum()
#print ds_age.value_counts()

18
18


In [4]:
# need to deal with 0 years and nan values
# nan 18 + 0 years 22 = 40

In [5]:
def age_to_days( str_age, set_nan_value = 0 ):
    if type( str_age ) is str:
        days = 0
        las = str_age.strip().split()
        if 'year' in las[1]:
            days = int( las[0] ) * 365
        elif 'week' in las[1]:
            days = int( las[0] ) * 7
        elif 'month' in las[1]:
            days = int( las[0] ) * 30
        elif 'day' in las[1]:
            days = int( las[0] )
        else:                
            raise ValueError( "'{}' is not in the type list".format(las[1]) )
        return days
    elif str_age is np.nan:
        return set_nan_value
    else:
        raise TypeError( "{} is not supported type".format(type( str_age )) )

In [6]:
train_df['AgeuponOutcome'] = train_df['AgeuponOutcome'].apply( age_to_days )

In [7]:
# mean includes 0 is 793.5959818923267
print 'mean with 0s: ', train_df[ 'AgeuponOutcome' ].mean()
notzero = train_df[ train_df[ 'AgeuponOutcome' ] != 0 ][ 'AgeuponOutcome' ]
# mean exclues 0 is 794.785379744
print 'mean without 0s: ', notzero.mean()

mean with 0s:  793.595981892
mean without 0s:  794.785379744


In [8]:
train_df[ 'AgeuponOutcome' ] = train_df[ 'AgeuponOutcome' ].apply( lambda x : 795 if x == 0 else x )
train_df[ train_df[ 'AgeuponOutcome' ] == 0 ]
# if the DF is empty, it is working

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color


### Days to Months

In [9]:
train_df['AgeuponOutcome'] = train_df['AgeuponOutcome'].apply( lambda x: x // 30 )
train_df.head(5)

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Return_to_owner,Dog,Neutered Male,12,Shetland Sheepdog Mix,Brown/White
1,Euthanasia,Cat,Spayed Female,12,Domestic Shorthair Mix,Cream Tabby
2,Adoption,Dog,Neutered Male,24,Pit Bull Mix,Blue/White
3,Transfer,Cat,Intact Male,0,Domestic Shorthair Mix,Blue Cream
4,Transfer,Dog,Neutered Male,24,Lhasa Apso/Miniature Poodle,Tan


### SexuponOutcome imputation

In [10]:
train_df['SexuponOutcome'] = train_df['SexuponOutcome'].apply(lambda x: x.strip() if type( x ) is str else  x)
#train_df['SexuponOutcome'] = train_df['SexuponOutcome'].apply( lambda x: 'Unknown' if x == None else x )
train_df['SexuponOutcome'] = train_df['SexuponOutcome'].fillna( 'Unknown' )
print train_df['SexuponOutcome'].value_counts()
print train_df['SexuponOutcome'].value_counts().sum()
# the folling code invoke '1' means there is a nan value
# not anymore since fillna()
print len( train_df ) - train_df['SexuponOutcome'].value_counts().sum()
train_df[ train_df['SexuponOutcome'].isnull() ]

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1094
Name: SexuponOutcome, dtype: int64
26729
0


Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color


In [11]:
train_df['Sex'] = train_df['SexuponOutcome'].apply( lambda x: x.strip().split()[-1] )
asp = lambda x: 'Castrated' if x in ['Neutered', 'Spayed'] else x
train_df['Processed'] = train_df['SexuponOutcome'].apply( lambda x: x.strip().split()[0] ).apply(asp)
train_df_w = copy.deepcopy( train_df )
train_df_w.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Processed
0,Return_to_owner,Dog,Neutered Male,12,Shetland Sheepdog Mix,Brown/White,Male,Castrated
1,Euthanasia,Cat,Spayed Female,12,Domestic Shorthair Mix,Cream Tabby,Female,Castrated
2,Adoption,Dog,Neutered Male,24,Pit Bull Mix,Blue/White,Male,Castrated
3,Transfer,Cat,Intact Male,0,Domestic Shorthair Mix,Blue Cream,Male,Intact
4,Transfer,Dog,Neutered Male,24,Lhasa Apso/Miniature Poodle,Tan,Male,Castrated


## Label encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

fit_headers = [ 'OutcomeType', 'AnimalType', 'SexuponOutcome','Sex', 'Processed', 'Breed', 'Color' ]

# Exclude Breed and Color
fit_headers = fit_headers[:5]

lel = [ LabelEncoder() for i in range( len(fit_headers) ) ]
fitted_lel = [ l.fit( train_df[ fit_headers[i] ] ) for i, l in enumerate(lel) ]
for i, l in enumerate( fitted_lel ):
    train_df[fit_headers[i]] = l.transform( train_df[fit_headers[i]] )

In [13]:
for i in lel:
    print i.classes_

['Adoption' 'Died' 'Euthanasia' 'Return_to_owner' 'Transfer']
['Cat' 'Dog']
['Intact Female' 'Intact Male' 'Neutered Male' 'Spayed Female' 'Unknown']
['Female' 'Male' 'Unknown']
['Castrated' 'Intact' 'Unknown']


In [14]:
train_df.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Processed
0,3,1,2,12,Shetland Sheepdog Mix,Brown/White,1,0
1,2,0,3,12,Domestic Shorthair Mix,Cream Tabby,0,0
2,0,1,2,24,Pit Bull Mix,Blue/White,1,0
3,4,0,1,0,Domestic Shorthair Mix,Blue Cream,1,1
4,4,1,2,24,Lhasa Apso/Miniature Poodle,Tan,1,0


In [15]:
#cols = df.columns.tolist()
#cols = cols[-1:] + cols[:-1]
#df = df[cols]  #    OR    df = df.ix[:, cols]

## Train data

In [53]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

In [17]:
# log loss function to check out the train result
from math import log
def evaluation_SAO( prediction_set, dfy ):
    len_pred = len( prediction_set )
    eval_ret = [ [ max( min( c, (1.0 - (10 ** -15)) ), ( 10 ** -15 ) ) for c in r ] for r in prediction_set ]
    sum_c = [ sum( [ log(xc) * yc for xc, yc in zip( xr, yr ) ] ) for xr, yr in zip( eval_ret, dfy ) ]
    sum_r = sum( sum_c ) / len_pred * -1
    return sum_r

def ytolist( dfy ):
    len_outcomes = 5
    return [ [ 1 if c is int(r) else 0 for c in range( len_outcomes )  ] for r in dfy ]

## X: AnimalType, Sex, Processed

In [18]:
print train_df.columns.tolist()
dfx_asp = train_df[ ['AnimalType', 'Sex', 'Processed' ] ]
dfy = train_df[ 'OutcomeType' ]
print dfx_asp.tail()

['OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Sex', 'Processed']
       AnimalType  Sex  Processed
26724           0    1          1
26725           0    0          0
26726           1    1          0
26727           0    1          1
26728           0    1          1


In [19]:
xtrain, xtest, ytrain, ytest = train_test_split( dfx_asp, dfy, test_size=0.2 )

### GausianNB

In [20]:
mdl_gaussian = GaussianNB()
mdl_gaussian.fit( xtrain, ytrain )
print classification_report( ytest, mdl_gaussian.predict( xtest ) )

             precision    recall  f1-score   support

          0       0.57      0.97      0.72      2171
          1       0.00      0.00      0.00        38
          2       0.00      0.00      0.00       307
          3       0.00      0.00      0.00       950
          4       0.72      0.62      0.67      1880

avg / total       0.48      0.61      0.53      5346



  'precision', 'predicted', average, warn_for)


In [21]:
print 'train accuracy: ', mdl_gaussian.score( xtrain, ytrain )
print 'test accuracy: ', mdl_gaussian.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_gaussian.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_gaussian.predict_proba( xtest ), ytolist( ytest ) )

train accuracy:  0.606463078146
test accuracy:  0.613355780022
submit score train:  1.20742185545
submit score test:  1.16118523063


### Support vector machine classifier

In [22]:
mdl_svc = SVC( probability = True )
mdl_svc.fit( xtrain, ytrain )
print classification_report( ytest, mdl_svc.predict( xtest ) )

             precision    recall  f1-score   support

          0       0.57      0.97      0.72      2171
          1       0.00      0.00      0.00        38
          2       0.00      0.00      0.00       307
          3       0.00      0.00      0.00       950
          4       0.72      0.62      0.67      1880

avg / total       0.48      0.61      0.53      5346



In [23]:
print 'train accuracy: ', mdl_svc.score( xtrain, ytrain )
print 'test accuracy: ', mdl_svc.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_svc.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_svc.predict_proba( xtest ), ytolist( ytest ) )

train accuracy:  0.606463078146
test accuracy:  0.613355780022
submit score train:  1.02176223409
submit score test:  1.00994870647


### Logistic regression

In [24]:
mdl_logistic = LogisticRegression()
mdl_logistic.fit( xtrain, ytrain )
print classification_report( ytest, mdl_logistic.predict( xtest ) )

             precision    recall  f1-score   support

          0       0.57      0.97      0.72      2171
          1       0.00      0.00      0.00        38
          2       0.00      0.00      0.00       307
          3       0.00      0.00      0.00       950
          4       0.72      0.62      0.67      1880

avg / total       0.48      0.61      0.53      5346



In [25]:
print 'train accuracy: ', mdl_logistic.score( xtrain, ytrain )
print 'test accuracy: ', mdl_logistic.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_logistic.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_logistic.predict_proba( xtest ), ytolist( ytest ) )

train accuracy:  0.606463078146
test accuracy:  0.613355780022
submit score train:  1.00277511937
submit score test:  0.991438894883


## X: AnimalType, Sex, Processed, AgeuponOutcome

In [26]:
print train_df.columns.tolist()
dfx_aspa = train_df[ ['AnimalType', 'Sex', 'Processed', 'AgeuponOutcome' ] ]
dfy = train_df[ 'OutcomeType' ]
print dfx_aspa.tail()

['OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Sex', 'Processed']
       AnimalType  Sex  Processed  AgeuponOutcome
26724           0    1          1               1
26725           0    0          0               3
26726           1    1          0              48
26727           0    1          1               0
26728           0    1          1              12


In [27]:
xtrain, xtest, ytrain, ytest = train_test_split( dfx_asp, dfy, test_size=0.2 )

In [28]:
mdl_logistic = LogisticRegression()
mdl_logistic.fit( xtrain, ytrain )
print classification_report( ytest, mdl_logistic.predict( xtest ) )
print
print 'train accuracy: ', mdl_logistic.score( xtrain, ytrain )
print 'test accuracy: ', mdl_logistic.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_logistic.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_logistic.predict_proba( xtest ), ytolist( ytest ) )

             precision    recall  f1-score   support

          0       0.57      0.97      0.72      2212
          1       0.00      0.00      0.00        29
          2       0.00      0.00      0.00       296
          3       0.00      0.00      0.00       931
          4       0.73      0.61      0.66      1878

avg / total       0.49      0.62      0.53      5346


train accuracy:  0.605901884675
test accuracy:  0.615600448934
submit score train:  1.00366623844
submit score test:  0.987393607024


In [29]:
mdl_svc = SVC( probability = True )
mdl_svc.fit( xtrain, ytrain )
print classification_report( ytest, mdl_svc.predict( xtest ) )
print
print 'train accuracy: ', mdl_svc.score( xtrain, ytrain )
print 'test accuracy: ', mdl_svc.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_svc.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_svc.predict_proba( xtest ), ytolist( ytest ) )

             precision    recall  f1-score   support

          0       0.57      0.97      0.72      2212
          1       0.00      0.00      0.00        29
          2       0.00      0.00      0.00       296
          3       0.00      0.00      0.00       931
          4       0.73      0.61      0.66      1878

avg / total       0.49      0.62      0.53      5346


train accuracy:  0.605901884675
test accuracy:  0.615600448934
submit score train:  1.02314137506
submit score test:  1.00427512329


## X: AnimalType, AgeuponOutcome

In [30]:
print train_df.columns.tolist()
dfx_aspa = train_df[ ['AnimalType', 'AgeuponOutcome' ] ]
dfy = train_df[ 'OutcomeType' ]
print dfx_aspa.tail()

['OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Sex', 'Processed']
       AnimalType  AgeuponOutcome
26724           0               1
26725           0               3
26726           1              48
26727           0               0
26728           0              12


In [31]:
xtrain, xtest, ytrain, ytest = train_test_split( dfx_asp, dfy, test_size=0.2 )

In [32]:
mdl_logistic = LogisticRegression()
mdl_logistic.fit( xtrain, ytrain )
print classification_report( ytest, mdl_logistic.predict( xtest ) )
print
print 'train accuracy: ', mdl_logistic.score( xtrain, ytrain )
print 'test accuracy: ', mdl_logistic.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_logistic.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_logistic.predict_proba( xtest ), ytolist( ytest ) )

             precision    recall  f1-score   support

          0       0.56      0.96      0.71      2182
          1       0.00      0.00      0.00        52
          2       0.00      0.00      0.00       276
          3       0.00      0.00      0.00       941
          4       0.72      0.62      0.67      1895

avg / total       0.49      0.61      0.53      5346


train accuracy:  0.606556610391
test accuracy:  0.612981668537
submit score train:  1.00084760591
submit score test:  0.998282610202


## Extending features

In [33]:
print train_df.columns.tolist()
dfx_aspa = train_df[ ['AnimalType', 'AgeuponOutcome' ] ]
dfy = train_df[ 'OutcomeType' ]
print dfx_aspa.tail()

['OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Sex', 'Processed']
       AnimalType  AgeuponOutcome
26724           0               1
26725           0               3
26726           1              48
26727           0               0
26728           0              12


In [34]:
xtrain, xtest, ytrain, ytest = train_test_split( dfx_asp, dfy, test_size=0.2 )

In [35]:
mdl_logistic = LogisticRegression()
mdl_logistic.fit( xtrain, ytrain )
print classification_report( ytest, mdl_logistic.predict( xtest ) )
print
print 'train accuracy: ', mdl_logistic.score( xtrain, ytrain )
print 'test accuracy: ', mdl_logistic.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( mdl_logistic.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( mdl_logistic.predict_proba( xtest ), ytolist( ytest ) )

             precision    recall  f1-score   support

          0       0.56      0.97      0.71      2137
          1       0.00      0.00      0.00        41
          2       0.00      0.00      0.00       330
          3       0.00      0.00      0.00       928
          4       0.70      0.61      0.65      1910

avg / total       0.48      0.61      0.52      5346


train accuracy:  0.608427255296
test accuracy:  0.605499438833
submit score train:  0.998670180759
submit score test:  1.00798702746


## nltk for color and breed

In [36]:
print len( train_df_w )
train_df_w.head()

26729


Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Processed
0,Return_to_owner,Dog,Neutered Male,12,Shetland Sheepdog Mix,Brown/White,Male,Castrated
1,Euthanasia,Cat,Spayed Female,12,Domestic Shorthair Mix,Cream Tabby,Female,Castrated
2,Adoption,Dog,Neutered Male,24,Pit Bull Mix,Blue/White,Male,Castrated
3,Transfer,Cat,Intact Male,0,Domestic Shorthair Mix,Blue Cream,Male,Intact
4,Transfer,Dog,Neutered Male,24,Lhasa Apso/Miniature Poodle,Tan,Male,Castrated


In [37]:
xclr = train_df_w[ 'Color' ]
xbrd = train_df_w[ 'Breed' ]
xat = train_df_w[ 'AnimalType' ]
xso = train_df_w[ 'Sex' ]
xsp = train_df_w[ 'Processed' ]

yot = train_df_w[ 'OutcomeType']

rep = lambda x : x.replace( '/', ' ' ).strip().split()
spl = lambda x : x.split() if type(x) is str else ['none']
xclr = xclr.apply( rep )
xbrd = xbrd.apply( rep )
xso = xso.apply( spl )
xsp = xsp.apply( spl )
xat = xat.apply( spl )

print xclr.head(3), len( xclr )
print xbrd.head(3), len( xbrd )
print xso.head(3), len( xso )
print xsp.head(3), len( xsp )
print xat.head(3), len( xat )

0    [Brown, White]
1    [Cream, Tabby]
2     [Blue, White]
Name: Color, dtype: object 26729
0     [Shetland, Sheepdog, Mix]
1    [Domestic, Shorthair, Mix]
2              [Pit, Bull, Mix]
Name: Breed, dtype: object 26729
0      [Male]
1    [Female]
2      [Male]
Name: Sex, dtype: object 26729
0    [Castrated]
1    [Castrated]
2    [Castrated]
Name: Processed, dtype: object 26729
0    [Dog]
1    [Cat]
2    [Dog]
Name: AnimalType, dtype: object 26729


In [38]:
# Words to dict
def feature_format(words):
    return dict([(word, True) for word in words] )

In [39]:
labeled_brd = [ (feature_format( c ), str( y )) for c, y in zip( xbrd, yot ) ]
labeled_brd[:5]

[({'Mix': True, 'Sheepdog': True, 'Shetland': True}, 'Return_to_owner'),
 ({'Domestic': True, 'Mix': True, 'Shorthair': True}, 'Euthanasia'),
 ({'Bull': True, 'Mix': True, 'Pit': True}, 'Adoption'),
 ({'Domestic': True, 'Mix': True, 'Shorthair': True}, 'Transfer'),
 ({'Apso': True, 'Lhasa': True, 'Miniature': True, 'Poodle': True},
  'Transfer')]

In [40]:
procd_train = \
[ [ feature_format( b + s + p + a ), str( y ) ] for b, s, p, a, y in zip( xbrd, xso, xsp, xat, yot ) ]

In [41]:
import nltk
from nltk.classify import NaiveBayesClassifier

pt_classifier = nltk.NaiveBayesClassifier.train( procd_train )
print( nltk.classify.accuracy( pt_classifier, procd_train ) )
print pt_classifier.show_most_informative_features()

0.48355718508
Most Informative Features
                 Unknown = True             Died : Return =     37.4 : 1.0
                Brittany = True             Died : Transf =     28.6 : 1.0
               Castrated = None             Died : Adopti =     24.2 : 1.0
                   Akita = True             Died : Transf =     20.4 : 1.0
                  Intact = True             Died : Adopti =     20.4 : 1.0
               Himalayan = True             Died : Transf =     15.9 : 1.0
                  Bichon = True           Return : Transf =     15.1 : 1.0
                   Frise = True           Return : Transf =     15.1 : 1.0
                    Manx = True             Died : Return =     14.5 : 1.0
                Standard = True             Died : Transf =     14.0 : 1.0
None


In [42]:
def get_classifier_prob( classifier, dfx ):
    dfx_processed = dfx.apply( feature_format )
    labels = classifier.labels()
    probs = []
    for i in dfx_processed:
        probs.append( [classifier.prob_classify( i ).prob( lbl ) for lbl in labels] )    
    
    return probs

In [43]:
dfx = [ b + s + p + a for b, s, p, a, y in zip( xbrd, xso, xsp, xat, yot ) ]
dss = pd.DataFrame( data = {'BAPS':dfx} )[ 'BAPS' ]
dss.head()

0      [Shetland, Sheepdog, Mix, Male, Castrated, Dog]
1    [Domestic, Shorthair, Mix, Female, Castrated, ...
2               [Pit, Bull, Mix, Male, Castrated, Dog]
3        [Domestic, Shorthair, Mix, Male, Intact, Cat]
4    [Lhasa, Apso, Miniature, Poodle, Male, Castrat...
Name: BAPS, dtype: object

In [44]:
baps_prob = get_classifier_prob( pt_classifier, dss )
print baps_prob[:3]

[[0.03339241397059475, 0.22569924581126594, 0.7119017580173164, 0.0010514089234651056, 0.027955173277357744], [0.4783557483292614, 0.47770905503042643, 0.004423230211185791, 0.008339230051780723, 0.031172736377344943], [0.03894975321870475, 0.20339490336365945, 0.6593835050204871, 0.00013253426710030418, 0.09813930413004789]]


In [45]:
pt_classifier.labels()

['Transfer', 'Adoption', 'Return_to_owner', 'Died', 'Euthanasia']

In [46]:
# inplace swap
def swap( target, t1, t2 ):
    target[t1], target[t2] = target[t2], target[t1]

swap_seq = ((1,0), (3,1), (4,2), (4,3))

def seq_swap( target, seq = swap_seq):
    for s in seq:
        swap( target, s[0], s[1] )
        
def combine_probs( rslts, ratio_list ):
    return [[sum([rslts[k][i][j] * ratio_list[k] for k in range(len(rslts))]) \
      for j in range(len(rslts[0][i]))] for i in range(len(rslts[0]))]

In [47]:
print train_df.columns.tolist()
dfx_aspa = train_df[ ['AnimalType', 'Sex', 'Processed']]#, 'AgeuponOutcome' ] ]
dfy = train_df[ 'OutcomeType' ]
print dfx_aspa.tail()

['OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Sex', 'Processed']
       AnimalType  Sex  Processed
26724           0    1          1
26725           0    0          0
26726           1    1          0
26727           0    1          1
26728           0    1          1


In [48]:
mdl_logistic = LogisticRegression()
mdl_logistic.fit( xtrain, ytrain )
print classification_report( ytrain, mdl_logistic.predict( xtrain ) )
print mdl_logistic.predict_proba( xtrain )

             precision    recall  f1-score   support

          0       0.56      0.97      0.71      8632
          1       0.00      0.00      0.00       156
          2       0.00      0.00      0.00      1225
          3       0.00      0.00      0.00      3858
          4       0.72      0.62      0.67      7512

avg / total       0.48      0.61      0.52     21383

[[ 0.4904458   0.00188968  0.03555986  0.31590397  0.15620068]
 [ 0.03914637  0.00944749  0.12926211  0.18749163  0.6346524 ]
 [ 0.65277347  0.00463521  0.02637193  0.06263962  0.25357977]
 ..., 
 [ 0.51755073  0.00255572  0.04352527  0.27061326  0.16575502]
 [ 0.07243515  0.0229778   0.09848969  0.03231524  0.77378212]
 [ 0.03383372  0.00699101  0.10744498  0.22225196  0.62947834]]


In [49]:
# Can not distinct between rto and adpt
# 1, 2 places share others' prob
# ASP has highter score than ASPA

### OvR

In [60]:
ovrclf = OneVsRestClassifier( LogisticRegression(), n_jobs = 5 )
ovrclf.fit( xtrain, ytrain )
print classification_report( ytrain, ovrclf.predict( xtrain ) )
print ovrclf.predict_proba( xtrain )
print 'train accuracy: ', ovrclf.score( xtrain, ytrain )
print 'test accuracy: ', ovrclf.score( xtest, ytest )
print 'submit score train: ', evaluation_SAO( ovrclf.predict_proba( xtrain ), ytolist( ytrain ) )
print 'submit score test: ', evaluation_SAO( ovrclf.predict_proba( xtest ), ytolist( ytest ) )

             precision    recall  f1-score   support

          0       0.56      0.97      0.71      8632
          1       0.00      0.00      0.00       156
          2       0.00      0.00      0.00      1225
          3       0.00      0.00      0.00      3858
          4       0.72      0.62      0.67      7512

avg / total       0.48      0.61      0.52     21383

[[ 0.4904458   0.00188968  0.03555986  0.31590397  0.15620068]
 [ 0.03914637  0.00944749  0.12926211  0.18749163  0.6346524 ]
 [ 0.65277347  0.00463521  0.02637193  0.06263962  0.25357977]
 ..., 
 [ 0.51755073  0.00255572  0.04352527  0.27061326  0.16575502]
 [ 0.07243515  0.0229778   0.09848969  0.03231524  0.77378212]
 [ 0.03383372  0.00699101  0.10744498  0.22225196  0.62947834]]
train accuracy:  0.608427255296
test accuracy:  0.605499438833
submit score train:  0.998670180759
submit score test:  1.00798702746


### Adoption vs Return to owner

In [84]:
avsr_df = train_df[ ['OutcomeType', 'AnimalType', 'AgeuponOutcome'] ]
avsrs = avsr_df.groupby( 'OutcomeType' )