# Feature Engineering
This document is to transform all the data and export it for use in predictive modeling

In [162]:
import numpy as np
import pandas as pd
import datetime

In [163]:
raw_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [164]:
raw_data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [165]:
#Separate age into a number and a string
raw_data['Age_num'] = raw_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
raw_data['Age_str'] = raw_data['AgeuponOutcome'].str[2:].str.strip()

test_data['Age_num'] = test_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
test_data['Age_str'] = test_data['AgeuponOutcome'].str[2:].str.strip()

#Create an age factor column based on each possible unit of measure
raw_data['Age_factor'] = 0
raw_data['Age_factor'][raw_data['Age_str'] == 'years'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'months'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'year'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'weeks'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'month'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'days'] = 1
raw_data['Age_factor'][raw_data['Age_str'] == 'week'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'day'] = 1

test_data['Age_factor'] = 0
test_data['Age_factor'][test_data['Age_str'] == 'years'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'months'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'year'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'weeks'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'month'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'days'] = 1
test_data['Age_factor'][test_data['Age_str'] == 'week'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'day'] = 1

#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
raw_data['Age_num'] = raw_data['Age_num'].astype(float)
raw_data['Age_days'] = raw_data['Age_num'].mul(raw_data['Age_factor'], axis='index').fillna(0)
raw_data.drop('AgeuponOutcome', 1, inplace = True)
raw_data.drop('Age_num', 1, inplace = True)
raw_data.drop('Age_str', 1, inplace = True)
raw_data.drop('Age_factor', 1, inplace = True)

test_data['Age_num'] = test_data['Age_num'].astype(float)
test_data['Age_days'] = test_data['Age_num'].mul(test_data['Age_factor'], axis='index').fillna(0)
test_data.drop('AgeuponOutcome', 1, inplace = True)
test_data.drop('Age_num', 1, inplace = True)
test_data.drop('Age_str', 1, inplace = True)
test_data.drop('Age_factor', 1, inplace = True)

#Split out sex data into fixed/neutered and gender
split_raw_data = raw_data['SexuponOutcome'].str.split().apply(pd.Series)
split_raw_data.columns = ['fixed', 'gender']
raw_data.merge(split_raw_data, how='inner', left_index=True, right_index=True)
raw_data.drop('SexuponOutcome', 1, inplace=True)

split_test_data = test_data['SexuponOutcome'].str.split().apply(pd.Series)
split_test_data.columns = ['fixed', 'gender']
test_data.merge(split_test_data, how='inner', left_index=True, right_index=True)
test_data.drop('SexuponOutcome', 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [166]:
#Build up date features
raw_data['DateTime'] = raw_data['DateTime'].astype(np.datetime64)
raw_data['Year'] = raw_data['DateTime'].map(lambda x: x.year)
raw_data['Month'] = raw_data['DateTime'].map(lambda x: x.month)
raw_data['Day'] = raw_data['DateTime'].map(lambda x: x.day)
raw_data['Hour'] = raw_data['DateTime'].map(lambda x: x.hour)
raw_data['Minute'] = raw_data['DateTime'].map(lambda x:x.minute)


test_data['DateTime'] = test_data['DateTime'].astype(np.datetime64)
test_data['Year'] = test_data['DateTime'].map(lambda x: x.year)
test_data['Month'] = test_data['DateTime'].map(lambda x: x.month)
test_data['Day'] = test_data['DateTime'].map(lambda x: x.day)
test_data['Hour'] = test_data['DateTime'].map(lambda x: x.hour)
test_data['Minute'] = test_data['DateTime'].map(lambda x:x.minute)


In [167]:
#Does the breed contain a mix?
raw_data['Mix'] = raw_data['Breed'].map(lambda x: x.find('Mix') != -1)
test_data['Mix'] = test_data['Breed'].map(lambda x: x.find('Mix') != -1)
#Split breed into first and second
raw_data['First_Breed'] = raw_data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
test_data['First_Breed'] = raw_data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
raw_data['Second_Breed'] = raw_data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)
test_data['Second_Breed'] = raw_data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)
#Does the animal have a name?
raw_data['has_name'] = raw_data['Name'].isnull()
test_data['has_name'] = test_data['Name'].isnull()

In [168]:
#Encode categorical variables as integers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [169]:
text_cols = ['Name', 'OutcomeType', 'OutcomeSubtype', 'AnimalType', 'Breed', 'Color', 'First_Breed', 'Second_Breed']

In [170]:
raw_data.dtypes

AnimalID                  object
Name                      object
DateTime          datetime64[ns]
OutcomeType               object
OutcomeSubtype            object
AnimalType                object
Breed                     object
Color                     object
Age_days                 float64
Year                       int64
Month                      int64
Day                        int64
Hour                       int64
Minute                     int64
Mix                         bool
First_Breed               object
Second_Breed              object
has_name                    bool
dtype: object

In [171]:
test_data.dtypes

ID                       int64
Name                    object
DateTime        datetime64[ns]
AnimalType              object
Breed                   object
Color                   object
Age_days               float64
Year                     int64
Month                    int64
Day                      int64
Hour                     int64
Minute                   int64
Mix                       bool
First_Breed             object
Second_Breed            object
has_name                  bool
dtype: object

In [172]:
#transform original data into integer mappings
all_data = pd.concat([raw_data, test_data])
for label in text_cols:
    le.fit(all_data[label])
    raw_data[label] = le.transform(raw_data[label])
    if label not in ['OutcomeType', 'OutcomeSubtype']:
        print label
        test_data[label] = le.transform(test_data[label])

Name
AnimalType
Breed
Color
First_Breed
Second_Breed


In [173]:
#Export
raw_data.to_csv('train_data_engineered.csv', index=False)
test_data.to_csv('test_data_engineered.csv', index=False)