# Feature Engineering

Turn the training and test sets into feature matricies.

In [71]:
import datetime
import json
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

Delete unneeded columns, and split the outcomes into separate numpy arrays.

In [72]:
def create_outcomes_drop_cols(train, test):
    le = preprocessing.LabelEncoder()
    le.fit(train['OutcomeType'])
    outcomes = le.transform(train['OutcomeType'])
    train = train.drop(['AnimalID', 'OutcomeType', 'OutcomeSubtype'], axis = 1)
    test = test.drop(['ID'], axis = 1)
    return train, test, outcomes, le

train, test, outcomes, outcomes_le = create_outcomes_drop_cols(
    train, test)

Create the sex, intact/neutered, date, and age variables.

In [73]:
def create_sex_variables(data):
    SexuponOutcome = data['SexuponOutcome'].fillna('Unknown')
    results = []
    for row in SexuponOutcome:
        row = row.split(' ')
        if len(row) == 1:
            row = ['Unknown', 'Unknown']
        results.append(row)
    NeuteredSprayed, Sex = zip(
        *[['Neutered', x[1]] if x[0] == 'Spayed' else x for x in results])
    return (data.assign(Neutered = NeuteredSprayed).assign(Sex = Sex)
            .drop(['SexuponOutcome'], axis = 1))

def create_age_in_years(data):
    ages = list(data['AgeuponOutcome'].fillna('NA'))
    results = []
    units = {'days': 365.0, 'weeks': 52.0, 'months': 12.0}
    for age in ages:
        if age == 'NA':
            results.append('NA')
        else:
            duration, unit = age.split(' ')
            results.append(float(duration) / units.get(unit, 1.0))
    impute = np.mean([age for age in results if age != 'NA'])
    return (data
            .assign(Age = [age if age != 'NA' else impute for age in results])
            .drop(['AgeuponOutcome'], axis = 1))

def time_of_day(hour):
    if hour > 4 and hour < 12:
        return 'morning'
    elif hour >= 12 and hour < 18:
        return 'afternoon'
    else:
        return 'evening/night'
    
def day_of_the_week(DateTime):
    return datetime.datetime.strptime(DateTime, '%Y-%m-%d %H:%M:%S').weekday()

def create_date_variables(data):
    return (data
            .assign(Year = data.DateTime.map(lambda x: x[:4]))
            .assign(Month = data.DateTime.map(lambda x: x[5:7]))
            .assign(Day = data.DateTime.map(lambda x: day_of_the_week(x)))
            .assign(TimeOfDay = data.DateTime.map(
                lambda x: time_of_day(int(x[11:13]))))
            .drop(['DateTime'], axis = 1))

train = (train
         .pipe(create_sex_variables)
         .pipe(create_date_variables)
         .pipe(create_age_in_years))
test = (test
        .pipe(create_sex_variables)
        .pipe(create_date_variables)
        .pipe(create_age_in_years))

Create the Breed variables.

In [74]:
def assign_dog_breeds(data):
    
    # Read in the dog breeds JSON file
    with open('dogbreeds.json') as data_file:    
        dogbreeds = json.load(data_file)
    unique_breeds = set([breed for breeds in dogbreeds.values() for breed in breeds])
    breed_to_position = dict([(x, i) for i, x in enumerate(unique_breeds)])
        
    # Create indicator variables for the different breed types
    vectors = []
    for breed, animal in data[['Breed', 'AnimalType']].values.tolist():
        vector = [0] * len(unique_breeds)
        if animal == 'Dog':
            breed = dogbreeds[breed]
            for subbreed in breed:
                vector[breed_to_position[subbreed]] += 1
        vectors.append(vector)
    columns = [x[1] for x in sorted([(v, k) for k, v in breed_to_position.items()])]
    dogbreeds_df = pd.DataFrame(vectors, columns = columns)
    return pd.concat([data, dogbreeds_df], axis = 1)


def create_mix(data):
    data['NumDogBreeds'] = data[['Herding', 'Toy', 'Working', 'Sporting', 
       'Non-Sporting', 'Hound', 'Terrier']].sum(axis = 1)
    data['Mix'] = data['Breed'].map(lambda x: x.find('Mix'))
    data['Mix'] = np.where(data['Mix'] > 0, 1, 
                            np.where(data['NumDogBreeds'] > 1, 1, 0))
    return data


def create_hair_length(data):
    data['ShortHair'] = np.where(data['Breed'].map(
            lambda x: x.find('Short')) > 1, np.where(
            data['AnimalType'] == 'Cat', 1, 0), 0)
    data['MediumHair'] = np.where(data['Breed'].map(
            lambda x: x.find('Medium')) > 1, np.where(
            data['AnimalType'] == 'Cat', 1, 0), 0)
    data['LongHair'] = np.where(data['Breed'].map(
            lambda x: x.find('Long')) > 1, np.where(
            data['AnimalType'] == 'Cat', 1, 0), 0)
    return data


def create_breed_variables(data):
    return (data.pipe(assign_dog_breeds)
            .pipe(create_mix)
            .pipe(create_hair_length)
            .drop(['Breed'], axis = 1))
    
    
train = train.pipe(create_breed_variables)
test = test.pipe(create_breed_variables)

Color variable.

In [75]:
def extract_unique_colors(train):
    """
    Extract a set of unique colors from the training set with
    30 or more animals.
    """
    colors = {}
    for color in list(train['Color']):
        color = re.split('\W+', color)
        for subcolor in color:
            try:
                colors[subcolor] += 1
            except:
                colors[subcolor] = 1
    return set([k for k, v in colors.items() if v >= 30])


def create_color_variables(data, colors):
    vectors = []
    to_position = dict([(c, i) for i, c in enumerate(colors)])
    for color in list(data['Color']):
        vector = [0] * len(colors)
        color = re.split('\W+', color)
        for subcolor in color:
            try:
                vector[to_position[subcolor]] += 1
            except:
                pass
        vectors.append(vector)
    columns = ['Color_' + x[1] for x in sorted(
            [(v, k) for k, v in to_position.items()])]
    colors_df = pd.DataFrame(vectors, columns = columns)
    return pd.concat([data, colors_df], axis = 1).drop(['Color'], axis = 1)


colors = extract_unique_colors(train)
train = train.pipe(create_color_variables, colors = colors)
test = test.pipe(create_color_variables, colors = colors)

Whether or not the animal has a name.

In [78]:
def create_has_name(data):
    data['HasName'] = np.where(data['Name'].isnull(), 0, 1)
    return data.drop(['Name'], axis = 1)

train = train.pipe(create_has_name)
test = test.pipe(create_has_name)

Turn `AnimalType` into a binary variable.

In [79]:
def transform_animal_type(data):
    

Unnamed: 0,AnimalType,Neutered,Sex,Year,Month,Day,TimeOfDay,Age,Herding,Toy,...,Color_Gold,Color_Point,Color_Tricolor,Color_Yellow,Color_Silver,Color_Seal,Color_Brindle,Color_Tan,Color_Cream,HasName
0,Dog,Neutered,Male,2014,02,2,evening/night,1.000000,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Cat,Neutered,Female,2013,10,6,afternoon,1.000000,0,0,...,0,0,0,0,0,0,0,0,1,1
2,Dog,Neutered,Male,2015,01,5,afternoon,2.000000,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Cat,Intact,Male,2014,07,4,evening/night,0.057692,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Dog,Neutered,Male,2013,11,4,afternoon,2.000000,0,1,...,0,0,0,0,0,0,0,1,0,0
5,Dog,Intact,Female,2014,04,4,afternoon,1.000000,0,1,...,0,0,0,0,0,0,0,1,0,1
6,Cat,Intact,Male,2015,03,5,afternoon,0.057692,0,0,...,0,0,0,0,0,0,0,0,0,1
7,Cat,Unknown,Unknown,2015,04,3,afternoon,0.057692,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Dog,Neutered,Female,2014,02,1,afternoon,0.416667,0,0,...,0,0,0,0,0,0,0,0,0,1
9,Dog,Neutered,Female,2014,05,5,morning,1.000000,0,0,...,0,0,0,0,0,0,0,0,0,0
