# Feature Engineering

Turn the training and test sets into feature matrices.

In [1]:
import datetime
import json
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing

# Load the raw training and test sets from the local data directory.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

Delete unneeded columns, and split the outcomes into separate numpy arrays.

In [2]:
def create_outcomes_drop_cols(train, test):
    """
    Encode the training outcomes as integers and strip the columns that are
    not used as features.

    Returns a 4-tuple: the training frame without 'AnimalID', 'OutcomeType'
    and 'OutcomeSubtype'; the test frame without 'ID'; the integer-encoded
    outcomes; and the fitted LabelEncoder that produced those codes.
    """
    outcome_labels = train['OutcomeType']
    encoder = preprocessing.LabelEncoder()
    encoder.fit(outcome_labels)
    encoded_outcomes = encoder.transform(outcome_labels)

    train_unused = ['AnimalID', 'OutcomeType', 'OutcomeSubtype']
    test_unused = ['ID']
    train_features = train.drop(train_unused, axis = 1)
    test_features = test.drop(test_unused, axis = 1)
    return train_features, test_features, encoded_outcomes, encoder

Create the sex, intact/neutered, date, and age variables.

In [3]:
def create_sex_variables(data):
    """
    Create Sex and Neutered features from SexuponOutcome, which was really
    two features in one - gender and neutered/intact.

    Missing values and single-word values (e.g. 'Unknown') become 'Unknown'
    for both features. 'Spayed' is folded into 'Neutered' so that the
    Neutered feature does not also encode the animal's sex.

    Returns a new frame with 'Neutered' and 'Sex' appended and
    'SexuponOutcome' removed. Works on an empty frame as well.
    """
    neutered = []
    sex = []
    for value in data['SexuponOutcome'].fillna('Unknown'):
        parts = value.split(' ')
        if len(parts) < 2:
            status, gender = 'Unknown', 'Unknown'
        else:
            # Only the first two words carry information.
            status, gender = parts[0], parts[1]
        neutered.append('Neutered' if status == 'Spayed' else status)
        sex.append(gender)
    return (data
            .assign(Neutered = neutered)
            .assign(Sex = sex)
            .drop(['SexuponOutcome'], axis = 1))

def create_age_in_years(data):
    """
    Transform the AgeuponOutcome variable into a numeric age in years, impute
    the very small number of missing values with the median, and standardize
    the result.

    AgeuponOutcome is expected to look like '<number> <unit>', e.g.
    '2 years', '1 month' or '3 weeks'. Both singular and plural unit names
    are converted; an unrecognized unit is treated as years.

    Returns a new frame with a standardized 'Age' column appended and
    'AgeuponOutcome' removed.

    NOTE(review): the standardization uses the mean and standard deviation of
    the frame passed in, so calling this separately on two frames puts them
    on slightly different scales.
    """
    # Divisors converting a duration in the given unit to years. The singular
    # forms matter: without them '1 month' would be read as one whole year.
    units_per_year = {
        'day': 365.0, 'days': 365.0,
        'week': 52.0, 'weeks': 52.0,
        'month': 12.0, 'months': 12.0,
        'year': 1.0, 'years': 1.0,
    }
    years = []
    for age in data['AgeuponOutcome']:
        if pd.isnull(age):
            years.append(np.nan)
            continue
        duration, unit = age.split(' ')
        years.append(float(duration) / units_per_year.get(unit, 1.0))

    years = np.array(years, dtype = float)
    missing = np.isnan(years)
    # Median of the observed ages only, then fill the gaps with it.
    years[missing] = np.median(years[~missing])
    return (data
            .assign(Age = preprocessing.scale(years))
            .drop(['AgeuponOutcome'], axis = 1))

def time_of_day(hour):
    """
    Bucket an hour of the day into a coarse label: hours 5-11 are 'morning',
    hours 12-17 are 'afternoon', and everything else is 'evening/night'.
    """
    if 4 < hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'afternoon'
    return 'evening/night'
    
def day_of_the_week(DateTime):
    """
    Return the weekday index (Monday is 0, Sunday is 6) of a timestamp string
    in 'YYYY-MM-DD HH:MM:SS' format.
    """
    parsed = datetime.datetime.strptime(DateTime, '%Y-%m-%d %H:%M:%S')
    return parsed.weekday()

def create_date_variables(data):
    """
    Split the DateTime string column into Year, Month, Day (weekday index)
    and TimeOfDay features, then remove DateTime.

    Year and Month are kept as the text slices of the timestamp; Day and
    TimeOfDay come from day_of_the_week and time_of_day respectively.
    """
    stamps = data.DateTime
    year = stamps.map(lambda stamp: stamp[:4])
    month = stamps.map(lambda stamp: stamp[5:7])
    weekday = stamps.map(day_of_the_week)
    part_of_day = stamps.map(lambda stamp: time_of_day(int(stamp[11:13])))

    # One assign per column keeps the new columns in this exact order.
    enriched = data.assign(Year = year)
    enriched = enriched.assign(Month = month)
    enriched = enriched.assign(Day = weekday)
    enriched = enriched.assign(TimeOfDay = part_of_day)
    return enriched.drop(['DateTime'], axis = 1)

# Derive the sex, date, and age features for both data sets, in that order.
train = create_age_in_years(
    create_date_variables(create_sex_variables(train)))
test = create_age_in_years(
    create_date_variables(create_sex_variables(test)))

Create the Breed variables.

In [4]:
def assign_dog_breeds(data):
    """
    Append one count column per breed group listed in dogbreeds.json.

    dogbreeds.json is read from the working directory. It presumably maps
    each dog 'Breed' string to a list of group names - verify against the
    file. For every dog the columns hold how many times each group appears
    in that list; rows for other animals are all zeros.

    The group columns are sorted by name so the layout is reproducible from
    run to run, and they share the index of `data` so rows stay aligned even
    when `data` does not have a default integer index.

    Raises KeyError if a dog's breed is not present in dogbreeds.json.
    """
    # Read in the dog breeds JSON file
    with open('dogbreeds.json') as data_file:
        dogbreeds = json.load(data_file)
    columns = sorted({group for groups in dogbreeds.values() for group in groups})
    group_to_position = {group: i for i, group in enumerate(columns)}

    # Create indicator variables for the different breed types
    vectors = []
    for breed, animal in zip(data['Breed'], data['AnimalType']):
        vector = [0] * len(columns)
        if animal == 'Dog':
            for group in dogbreeds[breed]:
                vector[group_to_position[group]] += 1
        vectors.append(vector)
    dogbreeds_df = pd.DataFrame(vectors, columns = columns, index = data.index)
    return pd.concat([data, dogbreeds_df], axis = 1)


def create_hair_length_variable(data, hairlen):
    """
    Return a 0/1 array marking rows where AnimalType is 'Cat' and the Breed
    text contains `hairlen` (case-insensitive match).
    """
    breed_matches = data['Breed'].str.contains(hairlen, case = False)
    cat_flag = np.where(data['AnimalType'] == 'Cat', 1, 0)
    return np.where(breed_matches, cat_flag, 0)


def create_hair_length(data):
    """
    Add the ShortHair, MediumHair and LongHair indicator columns, which are 1
    only for cats whose breed mentions that hair length.

    The columns are written onto `data` itself, and the same frame is
    returned.
    """
    hair_columns = (
        ('ShortHair', 'Short'),
        ('MediumHair', 'Medium'),
        ('LongHair', 'Long'),
    )
    for column, keyword in hair_columns:
        data[column] = create_hair_length_variable(data, keyword)
    return data


def identify_common_breeds(breeds, threshold = 30):
    """
    Return the set of breed names that occur at least `threshold` times.

    Each entry of `breeds` is a breed string; ' Mix' / ' mix' is removed and
    the remainder is split on '/', so 'Pit Bull/Beagle' counts once toward
    'Pit Bull' and once toward 'Beagle'.
    """
    breed_counts = {}
    for breed in breeds:
        subbreeds = breed.replace(' Mix', '').replace(' mix', '').split('/')
        for subbreed in subbreeds:
            breed_counts[subbreed] = breed_counts.get(subbreed, 0) + 1
    return {name for name, count in breed_counts.items() if count >= threshold}


def create_specific_breeds(data, dog_breeds, cat_breeds):
    """
    Append one 'SpecificBreed_<name>' count column for every breed in
    `dog_breeds` and `cat_breeds` (spaces are removed from the name).

    A row's Breed has ' Mix' / ' mix' removed and is split on '/'; each part
    that is one of the given breeds adds 1 to that breed's column. Parts that
    are not in either collection are ignored.

    Columns are ordered dog breeds first, then cat breeds, each group sorted
    by name, so the layout does not depend on set iteration order. A breed
    present in both collections gets a single column. The new columns share
    the index of `data` so rows stay aligned.
    """
    common_breeds = []
    to_position = {}
    for name in sorted(dog_breeds) + sorted(cat_breeds):
        if name not in to_position:
            to_position[name] = len(common_breeds)
            common_breeds.append(name)

    vectors = []
    for breed in data['Breed']:
        vector = [0] * len(common_breeds)
        subbreeds = breed.replace(' Mix', '').replace(' mix', '').split('/')
        for subbreed in subbreeds:
            position = to_position.get(subbreed)
            if position is not None:
                vector[position] += 1
        vectors.append(vector)

    columns = ['SpecificBreed_' + name.replace(' ', '') for name in common_breeds]
    breeds_df = pd.DataFrame(vectors, columns = columns, index = data.index)
    return pd.concat([data, breeds_df], axis = 1)


def create_mix(data):
    """
    For both cats and dogs, is the animal a mix of multiple breeds? We can
    determine this both by searching for the string 'Mix' in the name, AND by
    looking to see if dogs have been classified into 2 or more AKC groups.

    'Mix' is 1 when any of the following holds, otherwise 0:
      * the columns whose name contains 'AKC_Class_' sum to more than 1,
      * the columns whose name contains 'SpecificBreed_' sum to more than 1,
      * Breed contains 'mix' (case-insensitive).

    Returns a new frame with the 'Mix' column; `data` itself is not modified.
    """
    akc_class_cols = [col for col in data.columns if 'AKC_Class_' in col]
    specific_breed_cols = [col for col in data.columns if 'SpecificBreed_' in col]
    num_akc_classes = data[akc_class_cols].sum(axis = 1)
    num_specific_breeds = data[specific_breed_cols].sum(axis = 1)
    named_mix = data['Breed'].str.contains('Mix', case = False)

    # Layer the three signals; each later one can only switch a 0 to a 1.
    mix = np.where(num_akc_classes > 1, 1, 0)
    mix = np.where(num_specific_breeds > 1, 1, mix)
    mix = np.where(named_mix, 1, mix)
    return data.assign(Mix = mix)
    

def create_breed_variables(data, common_dog_breeds, common_cat_breeds):
    """
    Build all breed-derived features and then remove the raw Breed column.

    Steps run in order: dog breed groups, cat hair length, specific-breed
    counts for the given common breeds, and the Mix flag.
    """
    with_groups = assign_dog_breeds(data)
    with_hair = create_hair_length(with_groups)
    with_breeds = create_specific_breeds(
        with_hair,
        dog_breeds = common_dog_breeds,
        cat_breeds = common_cat_breeds)
    with_mix = create_mix(with_breeds)
    return with_mix.drop(['Breed'], axis = 1)
    
# The common breeds are learned from the training set only, then the same
# breed lists are used to encode both data sets.
common_dog_breeds = identify_common_breeds(
    train.loc[train['AnimalType'] == 'Dog', 'Breed'].tolist())
common_cat_breeds = identify_common_breeds(
    train.loc[train['AnimalType'] == 'Cat', 'Breed'].tolist())
train = create_breed_variables(
    train,
    common_dog_breeds = common_dog_breeds,
    common_cat_breeds = common_cat_breeds)
test = create_breed_variables(
    test,
    common_dog_breeds = common_dog_breeds,
    common_cat_breeds = common_cat_breeds)

Create the color variables.

In [5]:
def extract_unique_colors(train, threshold = 30):
    """
    Extract the set of color words that appear on at least `threshold`
    animals' Color values in the training set (30 by default).

    Each Color string is split on runs of non-word characters, so
    'Brown Tabby/White' contributes 'Brown', 'Tabby' and 'White'.
    """
    counts = {}
    for color in train['Color']:
        for subcolor in re.split(r'\W+', color):
            counts[subcolor] = counts.get(subcolor, 0) + 1
    return {name for name, count in counts.items() if count >= threshold}


def create_color_variables(data, colors):
    """
    Replace the Color column with one 'Color_<name>' count column per entry
    of `colors`.

    Each Color string is split on runs of non-word characters; every part
    found in `colors` adds 1 to that color's column, and other parts are
    ignored.

    Columns are sorted by color name so the layout does not depend on the
    iteration order of `colors`, duplicates in `colors` are collapsed, and
    the new columns share the index of `data` so rows stay aligned.
    """
    ordered_colors = sorted(set(colors))
    to_position = {color: i for i, color in enumerate(ordered_colors)}

    vectors = []
    for color in data['Color']:
        vector = [0] * len(ordered_colors)
        for subcolor in re.split(r'\W+', color):
            position = to_position.get(subcolor)
            if position is not None:
                vector[position] += 1
        vectors.append(vector)

    columns = ['Color_' + color for color in ordered_colors]
    colors_df = pd.DataFrame(vectors, columns = columns, index = data.index)
    return pd.concat([data, colors_df], axis = 1).drop(['Color'], axis = 1)


# The color vocabulary comes from the training set only; both data sets are
# then encoded against that same vocabulary.
colors = extract_unique_colors(train)
train = create_color_variables(train, colors = colors)
test = create_color_variables(test, colors = colors)

Whether or not the animal has a name.

In [6]:
def create_has_name(data):
    """
    Add a HasName flag (1 when Name is present, 0 when it is null) and return
    a frame without the Name column.

    The HasName column is also written onto the frame that was passed in.
    """
    name_present = data['Name'].notnull()
    data['HasName'] = np.where(name_present, 1, 0)
    without_name = data.drop(['Name'], axis = 1)
    return without_name

# Swap the Name column for the HasName flag in both data sets.
train = create_has_name(train)
test = create_has_name(test)

Turn `AnimalType` into a binary variable.

In [7]:
def transform_animal_type(data):
    """
    Overwrite AnimalType with 1 where it equals 'Cat' and 0 otherwise.

    The column is replaced on the frame that was passed in, and that same
    frame is returned.
    """
    is_cat = data['AnimalType'] == 'Cat'
    cat_flag = np.where(is_cat, 1, 0)
    data['AnimalType'] = cat_flag
    return data

# Recode AnimalType as a cat indicator in both data sets.
train = transform_animal_type(train)
test = transform_animal_type(test)

One-hot encode `Neutered`, `Sex`, `Year`, `Month`, `Day`, and `TimeOfDay`.

In [8]:
def one_hot_encode(DataFrame, column):
    """
    Replace [column] in [DataFrame] with binary columns for each distinct value
    in [column], each with the name [column]_[value].

    The new columns are ordered by sorted value rather than by order of first
    appearance, so two frames holding the same set of values get the same
    column layout. They share the index of [DataFrame] so rows stay aligned
    even when it does not have a default integer index.

    Returns a new frame; [DataFrame] is not modified.
    """
    values = list(DataFrame[column])
    categories = list(DataFrame[column].unique())
    try:
        categories.sort()
    except TypeError:
        # Values of mixed types cannot be compared; order them by their text.
        categories.sort(key = str)
    to_col = {value: i for i, value in enumerate(categories)}

    mat = np.zeros((len(values), len(categories)), dtype = int)
    for row, value in enumerate(values):
        mat[row, to_col[value]] = 1

    columns = [column + '_' + str(value) for value in categories]
    encoded = pd.DataFrame(mat, columns = columns, index = DataFrame.index)
    return pd.concat([DataFrame, encoded], axis = 1).drop([column], axis = 1)


def do_one_hot_encoding(data, columns = None):
    """
    One-hot encode the categorical feature columns of `data`.

    By default the Neutered, Sex, Year, Month, Day and TimeOfDay columns are
    encoded. They are selected by name rather than by position, so the result
    does not depend on where those columns sit in the frame. Pass `columns`
    to encode a different list.

    Raises KeyError if a requested column is not in `data`.
    """
    if columns is None:
        columns = ['Neutered', 'Sex', 'Year', 'Month', 'Day', 'TimeOfDay']
    for column in columns:
        data = one_hot_encode(data, column = column)
    return data


# One-hot encode the categorical features of both data sets.
train = do_one_hot_encoding(train)
test = do_one_hot_encoding(test)

Export the data