In [1]:
import pandas as pd
import numpy as np

In [2]:
# Columns named for both CSVs. Everything is read as text; DateTime is kept
# as an unparsed object column at load time.
shared_dtypes = {
    "AnimalID": str,
    "Name": str,
    "DateTime": object,
    "AnimalType": str,
    "SexuponOutcome": str,
    "AgeuponOutcome": str,
    "Breed": str,
    "Color": str,
}
# The training file additionally declares the two outcome columns.
train_dtypes = dict(shared_dtypes, OutcomeType=str, OutcomeSubtype=str)

train = pd.read_csv("train.csv.gz", compression="gzip", dtype=train_dtypes)
test = pd.read_csv("test.csv.gz", compression="gzip", dtype=shared_dtypes)

In [3]:
# Flag whether a pet has a recorded name
def has_name(name):
    """Return True when ``name`` is present, False when it is null.

    "Null" is whatever ``pd.isnull`` reports as missing (e.g. ``None`` or
    NaN). An empty string is not null and therefore counts as a name.
    """
    return not pd.isnull(name)

In [4]:
# Extract year, month, day, hour and day-of-week from the DateTime column
def extract_datetime(dataset):
    """Add calendar component columns derived from ``dataset.DateTime``.

    The frame is modified in place and also returned. Columns written:
    ``Year``, ``Month``, ``Day``, ``Hour`` and ``DayofWeek`` (pandas
    convention: Monday is 0, Sunday is 6).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``DateTime`` column that ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    # Parse the timestamps once and reuse the index for every component,
    # instead of re-parsing the whole column for each derived field.
    stamps = pd.DatetimeIndex(dataset.DateTime)
    dataset['Year'] = stamps.year
    dataset['Month'] = stamps.month
    dataset['Day'] = stamps.day
    dataset['Hour'] = stamps.hour
    dataset['DayofWeek'] = stamps.dayofweek
    return dataset

In [5]:
def is_altered(SexuponOutcome):
    """Classify a sex-upon-outcome label as "Intact", "Altered" or "Unknown".

    Parameters
    ----------
    SexuponOutcome : str
        Label such as "Neutered Male", "Intact Female" or "Unknown".

    Returns
    -------
    str
        "Unknown" when the label contains "unknown" or is missing,
        "Intact" when it contains "intact", otherwise "Altered".
    """
    # Non-string input (None, float NaN) carries no label at all.
    if not isinstance(SexuponOutcome, str):
        return "Unknown"
    label = SexuponOutcome.strip().lower()
    # A missing value stringified with astype(str) shows up as "nan"
    # (or "None"); it must not fall through to the "Altered" default.
    if label in ("", "nan", "none") or "unknown" in label:
        return "Unknown"
    if "intact" in label:
        return "Intact"
    return "Altered"

In [6]:
def gender(SexuponOutcome):
    """Extract the gender from a sex-upon-outcome label.

    Parameters
    ----------
    SexuponOutcome : str
        Label such as "Neutered Male", "Spayed Female" or "Unknown".

    Returns
    -------
    str
        "Unknown" when the label contains "unknown" or is missing,
        "Female" when it contains "female", otherwise "Male".
    """
    # Non-string input (None, float NaN) carries no label at all.
    if not isinstance(SexuponOutcome, str):
        return "Unknown"
    label = SexuponOutcome.strip().lower()
    # A missing value stringified with astype(str) shows up as "nan"
    # (or "None"); it must not fall through to the "Male" default.
    if label in ("", "nan", "none") or "unknown" in label:
        return "Unknown"
    # "female" contains "male", so the female test has to come first.
    if "female" in label:
        return "Female"
    return "Male"

In [8]:
def breed_type(breed):
    """Label a breed string as "Mixed Breed" or "Full Breed".

    Only the substring "mix" (case-insensitive) marks a mixed breed.
    Slash-separated crosses such as "A/B" are not treated as mixed; an
    earlier variant of the check that also looked for "/" is disabled.
    """
    is_mix = "mix" in breed.lower()
    return "Mixed Breed" if is_mix else "Full Breed"

In [9]:
def clean_age(x):
    """Convert an age label such as "2 years" or "10 months" into years.

    Parameters
    ----------
    x : str
        Age text whose first whitespace-separated token is an integer count
        and which contains one of the unit words year/month/week/day.
        The string "nan" (a stringified missing value), an empty or
        whitespace-only string, and ``None`` are all treated as missing.

    Returns
    -------
    float
        Age in years (months / 12, weeks / 52, days / 365). Missing input
        and labels with an unrecognised unit yield 0.0.

    Raises
    ------
    ValueError
        If the leading token of a non-missing label is not an integer.
    """
    text = '' if x is None else str(x).strip()
    # Blank input used to crash on split()[0]; treat it like "nan".
    if not text or text == 'nan':
        return 0.0
    age = int(text.split()[0])
    # (unit word, how many of that unit make one year), checked in order.
    units_per_year = (('year', 1.), ('month', 12.), ('week', 52.), ('day', 365.))
    for unit, per_year in units_per_year:
        if unit in text:
            # Float division keeps the return type uniform across units.
            return age / per_year
    return 0.0

In [10]:
def clean_data(data):
    """Add the engineered feature columns to ``data`` and return it.

    The frame is modified in place. Columns are added in this order:
    ``HasName``, the date parts written by ``extract_datetime``, then
    ``IsAltered``, ``Gender``, ``BreedType`` and ``CleanAge``.
    """
    data['HasName'] = data['Name'].apply(has_name)

    # Return value is ignored: extract_datetime writes into `data` directly.
    extract_datetime(data)

    # (new column, source column, per-value transform). Each source column is
    # stringified first, so missing values reach the helpers as "nan".
    text_features = (
        ('IsAltered', 'SexuponOutcome', is_altered),
        ('Gender', 'SexuponOutcome', gender),
        ('BreedType', 'Breed', breed_type),
        ('CleanAge', 'AgeuponOutcome', clean_age),
    )
    for target, source, transform in text_features:
        data[target] = data[source].astype(str).apply(transform)
    return data

In [11]:
# Build the feature columns for the training set, show the first ten rows,
# then write the result to disk. clean_data returns the frame it was given,
# so `train` itself also carries the new columns afterwards.
clean_train = clean_data(train)
train_preview = clean_train.head(10)
print(train_preview)
train_csv_path = "clean_{0}.csv".format("train")
clean_train.to_csv(train_csv_path)

  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   
5  A677334     Elsa  2014-04-25 13:04:00         Transfer        Partner   
6  A699218    Jimmy  2015-03-28 13:11:00         Transfer        Partner   
7  A701489      NaN  2015-04-30 17:02:00         Transfer        Partner   
8  A671784     Lucy  2014-02-04 17:17:00         Adoption            NaN   
9  A677747      NaN  2014-05-03 07:48:00         Adoption        Offsite   

  AnimalType SexuponOutcome AgeuponOutcome                              Breed  \
0        Dog  Neutered Male         1 year              Shetland Sheepdog Mix   
1

In [12]:
# Build the feature columns for the test set, show the first ten rows,
# then write the result to disk. clean_data returns the frame it was given,
# so `test` itself also carries the new columns afterwards.
clean_test = clean_data(test)
test_preview = clean_test.head(10)
print(test_preview)
test_csv_path = "clean_{0}.csv".format("test")
clean_test.to_csv(test_csv_path)

   ID        Name             DateTime AnimalType SexuponOutcome  \
0   1      Summer  2015-10-12 12:15:00        Dog  Intact Female   
1   2    Cheyenne  2014-07-26 17:59:00        Dog  Spayed Female   
2   3         Gus  2016-01-13 12:20:00        Cat  Neutered Male   
3   4       Pongo  2013-12-28 18:12:00        Dog    Intact Male   
4   5     Skooter  2015-09-24 17:59:00        Dog  Neutered Male   
5   6        Beau  2015-06-23 11:17:00        Dog  Neutered Male   
6   7        Bobo  2014-03-12 09:45:00        Cat  Neutered Male   
7   8        Abby  2014-06-25 08:27:00        Cat  Spayed Female   
8   9  Ruby Grace  2014-11-12 18:05:00        Dog  Spayed Female   
9  10        Ruby  2014-04-07 17:41:00        Dog  Spayed Female   

  AgeuponOutcome                           Breed              Color  HasName  \
0      10 months          Labrador Retriever Mix          Red/White     True   
1        2 years  German Shepherd/Siberian Husky          Black/Tan     True   
2         1