In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Input locations: the training table plus the lookup CSVs whose
# ID/name columns are joined onto it further down.
TRAIN_TABULAR = "data/train.csv"
BREED_LABELS = "data/BreedLabels.csv"
COLOR_LABELS = "data/ColorLabels.csv"
STATE_LABELS = "data/StateLabels.csv"

# Read the four files in the same order as the constants above.
train_tabular_df, breed_labels_df, color_labels_df, state_labels_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_TABULAR, BREED_LABELS, COLOR_LABELS, STATE_LABELS)
)

In [18]:
# Build the enriched frame on a copy so `train_tabular_df` itself is never modified.
transformed_df = train_tabular_df.copy()

# Flag_Name: True when the row has a non-null Name.
transformed_df['Flag_Name'] = transformed_df["Name"].notna()

# NOTE(review): every label merge below is an inner join, so any row whose
# Breed/Color/State ID has no match in the lookup table is dropped from
# `transformed_df`. If the data uses a placeholder ID for "no second breed" /
# "no second colour", those rows are lost here -- confirm this is intended
# (e.g. compare len(transformed_df) with len(train_tabular_df)).

# Breed1, Breed2 labels
# Each merge continues from `transformed_df` (not `train_tabular_df`) so that
# Flag_Name survives. The lookup's own columns come out suffixed/duplicated
# (BreedID_x, BreedID_y, Type_y, ...) and are removed by the drop-columns cell.
for breed_col in ("Breed1", "Breed2"):
    transformed_df = pd.merge(
        transformed_df,
        breed_labels_df,
        left_on=breed_col,
        right_on="BreedID",
        how='inner',
    ).rename(columns={"BreedName": breed_col + "Name"})

# Color1, Color2, Color3 labels
for color_col in ("Color1", "Color2", "Color3"):
    transformed_df = pd.merge(
        transformed_df,
        color_labels_df,
        left_on=color_col,
        right_on="ColorID",
        how='inner',
    ).rename(columns={"ColorName": color_col + "Name"})

# State labels
transformed_df = pd.merge(transformed_df, state_labels_df, left_on='State', right_on="StateID", how='inner')

# Breed1_Breed2 + concatenated labels
transformed_df['Breed1_Breed2'] = transformed_df.Breed1.map(str) + "_" + transformed_df.Breed2.map(str)
transformed_df['Breed1Name_Breed2Name'] = transformed_df.Breed1Name.map(str) + "_" + transformed_df.Breed2Name.map(str)

# Color1_Color2_Color3 + concatenated labels
# Built from the three Color ID columns (previously this was built from
# Breed1/Breed2, which made it a duplicate of Breed1_Breed2).
transformed_df['Color1_Color2_Color3'] = (
    transformed_df.Color1.map(str) + "_" + transformed_df.Color2.map(str) + "_" + transformed_df.Color3.map(str)
)
transformed_df['Color1Name_Color2Name_Color3Name'] = (
    transformed_df.Color1Name.map(str) + "_" + transformed_df.Color2Name.map(str) + "_" + transformed_df.Color3Name.map(str)
)

# RescuerID Counts: number of rows per RescuerID in the merged frame (i.e.
# counted after the inner joins above), joined back onto every row.
rescuerid_counts = transformed_df.groupby("RescuerID").size().reset_index(name="RescuerID_Count")
transformed_df = pd.merge(transformed_df, rescuerid_counts, on="RescuerID")

In [19]:
# Columns to discard: the ID/Type columns listed here, including the
# suffixed `_x` / `_y` variants.
cols_to_drop = [
    'BreedID_x', 'Type', 'BreedID_y', 'Type_y',
    'ColorID_x', 'ColorID_y', 'ColorID',
    'StateID',
]

# Order matters: the plain 'Type' column is dropped first, and only then is
# 'Type_x' renamed to take over the 'Type' name.
transformed_df = (
    transformed_df
    .drop(columns=cols_to_drop)
    .rename(columns={'Type_x': 'Type'})
)

In [20]:
# Bucket Age into five intervals and append one indicator column per bucket.
# With include_lowest=True the first interval also contains its left edge (0).
age_edges = [0, 3, 6, 12, 24, 300]
age_bins = pd.cut(transformed_df['Age'], bins=age_edges, include_lowest=True)

# One dummy column per interval, renamed positionally to is_age_<lo>_<hi>.
age_bin_dummies = pd.get_dummies(age_bins)
age_bin_dummies.columns = [
    f"is_age_{lower}_{upper}" for lower, upper in zip(age_edges[:-1], age_edges[1:])
]

transformed_df = pd.concat([transformed_df, age_bin_dummies], axis=1)

In [21]:
# Expand each column listed below into one indicator column per distinct value
# (named <column>_<value>); the listed source columns are replaced by them.
dummy_cols = [
    'Type', 'Gender', 'MaturitySize',
    'Vaccinated', 'Dewormed', 'Sterilized',
    'Health', 'FurLength', 'State',
    'Breed1', 'Breed2',
    'Color1', 'Color2', 'Color3',
]

transformed_df = pd.get_dummies(data=transformed_df, columns=dummy_cols)

# Preview the first rows of the encoded frame.
transformed_df.head()

Unnamed: 0,Name,Age,Quantity,Fee,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,...,Color2_2,Color2_3,Color2_4,Color2_5,Color2_6,Color3_3,Color3_4,Color3_5,Color3_6,Color3_7
0,Golden Tabby Girl,1,1,50,438a9bdce8ef4d5948fc40e422d34d0d,0,A cute tabby kitten looking for new home. She ...,dae13a47e,7.0,1,...,0,1,0,0,0,0,0,1,0,0
1,Long Hair Calico Girl,2,1,100,438a9bdce8ef4d5948fc40e422d34d0d,0,Adorable local mix Persian kitten looking for ...,fa94b21d1,5.0,1,...,1,0,0,0,0,0,0,1,0,0
2,Shaila's Medium Hair Calico,1,1,50,438a9bdce8ef4d5948fc40e422d34d0d,0,Adorable domestic medium hair kitten looking f...,52782bca8,5.0,0,...,0,1,0,0,0,0,0,0,0,1
3,Shaila's Long Hair Calico,2,1,50,438a9bdce8ef4d5948fc40e422d34d0d,0,Adorable domestic long hair kitten looking for...,056bb97e4,5.0,0,...,0,1,0,0,0,0,0,0,0,1
4,Long Hair Calico,1,1,50,438a9bdce8ef4d5948fc40e422d34d0d,0,Adorable domestic long hair kitten looking for...,6d53b2d19,5.0,1,...,0,1,0,0,0,0,0,0,0,1


In [24]:
# Persist the engineered frame; the DataFrame index is written too (to_csv default).
transformed_csv_path = "data/transformed_data.csv"
transformed_df.to_csv(transformed_csv_path)