  ![ImdbIcon](images/pets.jpg)
# Petfinder: Predicting Adoption Speed 

## 1. Imports and Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
df = pd.read_csv('./petfinder_data/train.csv')
color_labels = pd.read_csv('./petfinder_data/color_labels.csv')
breed_labels = pd.read_csv('./petfinder_data/breed_labels.csv')

In [3]:
#pd.set_option('display.max_rows', None)

In [4]:
df.shape

(11565, 23)

In [5]:
#df.isnull().sum()

In [6]:
#df = df[df['Quantity'] == 1]
df.shape

(11565, 23)

In [7]:
#df['AdoptionSpeed']=df['AdoptionSpeed'].replace(0,1)

In [8]:
#df.drop(columns = ['Quantity'], inplace=True)

In [9]:
#df.to_csv('./petfinder_data/train.csv', index = False)

__________

## 2. Feature Engineering

In [10]:
# Dummify and combine features like Color and Breed

def dummy(df, label):
    cols = [col for col in df if label in col]
    dummy_dict = {}
    main_df = pd.get_dummies(df[cols[0]],prefix = label)
    for col in cols[1:]:
        dummy_dict[col] = pd.get_dummies(df[col],prefix = label)
    for col in dummy_dict:
        main_df = main_df.add(dummy_dict[col], fill_value = 0).gt(0)*1
    return main_df

In [11]:
# If dogs have been vaccinated: 1 Not vaccinated: 0, Not sure: converting to No
df['Vaccinated']=df['Vaccinated'].replace(2,0)
df['Vaccinated']=df['Vaccinated'].replace(3,0)

# If dogs have been dewormed: 1 Not dewormed: 0, Not sure: converting to No
df['Dewormed']=df['Dewormed'].replace(2,0)
df['Dewormed']=df['Dewormed'].replace(3,0)

# Binarizing gender: 1:male 0:female
df['Gender']=df['Gender'].replace(2,0)

In [12]:
# Dropping 9 rows with no Description

df.dropna(subset=['Description'], inplace=True)

In [13]:
# Adding description length column

df['desc_len'] = [len(x) for x in df['Description']]

------

### - Dogs

In [14]:
dogs = df[df['Type'] == 1]
dogs.shape

(6621, 24)

In [15]:
# Running function and merging dummified df back to original df

dummy_color_dog = dummy(dogs, 'Color') 
dogs = pd.merge(dogs, dummy_color_dog, left_index=True, right_index=True)

In [16]:
dummy_breed_dog = dummy(dogs, 'Breed')
dogs=pd.merge(dogs, dummy_breed_dog, left_index=True, right_index=True)

In [17]:
def breeds(df,label):
    breeds = [col for col in df if label in col]
    return breeds

#breeds(dogs, 'Breed')
#Get list of columns that contain the word 'breed' to confirm all are dog breeds (not cat)

In [18]:
pd.get_dummies(dogs, columns=['FurLength', 'Dewormed', 'Health', 'State', 'MaturitySize'])
dogs.head(1)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Breed_81,Breed_82,Breed_83,Breed_85,Breed_88,Breed_93,Breed_96,Breed_97,Breed_98,Breed_99
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,0,0,0,0,0,0,0,0,0


In [19]:
dogs.drop(columns=['Breed1', 'Breed2', 'Breed_0', 'Color_0', 'Color1', 'Color2', 'Color3', 'FurLength', 
                   'Dewormed', 'Health', 'State', 'MaturitySize'], inplace=True)

**Sentiment Analysis**

In [20]:
# Converting the body of my comments to a list in order to feed it into my function below

desc_list = dogs['Description'].tolist()

In [21]:
# Instantiating sentiment intensity analyzer

analyzer = SentimentIntensityAnalyzer()

In [22]:
def get_polarity(desc_list):
    polarity = []
    for post in desc_list:
        vs = analyzer.polarity_scores(post) # Analyzing polarity scores by individual description
        polarity.append(vs['compound']) # Appending polarity scores to my list- polarity
    return polarity

In [23]:
polarity = get_polarity(desc_list)

In [24]:
dogs['Polarity'] = polarity

In [25]:
# Moving position of polarity column in dataframe to the index right next to the description column for better comparison
pol_col = dogs['Polarity']
dogs.drop(labels=['Polarity'], axis=1, inplace = True)
dogs.insert(14, 'Polarity', pol_col)
dogs.head(1)

Unnamed: 0,Type,Name,Age,Gender,Vaccinated,Sterilized,Fee,RescuerID,VideoAmt,Description,...,Breed_81,Breed_82,Breed_83,Breed_85,Breed_88,Breed_93,Breed_96,Breed_97,Breed_98,Breed_99
2,1,Brisco,1,1,1,2,0,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,...,0,0,0,0,0,0,0,0,0,0


In [26]:
dogs.to_csv('./petfinder_data/dogs.csv', index = False)

-----------------

## - Cats

In [27]:
cats = df[df['Type'] == 2]
cats.shape

(4935, 24)

In [28]:
dummy_color_cat = dummy(cats, 'Color') 
dummy_color_cat.drop(columns=['Color_7'], inplace=True)
cats = pd.merge(cats, dummy_color_cat, left_index=True, right_index=True)

In [29]:
dummy_breed_cat = dummy(cats, 'Breed')
cats=pd.merge(cats, dummy_breed_cat, left_index=True, right_index=True)

In [30]:
pd.get_dummies(cats, columns=['FurLength', 'Dewormed', 'Health', 'State'])
cats.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Fee', 'State', 'RescuerID', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'desc_len',
       'Color_0', 'Color_1', 'Color_2', 'Color_3', 'Color_4', 'Color_5',
       'Color_6', 'Breed_0', 'Breed_15', 'Breed_218', 'Breed_241', 'Breed_242',
       'Breed_243', 'Breed_244', 'Breed_245', 'Breed_246', 'Breed_247',
       'Breed_248', 'Breed_249', 'Breed_25', 'Breed_250', 'Breed_251',
       'Breed_252', 'Breed_253', 'Breed_254', 'Breed_256', 'Breed_257',
       'Breed_260', 'Breed_262', 'Breed_263', 'Breed_264', 'Breed_265',
       'Breed_266', 'Breed_267', 'Breed_268', 'Breed_269', 'Breed_270',
       'Breed_271', 'Breed_272', 'Breed_273', 'Breed_274', 'Breed_276',
       'Breed_277', 'Breed_278', 'Breed_279', 'Breed_280', 'Breed_281',
       'Breed_282', 'Breed_283', 'Breed_284', 'B

In [31]:
cats.drop(columns=['MaturitySize', 'Color1', 'Color2', 'Color3', 'Breed1', 'Breed2', 'Breed_15', 'Breed_218', 
                   'Breed_0', 'Breed_5', 'Breed_70', 'Breed_307', 'Color_0', 'FurLength', 'Dewormed', 'Health', 'State'], inplace=True)

**Sentiment Analysis**

In [32]:
desc_list = cats['Description'].tolist()

In [33]:
def get_polarity(desc_list):
    polarity = []
    for post in desc_list:
        vs = analyzer.polarity_scores(post) # Analyzing polarity scores by individual description
        polarity.append(vs['compound']) # Appending polarity scores to my list- polarity
    return polarity

In [34]:
polarity = get_polarity(desc_list)

In [35]:
cats['Polarity'] = polarity

In [36]:
# Moving position of polarity column in dataframe to the index right next to the description column for better comparison
pol_col = cats['Polarity']
cats.drop(labels=['Polarity'], axis=1, inplace = True)
cats.insert(14, 'Polarity', pol_col)
cats.head(1)

Unnamed: 0,Type,Name,Age,Gender,Vaccinated,Sterilized,Fee,RescuerID,VideoAmt,Description,...,Breed_297,Breed_298,Breed_299,Breed_300,Breed_301,Breed_302,Breed_303,Breed_304,Breed_305,Breed_306
0,2,Nibble,3,1,0,2,100,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,...,0,0,1,0,0,0,0,0,0,0


In [37]:
cats.to_csv('./petfinder_data/cats.csv', index = False)