 # Petfinder: Northern California Adoption Rate
 ![Icon](images/pets.jpg)

### Notebook Contents

### Imports

In [1]:
import petpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

In [2]:
# Show up to 500 columns when displaying wide DataFrames.
pd.options.display.max_columns = 500

In [14]:
# Load the Petfinder dog listings from the local CSV file.
dogs = pd.read_csv(filepath_or_buffer='./petfinder_data/dogs.csv')

 ______

### Petfinder API + Petpy Wrapper

In [None]:
# Using petpy wrapper with petfinder API keys (removed my personal key + secret)
#pf = petpy.Petfinder(key='inputkeyhere', secret='inputsecrethere')

In [None]:
# dogs1 = pf.animals(animal_type='dog', status='adopted', results_per_page=50, pages=200, location=94602, 
#                    before_date='2021-01-01', after_date='2018-12-31', return_df=True)

- Petfinder would not allow me to pull two years' worth of data at once. This required pulling the maximum allowed, finding the end date of that pull down to the minute, re-entering that timestamp as the `before_date` of the next request (run as `dogs2`), and finally concatenating the resulting DataFrames as shown below.

In [None]:
#dogs1.sort_values(by=['published_at'], inplace=True)

In [None]:
#dogs1.head()

In [None]:
# Had to make multiple requests, so concatenating them
#dogs = pd.concat((dogs1, dogs2))

## Feature Engineering

In [None]:
# Columns the author judged irrelevant to the analysis.
# NOTE(review): this list removes the breeds.* and colors.* columns, which are the
# columns the later dummy(dogs, 'color') / dummy(dogs, 'breeds') calls look up by
# substring — confirm the intended cell order before running top to bottom.
irrelevant_cols = [
    'type', 'species', 'url', 'tags', 'organization_animal_id', 'status',
    'breeds.primary', 'breeds.secondary', 'breeds.mixed',
    'colors.primary', 'colors.secondary', 'colors.tertiary',
    'attributes.declawed',
    'primary_photo_cropped.small', 'primary_photo_cropped.medium',
    'primary_photo_cropped.large', 'primary_photo_cropped.full',
    'contact.email', 'contact.phone',
    'contact.address.address1', 'contact.address.address2',
    'contact.address.country', 'contact.address.state',
    'animal_id', 'animal_type', 'organization_id.1',
    'primary_photo_cropped',
]
dogs.drop(columns=irrelevant_cols, inplace=True)

In [15]:
# Shorter, friendlier names for the attribute / environment / contact columns.
column_renames = {
    'attributes.spayed_neutered': 'fixed',
    'attributes.house_trained': 'house_trained',
    'attributes.special_needs': 'special_needs',
    'attributes.shots_current': 'shots_current',
    'environment.children': 'good_with_kids',
    'environment.dogs': 'good_with_dogs',
    'environment.cats': 'good_with_cats',
    'contact.address.city': 'city',
    'contact.address.postcode': 'zipcode',
}
dogs.rename(columns=column_renames, inplace=True)

Unnamed: 0,id,organization_id,url,type,species,age,gender,size,coat,tags,name,description,organization_animal_id,photos,videos,status,status_changed_at,published_at,distance,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,colors.primary,colors.secondary,colors.tertiary,fixed,house_trained,attributes.declawed,special_needs,shots_current,good_with_kids,good_with_dogs,good_with_cats,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,contact.email,contact.phone,contact.address.address1,contact.address.address2,city,contact.address.state,zipcode,contact.address.country,animal_id,animal_type,organization_id.1,primary_photo_cropped
0,43672066,CA1005,https://www.petfinder.com/dog/spirit-s-litter-...,Dog,Dog,Baby,Male,Medium,Medium,[],Spirit (S Litter),EMAIL: stonecliffeadoption@gmail.com for an a...,,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],adopted,2019-02-10T03:03:57+0000,2019-01-02T03:49:40+0000,16.3248,Labrador Retriever,,True,False,Black,,,True,False,,False,True,,True,,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,contact@stoneclifferescue.org,,,,Dublin,CA,94568.0,US,43672066,dog,ca1005,


In [16]:
# 'modesto' and 'Modesto' both appear in the data; fold the lowercase spelling
# into the capitalised one so the city is not counted twice.
dogs['city'] = dogs['city'].replace({'modesto': 'Modesto'})

In [17]:
# Turn the videos column into a 0/1 flag: 1 when the value is anything other
# than the literal string '[]', otherwise 0.
has_video = dogs['videos'] != '[]'
dogs['videos'] = np.where(has_video, 1, 0)

In [18]:
# New 0/1 column: 1 when the photos value is anything other than the literal
# string '[]', otherwise 0.
photo_present = dogs['photos'] != '[]'
dogs['has_photo'] = np.where(photo_present, 1, 0)

In [20]:
# Missing "good with ..." answers are encoded as 2 (unknown).
for env_col in ['good_with_kids', 'good_with_dogs', 'good_with_cats']:
    dogs[env_col] = dogs[env_col].fillna(2)

# Multiply by 1 and cast so the boolean flags (and the 2 placeholder) become integers.
flag_cols = [
    'fixed',
    'house_trained',
    'shots_current',
    'good_with_kids',
    'good_with_dogs',
    'good_with_cats',
    'special_needs',
]
for flag_col in flag_cols:
    dogs[flag_col] = (dogs[flag_col] * 1).astype(int)

### -Dummies

In [21]:
# Function to dummify and combine multiclass columns (ie. primary.breed, secondary.breed)
def dummy(df, label):
    """One-hot encode every column whose name contains ``label`` and merge the results.

    Each matching column is passed through ``pd.get_dummies`` with ``label`` as
    the prefix. The per-column indicator frames are then combined so that a cell
    is 1 when the category appears in *any* of the matching columns for that row,
    and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    label : str
        Substring used to select columns (``label in column_name``); also used
        as the prefix of the generated indicator columns.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicator frame indexed like ``df``, with one column per
        distinct category named ``f"{label}_{category}"``.

    Raises
    ------
    ValueError
        If no column name in ``df`` contains ``label``.

    Notes
    -----
    Selection is by substring, so indicator columns produced by an earlier call
    and merged back into ``df`` (e.g. ``color_Black``) also match on a re-run.
    """
    # str() keeps the substring test safe for non-string column labels.
    cols = [col for col in df.columns if label in str(col)]
    if not cols:
        raise ValueError(f"no columns containing {label!r} found to dummify")

    combined = None
    for col in cols:
        # Cast to int so the sum below is ordinary arithmetic regardless of
        # whether get_dummies returns bool or uint8 in this pandas version.
        indicators = pd.get_dummies(df[col], prefix=label).astype(int)
        if combined is None:
            combined = indicators
        else:
            # fill_value=0 treats categories absent from one side as "not set".
            combined = combined.add(indicators, fill_value=0)

    # Collapse counts to a 0/1 flag; this also guarantees an integer result when
    # only a single column matched.
    return combined.gt(0) * 1

In [22]:
# Build combined indicator columns for every column whose name contains 'color',
# then attach them to the main frame by index.
dummy_color_dog = dummy(df=dogs, label='color')
dogs = dogs.merge(dummy_color_dog, left_index=True, right_index=True)

In [23]:
# Per the author, every breed is known or guessed at, so 'breeds.unknown' is
# removed before the breed columns are dummified.
dogs.drop(columns='breeds.unknown', inplace=True)

# Build combined indicator columns for every column whose name contains 'breeds'
# and attach them by index. (The result reuses the name dummy_color_dog even
# though it now holds breed indicators.)
dummy_color_dog = dummy(df=dogs, label='breeds')
dogs = dogs.merge(dummy_color_dog, left_index=True, right_index=True)

In [None]:
# One-hot encode the remaining single-valued categorical columns.
dummy_cols = ['age', 'gender', 'size', 'coat', 'city']
dogs = pd.get_dummies(data=dogs, columns=dummy_cols)

### -Target Column

In [20]:
# Parse both timestamp columns into pandas datetimes.
for ts_col in ('published_at', 'status_changed_at'):
    dogs[ts_col] = pd.to_datetime(dogs[ts_col])

In [21]:
# Constructing the target from the datetime columns: time between publication
# and the status change, expressed in days and rounded to the nearest whole day.
elapsed = dogs['status_changed_at'] - dogs['published_at']
dogs['days_on_petfinder'] = (elapsed / np.timedelta64(1, 'D')).round()

**Dog adopted in under two weeks: 1, over two weeks: 0**

In [22]:
# Listings with fewer than 14 days get the positive label (1).
adopted_quickly = dogs['days_on_petfinder'].lt(14)
dogs.loc[adopted_quickly, 'days_on_petfinder'] = 1

In [23]:
# Listings with 14 or more days get the negative label (0).
adopted_slowly = dogs['days_on_petfinder'].ge(14)
dogs.loc[adopted_slowly, 'days_on_petfinder'] = 0

In [30]:
# Class counts for the binary target — the two classes are reasonably balanced.
dogs['days_on_petfinder'].value_counts()

1.0    10646
0.0     9348
Name: days_on_petfinder, dtype: int64

### Sentiment Analysis

In [49]:
# Adding a compound polarity score column
# Keep only rows that have a description. The explicit .copy() makes the
# filtered frame an independent object, so the column assignments that follow
# do not operate on a slice of the previous frame (which triggers pandas'
# SettingWithCopyWarning).
dogs = dogs[dogs['description'].notna()].copy()

# Plain list of description strings, in row order, for scoring.
desc_list = dogs['description'].tolist()

# VADER sentiment analyzer used by get_polarity.
analyzer = SentimentIntensityAnalyzer()

def get_polarity(desc_list):
    """Return the compound sentiment score for each text in ``desc_list``.

    Each entry is scored with the module-level ``analyzer`` and the
    ``'compound'`` value of its ``polarity_scores`` result is collected.

    Parameters
    ----------
    desc_list : iterable of str
        Texts to score.

    Returns
    -------
    list
        One compound score per input text, in the same order.
    """
    return [analyzer.polarity_scores(post)['compound'] for post in desc_list]

# Score every description and store the scores as a new column.
dogs['polarity'] = polarity = get_polarity(desc_list)

In [52]:
# Character count of each description.
dogs['desc_len'] = dogs['description'].map(len)

### -Finalizing DataFrame

In [None]:
# Columns excluded from the model's feature matrix: identifiers, free text,
# raw photo data, timestamps, zipcode and the target itself.
categorical = [
    'id',
    'organization_id',
    'name',
    'description',
    'photos',
    'status_changed_at',
    'published_at',
    'zipcode',
    'days_on_petfinder',
]

In [None]:
# Frame containing every column that is not listed in `categorical`.
numerical = dogs.drop(columns=categorical)

In [None]:
# Preview the first rows only; displaying the whole frame bloats the notebook output.
dogs.head()

In [None]:
#dogs.to_csv('./petfinder_data/dogs.csv', index = False)

-------

## Logreg

In [53]:
# Target is the binary adoption label; features are every column not listed in
# `categorical`.
y = dogs['days_on_petfinder']
X = dogs.drop(columns=categorical)

# 70/30 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

In [54]:
%%time
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
lr_train = lr.score(X_train, y_train)
lr_test = lr.score(X_test, y_test)

CPU times: user 1min, sys: 348 ms, total: 1min
Wall time: 10.2 s


In [55]:
# Report train and test accuracy, one per line.
print(f'train score: {lr_train}\ntest score: {lr_test}')

train score: 0.6959628438728117
test score: 0.6909484914152358


In [56]:
# Baseline: share of each class in the target column.
dogs.days_on_petfinder.value_counts(normalize=True)

1.0    0.53246
0.0    0.46754
Name: days_on_petfinder, dtype: float64

In [57]:
# Predict labels for the held-out split and preview the first 20.
y_pred = lr.predict(X_test)
y_pred[:20]

array([1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0.,
       1., 0., 0.])

In [58]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n', confusion, sep='\n')

Confusion Matrix

[[1770 1035]
 [ 819 2375]]


-------

## TFIDF

**Feature Engineering**

In [None]:
# Dropping 489 rows with no description - may replace with 'None' instead?
#dogs = dogs[dogs['description'].notna()]

In [38]:
# Replace missing descriptions with the placeholder string 'None'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: the in-place form mutates an
# intermediate object and is not guaranteed to write through to `dogs`
# (it does not under pandas Copy-on-Write).
dogs['description'] = dogs['description'].fillna('None')

In [39]:
# Text-only model: the description is the single feature, the binary adoption
# label is the target.
X, y = dogs['description'], dogs['days_on_petfinder']

In [40]:
# 70/30 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

In [41]:
# Instantiate a TF-IDF vectorizer with default settings.
# NOTE(review): the pipeline in the next cell uses CountVectorizer and nothing
# visible in this section references tvec — confirm whether it is still needed.
tvec = TfidfVectorizer()

In [42]:
# Pipeline: bag-of-words counts (CountVectorizer) feeding a Multinomial Naive
# Bayes classifier.
pipe = Pipeline(steps=[('cvec', CountVectorizer()), ('nb', MultinomialNB())])

# 5-fold cross-validated accuracy on the training split, as an estimate of
# performance on unseen data.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)
print(f'Cross-Val Score   = {cv_scores.mean()}')

# Fit on the full training split, then report training and testing accuracy.
pipe.fit(X_train, y_train)
train_accuracy = pipe.score(X_train, y_train)
test_accuracy = pipe.score(X_test, y_test)
print(f'Training Accuracy = {train_accuracy}')
print(f'Testing Accuracy  = {test_accuracy}')

# CountVectorizer hyperparameters to search over.
pipe_params = {
    # Maximum vocabulary size.
    'cvec__max_features': [5_000, 10_000, 15_000, 20_000, 25_000],
    # Minimum document frequency for a token to be kept: 2, 3, or 5.
    'cvec__min_df': [2, 3, 5],
    # No stop-word removal vs. the built-in English stop-word list.
    'cvec__stop_words': [None, 'english'],
    # Unigrams only vs. unigrams plus bigrams.
    'cvec__ngram_range': [(1, 1), (1, 2)],
}

# Grid search over the pipeline with 5-fold cross-validation, run on 6 parallel
# jobs to shorten the search.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5, n_jobs=6)

Cross-Val Score   = 0.6320828867452661
Training Accuracy = 0.7636298678099321
Testing Accuracy  = 0.6441073512252042


In [43]:
# Run the grid search on the training split.
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=6,
             param_grid={'cvec__max_features': [5000, 10000, 15000, 20000,
                                                25000],
                         'cvec__min_df': [2, 3, 5],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': [None, 'english']})

In [44]:
# Hyperparameter combination with the best mean cross-validated score in the grid search.
gs.best_params_

{'cvec__max_features': 25000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [45]:
# Mean cross-validated score achieved by the best hyperparameter combination.
gs.best_score_

0.6519471239728475