  ![ImdbIcon](images/pets.jpg)
# Petfinder: Predicting Adoption Speed 

## 1. Imports and Data Cleaning

In [1]:
import petpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
#dogs1 = pd.read_csv('./petfinder_data/last.csv')

In [4]:
#dawg = pd.read_csv('./petfinder_data/last1.csv')

In [5]:
#lastdog = pd.read_csv('./petfinder_data/last2.csv')

In [6]:
#lastdog.shape

In [7]:
#dogs1.shape

In [8]:
#dogs.shape

In [9]:
#dogs1 = pd.concat((dogs1, dawg))

In [10]:
#dogs = pd.concat((dogs1, lastdog))

In [11]:
#dogs.sort_values(by=['published_at'], inplace=True)

In [12]:
#dogs.drop_duplicates(subset=['id'], inplace=True)

In [13]:
#dogs.to_csv('./petfinder_data/dogs.csv', index = False)

In [14]:
dogs = pd.read_csv('./petfinder_data/dogs.csv')

 ______

## Feature Engineering

In [15]:
# Renaming some columns for easier readability/coding

dogs.rename(columns={"attributes.spayed_neutered": "fixed", "attributes.house_trained": "house_trained", 'attributes.special_needs': 'special_needs', 
                      'attributes.shots_current': 'shots_current','environment.children': 'good_with_kids', 'environment.dogs': 'good_with_dogs', 
                      'environment.cats': 'good_with_cats', 'contact.address.city': 'city', 'contact.address.postcode': 'zipcode' }, inplace=True)
dogs.head(1)

Unnamed: 0,id,organization_id,url,type,species,age,gender,size,coat,tags,name,description,organization_animal_id,photos,videos,status,status_changed_at,published_at,distance,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,colors.primary,colors.secondary,colors.tertiary,fixed,house_trained,attributes.declawed,special_needs,shots_current,good_with_kids,good_with_dogs,good_with_cats,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,contact.email,contact.phone,contact.address.address1,contact.address.address2,city,contact.address.state,zipcode,contact.address.country,animal_id,animal_type,organization_id.1,primary_photo_cropped
0,43672066,CA1005,https://www.petfinder.com/dog/spirit-s-litter-...,Dog,Dog,Baby,Male,Medium,Medium,[],Spirit (S Litter),EMAIL: stonecliffeadoption@gmail.com for an a...,,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],adopted,2019-02-10T03:03:57+0000,2019-01-02T03:49:40+0000,16.3248,Labrador Retriever,,True,False,Black,,,True,False,,False,True,,True,,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,contact@stoneclifferescue.org,,,,Dublin,CA,94568.0,US,43672066,dog,ca1005,


In [16]:
# Making sure all cities match up and no duplicates - combining modesto which appeared in two cases
dogs['city']=dogs['city'].replace('modesto','Modesto')

In [17]:
# Converting videos column to 1 if post has a video, 0 if not

dogs['videos'] = np.where(dogs['videos']!= '[]', 1, 0)

In [18]:
# Adding a column, 1 if post has photos, 0 if not

dogs['has_photo'] = np.where(dogs['photos']!= '[]', 1, 0)

In [20]:
dogs['good_with_kids'] = dogs['good_with_kids'].fillna(2)
dogs['good_with_dogs'] = dogs['good_with_dogs'].fillna(2)
dogs['good_with_cats'] = dogs['good_with_cats'].fillna(2)

dogs['fixed'] = (dogs['fixed'] * 1).astype(int)
dogs['house_trained'] = (dogs['house_trained'] * 1).astype(int)
dogs['shots_current'] = (dogs['shots_current'] * 1).astype(int)
dogs['good_with_kids'] = (dogs['good_with_kids'] * 1).astype(int)
dogs['good_with_dogs'] = (dogs['good_with_dogs'] * 1).astype(int)
dogs['good_with_cats'] = (dogs['good_with_cats'] * 1).astype(int)
dogs['special_needs'] = (dogs['special_needs'] * 1).astype(int)

In [None]:
# Dummifying some categorical columns

dogs = pd.get_dummies(dogs, columns=['age', 'gender', 'size', 'coat', 'city'])

In [21]:
# Function to dummify and combine multiclass columns (ie. primary.breed, secondary.breed)

def dummy(df, label):
    cols = [col for col in df if label in col]
    dummy_dict = {}
    main_df = pd.get_dummies(df[cols[0]],prefix = label)
    for col in cols[1:]:
        dummy_dict[col] = pd.get_dummies(df[col],prefix = label)
    for col in dummy_dict:
        main_df = main_df.add(dummy_dict[col], fill_value = 0).gt(0)*1
    return main_df

In [22]:
# Running function for color columns and merging dummified df back to original df

dummy_color_dog = dummy(dogs, 'color') 
dogs = pd.merge(dogs, dummy_color_dog, left_index=True, right_index=True)

In [23]:
# Running function for breed columns and merging dummified df back to original df

dogs.drop(columns=['breeds.unknown'], inplace=True) # All breeds are known or guessed at so dropping this column
dummy_color_dog = dummy(dogs, 'breeds') 
dogs = pd.merge(dogs, dummy_color_dog, left_index=True, right_index=True)

In [24]:
# Dropping columns deemed by me to be irrelevant

dogs.drop(columns=['type', 'species', 'url', 'tags', 'organization_animal_id', 'status', 'breeds.primary', 'breeds.secondary', 'breeds.mixed', 
                   'colors.primary', 'colors.secondary', 'colors.tertiary', 'attributes.declawed', 'primary_photo_cropped.small', 'primary_photo_cropped.medium',
                   'primary_photo_cropped.large', 'primary_photo_cropped.full', 'contact.email', 'contact.phone', 'contact.address.address1',
                   'contact.address.address2', 'contact.address.country', 'contact.address.state', 'animal_id', 'animal_type', 'organization_id.1', 
                   'primary_photo_cropped'], inplace=True)

In [20]:
# Converting columns to datetime

dogs['published_at']= pd.to_datetime(dogs['published_at'])
dogs['status_changed_at']= pd.to_datetime(dogs['status_changed_at'])

In [21]:
# Consructing target column

dogs['days_on_petfinder'] = dogs['status_changed_at'] - dogs['published_at']
dogs['days_on_petfinder'] = dogs['days_on_petfinder'] / np.timedelta64(1, 'D')
dogs['days_on_petfinder'] = dogs['days_on_petfinder'].round()

In [27]:
dogs.dtypes

id                                            int64
organization_id                              object
name                                         object
description                                  object
photos                                       object
                                             ...   
breeds_Wirehaired Terrier                     int64
breeds_Xoloitzcuintli / Mexican Hairless      int64
breeds_Yellow Labrador Retriever              int64
breeds_Yorkshire Terrier                      int64
days_on_petfinder                           float64
Length: 390, dtype: object

**For initial model I am choosing to predict if a dog is adopted within 2 weeks or not**

In [22]:
dogs.loc[dogs['days_on_petfinder'] < 14, 'days_on_petfinder'] = 1

In [23]:
dogs.loc[dogs['days_on_petfinder'] >= 14, 'days_on_petfinder'] = 0

In [24]:
#dogs.to_csv('./petfinder_data/dogs_eda.csv', index = False)

In [59]:
dogs.head()

Unnamed: 0,id,organization_id,name,description,photos,videos,status_changed_at,published_at,distance,fixed,house_trained,special_needs,shots_current,good_with_kids,good_with_dogs,good_with_cats,zipcode,has_photo,age_Adult,age_Baby,age_Senior,age_Young,gender_Female,gender_Male,size_Extra Large,size_Large,size_Medium,size_Small,coat_Curly,coat_Hairless,coat_Long,coat_Medium,coat_Short,coat_Wire,city_ Pleasanton,city_AUBURN,city_Alameda,city_Antelope,city_Antioch,city_Atwater,city_Auburn,city_Benicia,city_Berkeley,city_Bodega,city_Brentwood,city_Burlingame,city_Calistoga,city_Campbell,city_Castro Valley,city_Citrus Heights,city_Clayton,city_Colusa,city_Concord,city_Cotati,city_Cupertino,city_Danville,city_Davis,city_Dublin,city_EL DORADO HILLS,city_El Dorado Hills,city_Elk Grove,city_Escalon,city_Fair Oaks,city_Folsom,city_Forestville,city_Foster City,city_Fremont,city_Gilroy,city_Granite Bay,city_Gualala,city_Guerneville,city_Gustine,city_Half Moon Bay,city_Hayward,city_Herald,city_Hollister,city_Jackson,city_Jamestown,city_Kensington,city_Knightsen,city_La Honda,city_Lafayette,city_Lincoln,city_Lodi,city_Los Banos,city_Lower Lake,city_Manteca,city_Martinez,city_Marysville,city_Menlo Park,city_Millbrae,city_Modesto,city_Modesto.1,city_Monte Rio,city_Monterey,city_Morgan Hill,city_Mountain View,city_NAPA,city_Napa,city_Novato,city_Oakdale,city_Oakland,city_Oakley,city_Olivehurst,city_Orangevale,city_Pacific Grove,city_Pacifica,city_Paicines,city_Palo Alto,city_Patterson,city_Penngrove,city_Petaluma,city_Pittsburg,city_Placerville,city_Pleasant Grove,city_Pleasant Hill,city_Pleasanton,city_Redwood City,city_Redwood Estates,city_Richmond,city_Rocklin,city_Rocklin.1,city_Rohnert Park,city_Roseville,city_SACRAMENTO,city_Sacramento,city_Sacramento.1,city_San Andreas,city_San Francisco,city_San Jose,city_San Martin,city_San Mateo,city_San Pablo,city_San Rafael,city_San Ramon,city_Santa Clara,city_Santa Cruz,city_Santa Rosa,city_Shingle Springs,city_Sonoma,city_Stockton,city_Sunnyvale,city_Tracy,city_Tres Pinos,city_Turlock,city_Vacaville,city_Vallejo,city_Walnut Creek,city_Waterford,city_Watsonville,city_Wilton,city_Windsor,city_Winters,city_Woodland,city_Yuba City,city_los altos,city_napa,city_woodland,color_Apricot / Beige,color_Bicolor,color_Black,color_Brindle,color_Brown / Chocolate,color_Golden,color_Gray / Blue / Silver,color_Harlequin,color_Merle (Blue),color_Merle (Red),color_Red / Chestnut / Orange,color_Sable,"color_Tricolor (Brown, Black, & White)",color_White / Cream,color_Yellow / Tan / Blond / Fawn,breeds_Affenpinscher,breeds_Afghan Hound,breeds_Airedale Terrier,breeds_Akbash,breeds_Akita,breeds_Alaskan Malamute,breeds_American Bulldog,breeds_American Bully,breeds_American Eskimo Dog,breeds_American Foxhound,breeds_American Hairless Terrier,breeds_American Staffordshire Terrier,breeds_Anatolian Shepherd,breeds_Aussiedoodle,breeds_Australian Cattle Dog / Blue Heeler,breeds_Australian Kelpie,breeds_Australian Shepherd,breeds_Australian Terrier,breeds_Basenji,breeds_Basset Hound,breeds_Beagle,breeds_Bearded Collie,breeds_Beauceron,breeds_Bedlington Terrier,breeds_Belgian Shepherd / Malinois,breeds_Belgian Shepherd / Sheepdog,breeds_Belgian Shepherd / Tervuren,breeds_Bernese Mountain Dog,breeds_Bichon Frise,breeds_Black Labrador Retriever,breeds_Black Mouth Cur,breeds_Black Russian Terrier,breeds_Black and Tan Coonhound,breeds_Bloodhound,breeds_Blue Lacy,breeds_Bluetick Coonhound,breeds_Boerboel,breeds_Bolognese,breeds_Border Collie,breeds_Border Terrier,breeds_Boston Terrier,breeds_Bouvier des Flandres,breeds_Boxer,breeds_Briard,breeds_Brittany Spaniel,breeds_Brussels Griffon,breeds_Bull Terrier,breeds_Bullmastiff,breeds_Cairn Terrier,breeds_Canaan Dog,breeds_Cane Corso,breeds_Cardigan Welsh Corgi,breeds_Carolina Dog,breeds_Catahoula Leopard Dog,breeds_Cattle Dog,breeds_Cavalier King Charles Spaniel,breeds_Chesapeake Bay Retriever,breeds_Chihuahua,breeds_Chinese Crested Dog,breeds_Chinook,breeds_Chiweenie,breeds_Chocolate Labrador Retriever,breeds_Chow Chow,breeds_Clumber Spaniel,breeds_Cockapoo,breeds_Cocker Spaniel,breeds_Collie,breeds_Coonhound,breeds_Corgi,breeds_Coton de Tulear,breeds_Dachshund,breeds_Dalmatian,breeds_Doberman Pinscher,breeds_Dogo Argentino,breeds_Dogue de Bordeaux,breeds_Dutch Shepherd,breeds_English Bulldog,breeds_English Cocker Spaniel,breeds_English Foxhound,breeds_English Pointer,breeds_English Setter,breeds_English Shepherd,breeds_English Springer Spaniel,breeds_Eskimo Dog,breeds_False,breeds_Finnish Spitz,breeds_Flat-Coated Retriever,breeds_Fox Terrier,breeds_Foxhound,breeds_French Bulldog,breeds_German Shepherd Dog,breeds_German Shorthaired Pointer,breeds_German Spitz,breeds_German Wirehaired Pointer,breeds_Giant Schnauzer,breeds_Golden Retriever,breeds_Goldendoodle,breeds_Great Dane,breeds_Great Pyrenees,breeds_Greater Swiss Mountain Dog,breeds_Greyhound,breeds_Harrier,breeds_Havanese,breeds_Hound,breeds_Husky,breeds_Ibizan Hound,breeds_Irish Setter,breeds_Irish Terrier,breeds_Irish Wolfhound,breeds_Italian Greyhound,breeds_Jack Russell Terrier,breeds_Japanese Chin,breeds_Jindo,breeds_Keeshond,breeds_Kishu,breeds_Klee Kai,breeds_Labradoodle,breeds_Labrador Retriever,breeds_Lakeland Terrier,breeds_Lancashire Heeler,breeds_Leonberger,breeds_Lhasa Apso,breeds_Maltese,breeds_Maltipoo,breeds_Manchester Terrier,breeds_Maremma Sheepdog,breeds_Mastiff,breeds_McNab,breeds_Miniature Dachshund,breeds_Miniature Pinscher,breeds_Miniature Poodle,breeds_Miniature Schnauzer,breeds_Mixed Breed,breeds_Morkie,breeds_Mountain Cur,breeds_Mountain Dog,breeds_Neapolitan Mastiff,breeds_Newfoundland Dog,breeds_Norfolk Terrier,breeds_Norwich Terrier,breeds_Nova Scotia Duck Tolling Retriever,breeds_Old English Sheepdog,breeds_Otterhound,breeds_Papillon,breeds_Parson Russell Terrier,breeds_Patterdale Terrier / Fell Terrier,breeds_Pekingese,breeds_Pembroke Welsh Corgi,breeds_Peruvian Inca Orchid,breeds_Petit Basset Griffon Vendeen,breeds_Pharaoh Hound,breeds_Pit Bull Terrier,breeds_Plott Hound,breeds_Pointer,breeds_Pomeranian,breeds_Pomsky,breeds_Poodle,breeds_Portuguese Podengo,breeds_Portuguese Water Dog,breeds_Presa Canario,breeds_Pug,breeds_Puggle,breeds_Puli,breeds_Pyrenean Shepherd,breeds_Rat Terrier,breeds_Redbone Coonhound,breeds_Retriever,breeds_Rhodesian Ridgeback,breeds_Rottweiler,breeds_Rough Collie,breeds_Saint Bernard,breeds_Samoyed,breeds_Schipperke,breeds_Schnauzer,breeds_Schnoodle,breeds_Scottish Terrier,breeds_Setter,breeds_Shar-Pei,breeds_Sheep Dog,breeds_Shepherd,breeds_Shetland Sheepdog / Sheltie,breeds_Shiba Inu,breeds_Shih Tzu,breeds_Shih poo,breeds_Siberian Husky,breeds_Silky Terrier,breeds_Skye Terrier,breeds_Smooth Collie,breeds_Smooth Fox Terrier,breeds_Spaniel,breeds_Spitz,breeds_Staffordshire Bull Terrier,breeds_Standard Poodle,breeds_Standard Schnauzer,breeds_Swedish Vallhund,breeds_Terrier,breeds_Tibetan Mastiff,breeds_Tibetan Spaniel,breeds_Tibetan Terrier,breeds_Toy Fox Terrier,breeds_Treeing Walker Coonhound,breeds_True,breeds_Vizsla,breeds_Weimaraner,breeds_Welsh Terrier,breeds_West Highland White Terrier / Westie,breeds_Wheaten Terrier,breeds_Whippet,breeds_White German Shepherd,breeds_Wire Fox Terrier,breeds_Wirehaired Dachshund,breeds_Wirehaired Pointing Griffon,breeds_Wirehaired Terrier,breeds_Xoloitzcuintli / Mexican Hairless,breeds_Yellow Labrador Retriever,breeds_Yorkshire Terrier,days_on_petfinder,polarity,desc_len
0,43672066,CA1005,Spirit (S Litter),EMAIL: stonecliffeadoption@gmail.com for an a...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,0,2019-02-10 03:03:57+00:00,2019-01-02 03:49:40+00:00,16.3248,1,0,0,1,2,1,2,94568.0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.1779,169
1,43672069,CA1005,Tachi,EMAIL: stonecliffeadoption@gmail.com for an a...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,0,2019-03-10 05:20:18+00:00,2019-01-02 03:57:32+00:00,16.3248,1,0,0,1,2,1,2,94568.0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.4588,196
2,43672079,CA1005,Walley ( W Litter pup ),EMAIL: stonecliffeadoption@gmail.com for an a...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,0,2019-02-10 03:04:27+00:00,2019-01-02 04:02:15+00:00,16.3248,1,0,0,1,2,1,2,94568.0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,168
3,43672084,CA1005,Wesley ( W litter pup ),EMAIL: stonecliffeadoption@gmail.com for an a...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,0,2019-01-20 00:45:54+00:00,2019-01-02 04:06:50+00:00,16.3248,1,0,0,1,2,1,2,94568.0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.5106,170
4,43672090,CA1005,"Wilbur *Adoption in process JL""",EMAIL: stonecliffeadoption@gmail.com for an a...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,0,2019-03-24 03:42:52+00:00,2019-01-02 04:11:05+00:00,16.3248,1,0,0,1,2,1,2,94568.0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,175


In [30]:
# classes decently balanced

dogs.days_on_petfinder.value_counts()

1.0    10646
0.0     9348
Name: days_on_petfinder, dtype: int64

In [31]:
# Creating a list of my categorical columns for later use in modeling
categorical = ['id', 'organization_id', 'name', 'description', 'photos', 'status_changed_at', 'published_at', 'zipcode', 'days_on_petfinder']

In [32]:
# Creating a df of numerical columns for later use in modeling
numerical = dogs.drop(categorical, axis=1)

In [33]:
#plt.figure(figsize=(15,30))
#sns.heatmap(numerical.corr()[['days_on_petfinder']].sort_values(by='days_on_petfinder', ascending=True),
           # vmin = -1, vmax=1,
           # cmap ='winter',
           # annot = True,);
#plt.title("Correlation With Adoption Speed - Cats", size = 20);

### Sentiment Analysis

In [46]:
dogs = dogs[dogs['description'].notna()]

In [47]:
# Converting the body of my comments to a list in order to feed it into my function below

desc_list = dogs['description'].tolist()

In [48]:
# Instantiating sentiment intensity analyzer

analyzer = SentimentIntensityAnalyzer()

In [49]:
def get_polarity(desc_list):
    polarity = []
    for post in desc_list:
        vs = analyzer.polarity_scores(post) # Analyzing polarity scores by individual description
        polarity.append(vs['compound']) # Appending polarity scores to my list- polarity
    return polarity

In [50]:
polarity = get_polarity(desc_list)

In [51]:
# Adding polarity column
dogs['polarity'] = polarity

In [52]:
# Adding description length column
dogs['desc_len'] = [len(x) for x in dogs['description']]

-------

## Logreg

In [53]:
# Dropping columns from data frame and dummifying categorical columns
X = dogs.drop(columns=categorical)
y = dogs['days_on_petfinder']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42)

In [54]:
%%time
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
lr_train = lr.score(X_train, y_train)
lr_test = lr.score(X_test, y_test)

CPU times: user 1min, sys: 348 ms, total: 1min
Wall time: 10.2 s


In [55]:
print(f'train score: {lr_train}')
print(f'test score: {lr_test}')

train score: 0.6959628438728117
test score: 0.6909484914152358


In [56]:
# Baseline
dogs['days_on_petfinder'].value_counts(normalize=True)

1.0    0.53246
0.0    0.46754
Name: days_on_petfinder, dtype: float64

In [57]:
y_pred=lr.predict(X_test)
y_pred[0:20]

array([1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0.,
       1., 0., 0.])

In [58]:
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

Confusion Matrix

[[1770 1035]
 [ 819 2375]]


-------

## TFIDF

**Feature Engineering**

In [None]:
# Dropping 489 columns with no description - may replace with none?
#dogs = dogs[dogs['description'].notna()]

In [38]:
dogs['description'].fillna('None', inplace=True)

In [39]:
X = dogs['description']
y = dogs['days_on_petfinder']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    stratify=y,
                                                    random_state=42)

In [41]:
tvec = TfidfVectorizer()

In [42]:
# Instantiating a pipeline, and specifying Multinomial Naive Bayes as the estimator, and CountVectorizer as the transformer.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Evaluating how my model will perform on unseen data.
print(f'Cross-Val Score   = {cross_val_score(pipe, X_train, y_train, cv = 5).mean()}') 

# Fitting my model.
pipe.fit(X_train, y_train)

# Looking at training and testing scores for my pipeline.
print(f'Training Accuracy = {pipe.score(X_train, y_train)}')
print(f'Testing Accuracy  = {pipe.score(X_test, y_test)}')

# Defining my list of cvec hyperparameters to test in my gridsearch.
pipe_params = {
    # Defining max numbers of features to be fit.
    'cvec__max_features': [5_000, 10_000, 15_000, 20_000, 25_000],
    # Specifying that a token must occur a minumum of 2, 3, or 5 times to be factored into the model.
    'cvec__min_df': [2, 3, 5],
    # Specifying no stop words, English stop words, and my own stop words (defined above).
    'cvec__stop_words': [None, 'english'],
    # Checking individual tokens, and individual tokens and 2-grams.
    'cvec__ngram_range': [(1,1), (1,2)]
}

# Instantiating GridSearchCV on my cvec-transformed data.
gs = GridSearchCV(pipe, 
                  param_grid = pipe_params,
                  # 5-fold cross-validation.
                  cv = 5,                   
                  # Defining n_jobs to be 12 so my gridsearch runs faster! I have 8 cores, so I can safely run up to 16 threads, but
                  # because there are dimishing returns as I increase the number, I'm setting it at a slightly more modest 12.
                  n_jobs = 6) 

Cross-Val Score   = 0.6320828867452661
Training Accuracy = 0.7636298678099321
Testing Accuracy  = 0.6441073512252042


In [43]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=6,
             param_grid={'cvec__max_features': [5000, 10000, 15000, 20000,
                                                25000],
                         'cvec__min_df': [2, 3, 5],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': [None, 'english']})

In [44]:
# Looking at the best parameters my gridsearch chose for this model.
gs.best_params_

{'cvec__max_features': 25000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [45]:
gs.best_score_

0.6519471239728475