## ML Project

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
file = "aac_shelter_outcomes.csv"

In [3]:
df = pd.read_csv(file)

In [4]:
df['animal_type'].value_counts()

Dog          44242
Cat          29422
Other         4249
Bird           334
Livestock        9
Name: animal_type, dtype: int64

In [5]:
df.columns

Index(['age_upon_outcome', 'animal_id', 'animal_type', 'breed', 'color',
       'date_of_birth', 'datetime', 'monthyear', 'name', 'outcome_subtype',
       'outcome_type', 'sex_upon_outcome'],
      dtype='object')

In [6]:
df = df.loc[(df['animal_type'] == 'Dog') | (df['animal_type'] == 'Cat')]

In [7]:
dog_df = df.loc[df['animal_type'] == 'Dog']

In [8]:
len(dog_df['breed'].unique())

1893

In [9]:
cat_df = df.loc[df['animal_type'] == 'Cat']

In [10]:
len(cat_df['breed'].unique())

79

In [11]:
table = pd.pivot_table(dog_df, values='animal_id', index=['breed'], aggfunc='count')
table = table.reset_index()
table = table.rename(columns={"animal_id": "count"})
table = table.sort_values(by='count', ascending=False)

In [12]:
new_table = table.head(10)
top_10_breeds = new_table['breed'].to_list()
top_10_breeds

['Pit Bull Mix',
 'Chihuahua Shorthair Mix',
 'Labrador Retriever Mix',
 'German Shepherd Mix',
 'Australian Cattle Dog Mix',
 'Dachshund Mix',
 'Boxer Mix',
 'Miniature Poodle Mix',
 'Border Collie Mix',
 'Catahoula Mix']

In [13]:
dog_df.columns

Index(['age_upon_outcome', 'animal_id', 'animal_type', 'breed', 'color',
       'date_of_birth', 'datetime', 'monthyear', 'name', 'outcome_subtype',
       'outcome_type', 'sex_upon_outcome'],
      dtype='object')

In [14]:
dog_df['outcome_subtype'].value_counts()

Partner                9516
Foster                 2162
Suffering               684
Aggressive              501
Offsite                 254
Behavior                142
In Kennel                87
Rabies Risk              85
Medical                  60
At Vet                   23
In Foster                23
Court/Investigation      18
In Surgery                7
Possible Theft            7
Enroute                   4
Barn                      1
Snr                       1
Name: outcome_subtype, dtype: int64

In [15]:
dog_df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
5,4 months,A664462,Dog,Leonberger Mix,Brown/White,2013-06-03T00:00:00,2013-10-07T13:06:00,2013-10-07T13:06:00,*Edgar,Partner,Transfer,Intact Male
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23T00:00:00,2014-12-08T15:55:00,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female


In [16]:
dog_df['date_of_birth'] = dog_df['date_of_birth'].str.split('T', n=1, expand=True)
dog_df['datetime'] = dog_df['datetime'].str.split('T', n=1, expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
dog_df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
5,4 months,A664462,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07,2013-10-07T13:06:00,*Edgar,Partner,Transfer,Intact Male
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female


In [18]:
dog_df['date_of_birth'] = pd.to_datetime(dog_df['date_of_birth'])
dog_df['datetime'] = pd.to_datetime(dog_df['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
dog_df.dtypes

age_upon_outcome            object
animal_id                   object
animal_type                 object
breed                       object
color                       object
date_of_birth       datetime64[ns]
datetime            datetime64[ns]
monthyear                   object
name                        object
outcome_subtype             object
outcome_type                object
sex_upon_outcome            object
dtype: object

In [20]:
from datetime import date
# dog_df['age'] = dog_df['datetime'] - dog_df['date_of_birth']

In [21]:
dog_df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
5,4 months,A664462,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07,2013-10-07T13:06:00,*Edgar,Partner,Transfer,Intact Male
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female


In [22]:
import numpy as np
dog_df['age_months'] = ((dog_df.datetime - dog_df.date_of_birth)/np.timedelta64(1, 'M'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
dog_df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome,age_months
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female,12.024888
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male,14.094745
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male,108.421117
5,4 months,A664462,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07,2013-10-07T13:06:00,*Edgar,Partner,Transfer,Intact Male,4.139715
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female,36.501776


In [25]:
dog_df = dog_df.loc[(dog_df['breed']== 'Pit Bull Mix') |
                   (dog_df['breed']== 'Chihuahua Shorthair Mix') |
                   (dog_df['breed']== 'Labrador Retriever Mix') |
                    (dog_df['breed']== 'German Shepherd Mix') |
                    (dog_df['breed']== 'Australian Cattle Dog Mix') |
                    (dog_df['breed']== 'Dachshund Mix') |
                    (dog_df['breed']== 'Boxer Mix') |
                    (dog_df['breed']== 'Miniature Poodle Mix') |
                    (dog_df['breed']== 'Border Collie Mix') |
                    (dog_df['breed']== 'Catahoula Mix')]

dog_df = dog_df.loc[(dog_df['sex_upon_outcome'] != 'Unknown')]

In [26]:
dog_df['sex_upon_outcome'].value_counts()

Neutered Male    8935
Spayed Female    8011
Intact Male      2402
Intact Female    2169
Name: sex_upon_outcome, dtype: int64

In [31]:
new=dog_df['sex_upon_outcome'].str.split(" ", n=1, expand=True)
dog_df["gender"]= new[1] 
dog_df["neutered_intact_spayed"]=new[0]

In [32]:
dog_df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome,age_months,gender,neutered_intact_spayed
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female,36.501776,Female,Spayed
14,8 years,A690350,Dog,Labrador Retriever Mix,Black,2006-10-18,2014-10-26,2014-10-26T18:20:00,Shy,,Return to Owner,Neutered Male,96.26481,Male,Neutered
16,1 year,A674298,Dog,Pit Bull Mix,Brown Brindle/White,2013-03-11,2014-04-16,2014-04-16T12:51:00,*Newt,Partner,Transfer,Neutered Male,13.174809,Male,Neutered
27,2 months,A667311,Dog,Labrador Retriever Mix,Yellow,2013-09-01,2013-11-19,2013-11-19T18:30:00,Promise,,Adoption,Spayed Female,2.595536,Female,Spayed
28,4 months,A690699,Dog,Chihuahua Shorthair Mix,Blue/Tan,2014-06-17,2014-11-04,2014-11-04T18:03:00,Minnie,Partner,Transfer,Intact Female,4.599684,Female,Intact


In [33]:
#function to group colors 
def color_mapper(color_string):
    split_array = color_string.split('/')
    first_color = split_array[0]
    if (len(split_array) > 1):
        first_color_split = first_color.split(' ')
        return first_color_split[0] + ' Mix'
    else:
        first_color_split = first_color.split(' ')
        return first_color_split[0]
print(color_mapper('Black'))
print(color_mapper('Black/Brown'))
print(color_mapper('Black Blue/White'))
print(color_mapper('Brown Tiger'))

Black
Black Mix
Black Mix
Brown


In [34]:
dog_df['color'] = df['color'].apply(color_mapper)
dog_df

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome,age_months,gender,neutered_intact_spayed
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female,36.501776,Female,Spayed
14,8 years,A690350,Dog,Labrador Retriever Mix,Black,2006-10-18,2014-10-26,2014-10-26T18:20:00,Shy,,Return to Owner,Neutered Male,96.264810,Male,Neutered
16,1 year,A674298,Dog,Pit Bull Mix,Brown Mix,2013-03-11,2014-04-16,2014-04-16T12:51:00,*Newt,Partner,Transfer,Neutered Male,13.174809,Male,Neutered
27,2 months,A667311,Dog,Labrador Retriever Mix,Yellow,2013-09-01,2013-11-19,2013-11-19T18:30:00,Promise,,Adoption,Spayed Female,2.595536,Female,Spayed
28,4 months,A690699,Dog,Chihuahua Shorthair Mix,Blue Mix,2014-06-17,2014-11-04,2014-11-04T18:03:00,Minnie,Partner,Transfer,Intact Female,4.599684,Female,Intact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78246,2 years,A738395,Dog,Chihuahua Shorthair Mix,Black Mix,2015-11-14,2018-02-01,2018-02-01T18:23:00,Maxie,,Adoption,Neutered Male,26.612456,Male,Neutered
78247,2 years,A761266,Dog,Border Collie Mix,Black Mix,2015-11-01,2018-02-01,2018-02-01T18:32:00,Baily,,Return to Owner,Spayed Female,27.039570,Female,Spayed
78248,1 year,A725872,Dog,German Shepherd Mix,Tan Mix,2016-02-03,2018-02-01,2018-02-01T18:20:00,Lova,,Return to Owner,Intact Female,23.951210,Female,Intact
78249,2 years,A761265,Dog,Border Collie Mix,Black Mix,2015-11-01,2018-02-01,2018-02-01T18:31:00,Lucas,,Return to Owner,Neutered Male,27.039570,Male,Neutered


In [35]:
len(dog_df['color'].unique())

36

In [37]:
# dog_df.groupby('color')['breed'].count()

In [38]:
color = {'Orange': 'Apricot',
         'Liver Mix': 'Brown Mix',
         'Liver': 'Brown',
         'Buff': 'Brown',
         "Buff Mix": 'Brown Mix',
         "Cream": "Yellow",
         "Cream Mix": "Yellow",
         "Fawn": "Yellow",
         "Fawn Mix": "Yellow Mix",
         "Gold": "Yellow",
         "Gold Mix": "Yellow Mix",
         "Sable": 'Brown',
         "Sable Mix": "Brown Mix",
         "Silver Mix": "Gray",
         "Chocolate": "Brown",
         "Chocolate Mix": "Brown Mix"}

dog_df['color'] = dog_df['color'].replace(color)

In [39]:
dog_df['color'].value_counts()

Black Mix       4397
Brown Mix       3463
White Mix       2290
Brown           2069
Tan Mix         1632
Black           1527
Blue Mix        1129
Tan             1065
White            859
Yellow           707
Tricolor         601
Red Mix          519
Red              457
Yellow Mix       343
Blue             322
Gray Mix          39
Tricolor Mix      37
Apricot           27
Gray              25
Apricot Mix       10
Name: color, dtype: int64

In [40]:
#creating color weights
color_weights = set(dog_df['color'].array)
color_weights = list(color_weights)
color_weights

['Yellow',
 'Red',
 'Tan',
 'Tan Mix',
 'Blue',
 'White Mix',
 'Gray Mix',
 'Red Mix',
 'Apricot Mix',
 'Tricolor',
 'Brown Mix',
 'Yellow Mix',
 'Gray',
 'Tricolor Mix',
 'Black',
 'Blue Mix',
 'Brown',
 'White',
 'Apricot',
 'Black Mix']

In [41]:
dog_df['color_weights'] = dog_df['color'].apply(lambda color: color_weights.index(color))
dog_df

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome,age_months,gender,neutered_intact_spayed,color_weights
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08,2014-12-08T15:55:00,*Ella,Partner,Transfer,Spayed Female,36.501776,Female,Spayed,16
14,8 years,A690350,Dog,Labrador Retriever Mix,Black,2006-10-18,2014-10-26,2014-10-26T18:20:00,Shy,,Return to Owner,Neutered Male,96.264810,Male,Neutered,14
16,1 year,A674298,Dog,Pit Bull Mix,Brown Mix,2013-03-11,2014-04-16,2014-04-16T12:51:00,*Newt,Partner,Transfer,Neutered Male,13.174809,Male,Neutered,10
27,2 months,A667311,Dog,Labrador Retriever Mix,Yellow,2013-09-01,2013-11-19,2013-11-19T18:30:00,Promise,,Adoption,Spayed Female,2.595536,Female,Spayed,0
28,4 months,A690699,Dog,Chihuahua Shorthair Mix,Blue Mix,2014-06-17,2014-11-04,2014-11-04T18:03:00,Minnie,Partner,Transfer,Intact Female,4.599684,Female,Intact,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78246,2 years,A738395,Dog,Chihuahua Shorthair Mix,Black Mix,2015-11-14,2018-02-01,2018-02-01T18:23:00,Maxie,,Adoption,Neutered Male,26.612456,Male,Neutered,19
78247,2 years,A761266,Dog,Border Collie Mix,Black Mix,2015-11-01,2018-02-01,2018-02-01T18:32:00,Baily,,Return to Owner,Spayed Female,27.039570,Female,Spayed,19
78248,1 year,A725872,Dog,German Shepherd Mix,Tan Mix,2016-02-03,2018-02-01,2018-02-01T18:20:00,Lova,,Return to Owner,Intact Female,23.951210,Female,Intact,3
78249,2 years,A761265,Dog,Border Collie Mix,Black Mix,2015-11-01,2018-02-01,2018-02-01T18:31:00,Lucas,,Return to Owner,Neutered Male,27.039570,Male,Neutered,19


In [44]:
# dog_df['color_weights'].value_counts()
dog_df.columns

Index(['age_upon_outcome', 'animal_id', 'animal_type', 'breed', 'color',
       'date_of_birth', 'datetime', 'monthyear', 'name', 'outcome_subtype',
       'outcome_type', 'sex_upon_outcome', 'age_months', 'gender',
       'neutered_intact_spayed', 'color_weights'],
      dtype='object')

In [45]:
dog_df = dog_df[['breed', 'color', 'outcome_type', 'age_months', 'gender', 
                 'neutered_intact_spayed', 'color_weights']]

In [60]:
dog_df = dog_df.dropna()

In [61]:
dog_df.count()

breed                     21515
color                     21515
outcome_type              21515
age_months                21515
gender                    21515
neutered_intact_spayed    21515
color_weights             21515
breed_weights             21515
dtype: int64

In [62]:
dog_df.columns

Index(['breed', 'color', 'outcome_type', 'age_months', 'gender',
       'neutered_intact_spayed', 'color_weights', 'breed_weights'],
      dtype='object')

In [63]:
#breed list
breed_list = set(dog_df['breed'].array)
breed_list = list(breed_list)
breed_list

['Catahoula Mix',
 'Dachshund Mix',
 'Border Collie Mix',
 'Pit Bull Mix',
 'Boxer Mix',
 'Miniature Poodle Mix',
 'Australian Cattle Dog Mix',
 'Labrador Retriever Mix',
 'German Shepherd Mix',
 'Chihuahua Shorthair Mix']

In [64]:
#create breed weights
dog_df['breed_weights'] = dog_df['breed'].apply(lambda breed: breed_list.index(breed))
dog_df.head()
dog_df['breed_weights'].value_counts()

3    6113
9    4707
7    4562
8    1878
6    1051
1     783
4     662
5     645
2     639
0     475
Name: breed_weights, dtype: int64

In [65]:
dog_df.head()

Unnamed: 0,breed,color,outcome_type,age_months,gender,neutered_intact_spayed,color_weights,breed_weights
7,Chihuahua Shorthair Mix,Brown,Transfer,36.501776,Female,Spayed,16,9
14,Labrador Retriever Mix,Black,Return to Owner,96.26481,Male,Neutered,14,7
16,Pit Bull Mix,Brown Mix,Transfer,13.174809,Male,Neutered,10,3
27,Labrador Retriever Mix,Yellow,Adoption,2.595536,Female,Spayed,0,7
28,Chihuahua Shorthair Mix,Blue Mix,Transfer,4.599684,Female,Intact,15,9


In [66]:
dog_df['outcome_type'].value_counts()
dog_df.count()

breed                     21515
color                     21515
outcome_type              21515
age_months                21515
gender                    21515
neutered_intact_spayed    21515
color_weights             21515
breed_weights             21515
dtype: int64

In [67]:
outcome = {'Rto-Adopt': 'Return to Owner',
          'Disposal': 'Died'}
dog_df['outcome_type'] = dog_df['outcome_type'].replace(outcome)
dog_df['outcome_type'].value_counts()

Adoption           9778
Return to Owner    6248
Transfer           4541
Euthanasia          860
Died                 74
Missing              14
Name: outcome_type, dtype: int64

In [69]:
#outcome list
outcome = set(dog_df['outcome_type'].array)
outcome_list = list(outcome)
outcome_list

#create outcome weights
dog_df['outcome_weights'] = dog_df['outcome_type'].apply(lambda outcome: outcome_list.index(outcome))
dog_df.head()
dog_df['outcome_weights'].value_counts()

1    9778
2    6248
3    4541
5     860
0      74
4      14
Name: outcome_weights, dtype: int64

In [72]:
dog_df.head()

Unnamed: 0,breed,color,outcome_type,age_months,gender,neutered_intact_spayed,color_weights,breed_weights,outcome_weights,gender_weights
7,Chihuahua Shorthair Mix,Brown,Transfer,36.501776,Female,Spayed,16,9,3,1
14,Labrador Retriever Mix,Black,Return to Owner,96.26481,Male,Neutered,14,7,2,0
16,Pit Bull Mix,Brown Mix,Transfer,13.174809,Male,Neutered,10,3,3,0
27,Labrador Retriever Mix,Yellow,Adoption,2.595536,Female,Spayed,0,7,1,1
28,Chihuahua Shorthair Mix,Blue Mix,Transfer,4.599684,Female,Intact,15,9,3,1


In [71]:
#gender list
gender = set(dog_df['gender'].array)
gender_list = list(gender)
gender_list

#create gender weights
dog_df['gender_weights'] = dog_df['gender'].apply(lambda gender: gender_list.index(gender))
dog_df.head()
dog_df['gender_weights'].value_counts()

0    11336
1    10179
Name: gender_weights, dtype: int64

In [74]:
#neutered list
neuter = set(dog_df['neutered_intact_spayed'].array)
neuter_list = list(neuter)
neuter_list

#create gender weights
dog_df['neuter_weights'] = dog_df['neutered_intact_spayed'].apply(lambda neuter: neuter_list.index(neuter))
dog_df.head()
dog_df['neuter_weights'].value_counts()

0    8935
1    8011
2    4569
Name: neuter_weights, dtype: int64

In [75]:
dog_df.head()

Unnamed: 0,breed,color,outcome_type,age_months,gender,neutered_intact_spayed,color_weights,breed_weights,outcome_weights,gender_weights,neuter_weights
7,Chihuahua Shorthair Mix,Brown,Transfer,36.501776,Female,Spayed,16,9,3,1,1
14,Labrador Retriever Mix,Black,Return to Owner,96.26481,Male,Neutered,14,7,2,0,0
16,Pit Bull Mix,Brown Mix,Transfer,13.174809,Male,Neutered,10,3,3,0,0
27,Labrador Retriever Mix,Yellow,Adoption,2.595536,Female,Spayed,0,7,1,1,1
28,Chihuahua Shorthair Mix,Blue Mix,Transfer,4.599684,Female,Intact,15,9,3,1,2


In [77]:
dog_df.columns

Index(['breed', 'color', 'outcome_type', 'age_months', 'gender',
       'neutered_intact_spayed', 'color_weights', 'breed_weights',
       'outcome_weights', 'gender_weights', 'neuter_weights'],
      dtype='object')

In [78]:
final_dog_df = dog_df[['age_months','color_weights', 'breed_weights','outcome_weights', 
                       'gender_weights', 'neuter_weights']]

In [79]:
final_dog_df.head()

Unnamed: 0,age_months,color_weights,breed_weights,outcome_weights,gender_weights,neuter_weights
7,36.501776,16,9,3,1,1
14,96.26481,14,7,2,0,0
16,13.174809,10,3,3,0,0
27,2.595536,0,7,1,1,1
28,4.599684,15,9,3,1,2


In [80]:
final_dog_df.dtypes

age_months         float64
color_weights        int64
breed_weights        int64
outcome_weights      int64
gender_weights       int64
neuter_weights       int64
dtype: object

## ML

In [81]:
feature_names = ["age_months", 'color_weights', 'breed_weights', 'gender_weights', 'neuter_weights']
X = final_dog_df[feature_names]
y = final_dog_df['outcome_weights']
X

Unnamed: 0,age_months,color_weights,breed_weights,gender_weights,neuter_weights
7,36.501776,16,9,1,1
14,96.264810,14,7,0,0
16,13.174809,10,3,0,0
27,2.595536,0,7,1,1
28,4.599684,15,9,1,2
...,...,...,...,...,...
78246,26.612456,19,9,0,0
78247,27.039570,19,2,1,1
78248,23.951210,3,8,1,2
78249,27.039570,19,2,0,0


## Creating training and test sets

In [82]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Applying Scaling

In [83]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Build Models

In [84]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

print('Accuracy of logistic regression on training', logreg.score(X_train_scaled, y_train))
print('Accuracy of logistic regression on testing', logreg.score(X_test_scaled, y_test))

Accuracy of logistic regression on training 0.5654437283093704
Accuracy of logistic regression on testing 0.5718535043688417




In [85]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled, y_train)

print('Accuracy of Decision tree on training', dt.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt.score(X_test_scaled, y_test))

Accuracy of Decision tree on training 0.9559370352007932
Accuracy of Decision tree on testing 0.5268637293177171


In [86]:
# Setting max decision tree depth to help avoid overfitting
dt2 = DecisionTreeClassifier(max_depth=3)
dt2.fit(X_train_scaled, y_train)
print('Accuracy of Decision tree on training', dt2.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt2.score(X_test_scaled, y_test))

Accuracy of Decision tree on training 0.579387704511651
Accuracy of Decision tree on testing 0.5748280349507343


In [87]:
# K-nearest neighbor 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
print('Accuracy of Knn on training', knn.score(X_train_scaled, y_train))
print('Accuracy of Knn on testing', knn.score(X_test_scaled, y_test))

Accuracy of Knn on training 0.6734630639563709
Accuracy of Knn on testing 0.5644171779141104


In [90]:
# Linear Discriminant Analysis 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled,y_train)
print('Accuracy of Linear Discriminant on training', lda.score(X_train_scaled, y_train))
print('Accuracy of Linear Discriminant on testing', lda.score(X_test_scaled, y_test))

Accuracy of Linear Discriminant on training 0.556271690629648
Accuracy of Linear Discriminant on testing 0.5629299126231642


In [91]:
# Gaussian Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
print('Accuracy of GNB on training', gnb.score(X_train_scaled, y_train))
print('Accuracy of GNB on testing', gnb.score(X_test_scaled, y_test))

Accuracy of GNB on training 0.5621591472483887
Accuracy of GNB on testing 0.5712957798847369


In [92]:
# Support Vector Machine
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train_scaled, y_train)
print('Accuracy of SVM on training', svm.score(X_train_scaled, y_train))
print('Accuracy of SVM on testing', svm.score(X_test_scaled, y_test))



Accuracy of SVM on training 0.5619732275656916
Accuracy of SVM on testing 0.5623721881390593


In [93]:
import pickle
pickle.dump(dt2, open('model.pkl','wb'))

In [103]:
model = pickle.load( open('model.pkl','rb'))
print(model.predict([[12, 18, 9, 0, 0]]))

[2]


In [96]:
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [132]:
# "age_months", 'color_weights', 'breed_weights', 'gender_weights', 'neuter_weights'
for i in outcome_list:
#     print(i)
    index_num = outcome_list.index(i)
    print(f' {i}: {index_num} <br>')
    
for i in breed_list:
#     print(i)
    index_num = breed_list.index(i)
    print(f' {i}: {index_num} <br>')
    
for i in gender_list:
#     print(i)
    index_num = gender_list.index(i)
    print(f' {i}: {index_num} <br>')
    
for i in neuter_list:
#     print(i)
    index_num = neuter_list.index(i)
    print(f' {i}: {index_num} <br>')

 Died: 0 <br>
 Adoption: 1 <br>
 Return to Owner: 2 <br>
 Transfer: 3 <br>
 Missing: 4 <br>
 Euthanasia: 5 <br>
 Catahoula Mix: 0 <br>
 Dachshund Mix: 1 <br>
 Border Collie Mix: 2 <br>
 Pit Bull Mix: 3 <br>
 Boxer Mix: 4 <br>
 Miniature Poodle Mix: 5 <br>
 Australian Cattle Dog Mix: 6 <br>
 Labrador Retriever Mix: 7 <br>
 German Shepherd Mix: 8 <br>
 Chihuahua Shorthair Mix: 9 <br>
 Male: 0 <br>
 Female: 1 <br>
 Neutered: 0 <br>
 Spayed: 1 <br>
 Intact: 2 <br>


## Weight Legend

<b> Outcome </b> <br>
Died: 0 <br>
 Adoption: 1 <br>
 Return to Owner: 2 <br>
 Transfer: 3 <br>
 Missing: 4 <br>
 Euthanasia: 5 <br>
 
Breed <br>
Catahoula Mix: 0 <br>
 Dachshund Mix: 1 <br>
 Border Collie Mix: 2 <br>
 Pit Bull Mix: 3 <br>
 Boxer Mix: 4 <br>
 Miniature Poodle Mix: 5 <br>
 Australian Cattle Dog Mix: 6 <br>
 Labrador Retriever Mix: 7 <br>
 German Shepherd Mix: 8 <br>
 Chihuahua Shorthair Mix: 9 <br>
 
 Gender <br>
 Male: 0 <br>
 Female: 1 <br>
 
 Neuter <br>
 Neutered: 0 <br>
 Spayed: 1 <br>
 Intact: 2 <br>