In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import math
import statistics
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
import regex as re

In [2]:
# Load the original Muddy Paws Excel export into a DataFrame
source_workbook = 'muddy_p_original_data.xlsx'
muddy_p_df = pd.read_excel(source_workbook)

In [3]:
# Set Jupyter notebook to show all columns in the dataframe
# pd.set_option('max_columns', None)

### Update the column names to a format that is friendly for the pandas methods.

In [4]:
# Make the column names attribute-friendly in one chained pass:
#   1. spaces become underscores
#   2. everything is lower-cased
#   3. apostrophes are dropped
muddy_p_df.columns = (
    muddy_p_df.columns
    .str.replace(' ', '_')
    .str.lower()
    .str.replace("'", "")
)

In [5]:
# Strip or normalise the remaining punctuation in the column names.
# Every pattern below is meant literally, so regex matching is switched off:
#   - '?' on its own is not a valid regular expression
#   - '.1' as a regular expression would match ANY character followed by 1,
#     not just the literal '.1' suffix
muddy_p_df.columns = (
    muddy_p_df.columns
    # Remove question marks ?
    .str.replace('?', '', regex = False)
    # Remove opening parenthesis (
    .str.replace('(', '', regex = False)
    # Remove closing parenthesis )
    .str.replace(')', '', regex = False)
    # Replace forward slash / with underscore _
    .str.replace('/', '_', regex = False)
    # Replace .1 with _1
    .str.replace('.1', '_1', regex = False)
)

In [6]:
# Display the cleaned column names to confirm the renaming steps above
muddy_p_df.columns

Index(['pet_name', 'pet_breed', 'date_pet_entered_your_care',
       'length_of_stay_days', 'adoption_foster_date', 'current_status',
       'pet_type', 'pet_age', 'pet_owners_cell_number',
       'pet_owners_work_phone', 'pet_owners_home_number', 'petstablished_id',
       'size', 'weight', 'color', 'shedding', 'coat_length', 'temperament',
       'breed_type', 'current_location', 'current_foster_adopter',
       'current_foster_adopter_email', 'current_foster_adopter_phone_number',
       'date_placed_in_current_location', 'microchip_id',
       'microchip_manufacturer', 'adoption_fee', 'internal_id',
       'animal_control_id', 'adoption_application_visible',
       'foster_application_visible', 'last_updated_at', 'gender',
       'previous_owner_information', 'acquired_by', 'date_of_birth',
       'age_in_years', 'declawed_status', 'coat_pattern', 'events_attendance',
       'is_mix', 'shots_up_to_date', 'spayed_neutered', 'hypoallergenic',
       'housebroken', 'special_need', 'ne

In [7]:
# Columns holding personal information about owners, fosters, and adopters
personal_info_columns = [
    'pet_owners_cell_number',
    'pet_owners_work_phone',
    'pet_owners_home_number',
    'current_foster_adopter',
    'current_foster_adopter_email',
    'current_foster_adopter_phone_number',
    'previous_owner_information',
    'current_location',
]
# Remove them from the dataframe
muddy_p_df = muddy_p_df.drop(columns = personal_info_columns)

In [8]:
# Display the remaining column names to confirm the personal-information columns are gone
muddy_p_df.columns

Index(['pet_name', 'pet_breed', 'date_pet_entered_your_care',
       'length_of_stay_days', 'adoption_foster_date', 'current_status',
       'pet_type', 'pet_age', 'petstablished_id', 'size', 'weight', 'color',
       'shedding', 'coat_length', 'temperament', 'breed_type',
       'date_placed_in_current_location', 'microchip_id',
       'microchip_manufacturer', 'adoption_fee', 'internal_id',
       'animal_control_id', 'adoption_application_visible',
       'foster_application_visible', 'last_updated_at', 'gender',
       'acquired_by', 'date_of_birth', 'age_in_years', 'declawed_status',
       'coat_pattern', 'events_attendance', 'is_mix', 'shots_up_to_date',
       'spayed_neutered', 'hypoallergenic', 'housebroken', 'special_need',
       'needs_foster', 'gets_along_with_cats', 'gets_along_with_dogs',
       'gets_along_with_kids', 'description', 'internal_notes',
       'behavioral_tracking_notes', 'has_finalized_application',
       'where_was_pet_originally_found', 'date_pet_ente

In [9]:
# Convert every column to text and lower-case it so later string matching is case-insensitive.
# NOTE(review): astype(str) stringifies every column, so missing values become text
# (e.g. 'nan') and numeric/date columns become strings as well - confirm this is intended.
def _as_lowercase_text(column):
    """Return the column cast to str with every value lower-cased."""
    return column.astype(str).str.lower()

muddy_p_df = muddy_p_df.apply(_as_lowercase_text)

In [10]:
# Show every column when displaying the dataframe.
# The fully qualified option name is used because the abbreviated 'max_columns'
# also matches 'styler.render.max_columns' in newer pandas and raises an OptionError.
pd.set_option('display.max_columns', None)
# muddy_p_df.head(5)

### Change the data types in the columns to better support analysis. This includes changing to datetime, string, binary, and categorical data.

Change pet_name to the pandas string dtype so it is ready to work with later.

In [11]:
# Report the dtype of pet_name before the conversion
name_dtype_before = muddy_p_df['pet_name'].dtypes
print(f"Old type:  {name_dtype_before}")
# Cast pet_name to the pandas 'string' dtype
muddy_p_df['pet_name'] = muddy_p_df['pet_name'].astype('string')
# Report the dtype again to confirm the change
print(f"New type:  {muddy_p_df['pet_name'].dtypes}")
# Lower-case the names
muddy_p_df['pet_name'] = muddy_p_df['pet_name'].str.lower()

Old type:  object
New type:  string


Change date columns to datetime data type.

In [12]:
# Print the current dtype of each date column to see whether it is already datetime
for date_column in ('date_pet_entered_your_care', 'date_pet_entered_your_care_1', 'last_updated_at',
                    'adoption_foster_date', 'date_placed_in_current_location', 'date_of_birth'):
    print(f'The data type of {date_column} is:  {muddy_p_df[date_column].dtypes}')

The data type of date_pet_entered_your_care is:  object
The data type of date_pet_entered_your_care_1 is:  object
The data type of last_updated_at is:  object
The data type of adoption_foster_date is:  object
The data type of date_placed_in_current_location is:  object
The data type of date_of_birth is:  object


In [13]:
# Converting to datetime raised an error because one of the columns contained '02/08/-4711',
# so every '-' is removed from the date columns first.
date_columns = [
    'date_pet_entered_your_care',
    'date_pet_entered_your_care_1',
    'last_updated_at',
    'adoption_foster_date',
    'date_placed_in_current_location',
    'date_of_birth',
]
dash_free_dates = muddy_p_df[date_columns].replace({'-':''}, regex = True)
muddy_p_df[date_columns] = dash_free_dates

In [14]:
# The date columns are all objects at this point; convert them to datetime here

# Names of the columns to change to datetime
col = ['date_pet_entered_your_care', 'date_pet_entered_your_care_1', 'last_updated_at', 'adoption_foster_date',
       'date_placed_in_current_location', 'date_of_birth']

# Convert each listed column to datetime.
# One row had a value that could not be parsed (4711-02-08 00:00:00), so errors = 'coerce'
# is used to turn values like that into NaT instead of raising.
# The loop variable has its own name so the list `col` is not rebound to a string while iterating.
for date_col in col:
    muddy_p_df[date_col] = pd.to_datetime(muddy_p_df[date_col], errors = 'coerce')

In [15]:
# Print the dtype of each date column to verify that the datetime conversion took effect.
# Each entry is (wording that precedes the column name in the message, column name).
dtype_report = [
    ('the ', 'date_pet_entered_your_care'),
    ('the ', 'date_pet_entered_your_care_1'),
    ('the ', 'last_updated_at'),
    ('', 'adoption_foster_date'),
    ('', 'date_placed_in_current_location'),
    ('of ', 'date_of_birth'),
]
for lead_in, date_column in dtype_report:
    print(f'The data type of {lead_in}{date_column} column is now:  {muddy_p_df[date_column].dtypes}')

The data type of the date_pet_entered_your_care column is now:  datetime64[ns]
The data type of the date_pet_entered_your_care_1 column is now:  datetime64[ns]
The data type of the last_updated_at column is now:  datetime64[ns]
The data type of adoption_foster_date column is now:  datetime64[ns]
The data type of date_placed_in_current_location column is now:  datetime64[ns]
The data type of of date_of_birth column is now:  datetime64[ns]


Adapt columns with yes/no responses to be binary. Some of them have 'not sure' values. I will be changing those to NaN.

In [16]:
# Replace 'not sure' responses with NaN.
# The pattern is anchored so only cells whose entire value is 'not sure' (ignoring
# surrounding whitespace) are blanked. With an unanchored pattern and a non-string
# replacement, pandas replaces the whole cell whenever the text merely contains
# 'not sure', which would wipe out free-text fields that happen to include the phrase.
muddy_p_df = muddy_p_df.replace({r'^\s*not sure\s*$': np.nan}, regex = True)

In [17]:
# Normalise missing entries (None / NA) to np.nan ahead of the yes/no -> binary conversion
muddy_p_df = muddy_p_df.fillna(np.nan)

In [18]:
# Columns with only two possible responses become binary.
# Any cell in the dataframe that is exactly 'no' becomes 0 and exactly 'yes' becomes 1.
yes_no_codes = {'no': 0, 'yes': 1}
muddy_p_df = muddy_p_df.replace(yes_no_codes)

Change columns with more than one string response (non-numeric data) to categorical datatype.

In [19]:
# the temperament column has various text data separated by ; and , sometimes

In [20]:
# Could optionally change gender to binary, but currently will leave as categorical.

In [21]:
# Columns that hold a limited set of categories are converted to the 'category' dtype.

# Names of the columns to change to categorical
col_cat = ['current_status', 'pet_type', 'pet_age', 'size', 'gender', 'shedding', 'coat_length', 'breed_type', 'spayed_neutered',
           'declawed_status', 'coat_pattern', 'events_attendance', 'where_was_pet_originally_found', 'coordinators',
           'type_of_intake']

# Convert all of them in a single call. The previous loop never used its loop variable
# and re-converted the whole list of columns on every iteration.
muddy_p_df[col_cat] = muddy_p_df[col_cat].astype('category')

### Making new columns from the pet_name column. It contains a lot of extra information that does not belong in a name field, but that I would still like to keep for later.

Prior to cleaning the miscellaneous data out of the pet_name column, we first need to make new columns to contain the relevant data. For instance, we need columns for courtesy posts and cross posts.

In [22]:
# Flag courtesy posts: 'yes' when the pet_name text contains 'courtesy', otherwise 'no'
muddy_p_df['courtesy_post'] = [
    'yes' if 'courtesy' in name else 'no'
    for name in muddy_p_df.pet_name
]

In [23]:
# Flag cross posts: 'yes' when the pet_name text contains the substring 'cross', otherwise 'no'
muddy_p_df['cross_post'] = [
    'yes' if 'cross' in name else 'no'
    for name in muddy_p_df.pet_name
]

Create and clean litter_name column

In [24]:
# Add a placeholder litter_name column (a single space in every row) as the third column.
# insert() raises if the column already exists, so this cell cannot be run twice on the same frame.
muddy_p_df.insert(loc = 2, column = 'litter_name', value = " ")

In [25]:
# Strings that indicate an animal is part of a litter
litter_list = ["'s", 'tx', 'ok', 'oklahoma', 'litter', 'kittens', 'kitten', 'pup', 'puppies', 'kitten', 'precious gems']
# Combine the markers into one alternation pattern and find the rows whose pet_name matches any of them
litter_pattern = '|'.join(litter_list)
has_litter_marker = muddy_p_df.pet_name.str.contains(litter_pattern, case = False)
# Copy pet_name into litter_name for those rows only
muddy_p_df.loc[has_litter_marker, 'litter_name'] = muddy_p_df.pet_name

In [26]:
# Some of the rows (such as bonita's litter) contain stray mis-encoded characters.
# Each of the three characters below is removed from litter_name wherever it occurs
# (regex = True makes the replacement apply to substrings, not only whole-cell matches).
muddy_p_df = muddy_p_df.replace({'litter_name': {'â':'', '€':'', '™':''}}, regex = True)

In [27]:
# Add a placeholder parent_of_litter column as the fourth column; it will identify animals
# that are the parent of a litter (usually indicated by 'mama' or 'mom')
muddy_p_df.insert(loc = 3, column = 'parent_of_litter', value = " ")

In [28]:
# Common ways to refer to parents.
# NOTE: this list is not used by the extraction below, which has its own hard-coded
# pattern (without 'pa' and 'ma').
litter_mama_list = ['mama', 'mom', 'mother', 'dad', 'papa', 'pa', 'ma', 'father']
# Pull the first parent word found in litter_name into parent_of_litter (NaN when none is found)
parent_word = muddy_p_df['litter_name'].str.extract(pat = '(mama|mom|dad|papa|father|mother)', expand = False)
muddy_p_df['parent_of_litter'] = parent_word

In [29]:
# The word "litter" is used later as a split point, which would lose "litter of 10" and
# "litter of 14". Rows containing those phrases are therefore replaced now with the
# strings '10' and '14' respectively.
for litter_size in ('10', '14'):
    is_sized_litter = muddy_p_df.litter_name.str.contains('litter of ' + litter_size)
    muddy_p_df.loc[is_sized_litter, 'litter_name'] = litter_size

In [30]:
# Marker lists used by the cells below to decide which litter_name rows get split.

# Markers for statuses such as "-forever foster" and "-adoption pending", plus litter names
litter_remove_list = [
    'litter: ', 'litter:', 'litter ', 'litter -', '- adop', '-adop', '- for', '-for',
    '- pend', '-pend', '- no',
    'litter ', "'s", 'kitten', 'kittens', '(missouri)', 'missouri pup', 'pup', 'puppy',
    'of the', '(okla', 'blue litter', 'boxer pup', 'pup',
    'collie', '11', 'authors', '(tx 13', '(three', 'texas litter', 'texas pup', 'rosie',
    'bonita', "joplin's",
    'precious gems',
]

# Litters separated from the pet name by spaces rather than punctuation
spaces_litter_list = [
    'blue litter', 'border litter', 'boxer pup', 'jewels pup', 'joplin', 'missouri pup',
    'texas litter', 'litter',
]

# The 'tx 13' litter, which was written inconsistently (i.e. 'alan of the tx 13' and 'penny - tx 13')
tx13_litter_list = ['- tx 13']

# Miscellaneous litter indications left over after the lists above
misc_litter_list = ["rosie's pup", "joplin's"]

# Leftover punctuation to clean up after the lists above
punc_remove_list = [')', "'"]

# Markers for 'courtesy post' and 'cross post'
court_cross = ['court', 'cross', '(court', '(cross', '(courtesy post)']

# The joplin's rock legends litter, which had some unusual punctuation
final_litter_list = ['joplin']

In [31]:
# For each marker in litter_remove_list: when a litter_name row contains the marker,
# split it on ':', '-', "'s" or ',' and keep only the FIRST piece (the text before the
# first delimiter). Rows without the marker are left untouched.
for marker in litter_remove_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split(":|-|'s|,", text)[0] if marker in text else text
    )

In [32]:
# For each marker in litter_remove_list: when a litter_name row contains the marker,
# split it on 'of the' or '(' and keep only the LAST piece (the text after the final
# delimiter). Rows without the marker are left untouched.
# The pattern is a raw string so that '\(' is not an invalid string escape sequence.
for marker in litter_remove_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split(r'of the|\(', text)[-1] if marker in text else text
    )

In [33]:
# For each marker in spaces_litter_list: when a litter_name row contains the marker,
# split it on the word 'litter' and keep only the FIRST piece (the text before it).
for marker in spaces_litter_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split('litter', text)[0] if marker in text else text
    )

In [34]:
# For each marker in tx13_litter_list: when a litter_name row contains the marker,
# split it on '-' and keep only the LAST piece (the text after the final dash).
for marker in tx13_litter_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split('-', text)[-1] if marker in text else text
    )

In [35]:
# For each marker in punc_remove_list: when a litter_name row contains the marker,
# split it on ')' and keep only the FIRST piece (the text before the parenthesis).
# The pattern is a raw string so that '\)' is not an invalid string escape sequence.
for marker in punc_remove_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split(r'\)', text)[0] if marker in text else text
    )

In [36]:
# For each marker in litter_remove_list: when a litter_name row contains the marker,
# split it on 'p ' (the letter p followed by a space) and keep only the FIRST piece.
for marker in litter_remove_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split("p ", text)[0] if marker in text else text
    )

In [37]:
# For each marker in court_cross: when a litter_name row contains the marker,
# split it on '(' or ':' and keep only the FIRST piece (the text before the delimiter).
# The pattern is a raw string so that '\(' is not an invalid string escape sequence.
for marker in court_cross:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split(r'\(|:', text)[0] if marker in text else text
    )

In [38]:
# Trim leading and trailing whitespace from every litter_name value
muddy_p_df['litter_name'] = muddy_p_df['litter_name'].str.strip()

In [39]:
# Remove any apostrophes that are still left in the litter_name column
muddy_p_df['litter_name'] = muddy_p_df['litter_name'].replace({"'":''}, regex = True)

In [40]:
# For each marker in final_litter_list: when a litter_name row contains the marker,
# split it on the letter 's' and keep only the FIRST piece (the text before the first 's').
for marker in final_litter_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split('s', text)[0] if marker in text else text
    )

In [41]:
# For each marker in punc_remove_list: when a litter_name row still contains the marker,
# split it on the letter 's' and keep only the FIRST piece (the text before the first 's').
for marker in punc_remove_list:
    muddy_p_df['litter_name'] = muddy_p_df['litter_name'].apply(
        lambda text: re.split('s', text)[0] if marker in text else text
    )

Create the other_name column (nicknames, "aka", and "formerly known as" names)

In [42]:
# Add a placeholder other_name column (a single space in every row) as the third column
muddy_p_df.insert(loc = 2, column = 'other_name', value = " ")

In [43]:
# Strings that indicate an animal has a nickname or a "formerly known as" name
other_name_list = ['aka', 'addie', '"gus"', 'bj', 'formerly known as', 'fka', '"ollie"', 'for short', 'pronounced',
                   'sal', 'sammy', 'mazi']
# Combine the markers into one alternation pattern and find the rows whose pet_name matches any of them
other_name_pattern = '|'.join(other_name_list)
has_other_name = muddy_p_df.pet_name.str.contains(other_name_pattern, regex = True, case = False)
# Copy pet_name into other_name for those rows only
muddy_p_df.loc[has_other_name, 'other_name'] = muddy_p_df.pet_name

### Clean pet_name column, now that litter_name, parent_of_litter, and other_name have been derived from it.

In [44]:
# Show every row when displaying the dataframe.
# The fully qualified option name is used because the abbreviated 'max_rows'
# also matches 'styler.render.max_rows' in newer pandas and raises an OptionError.
pd.set_option('display.max_rows', None)

In [45]:
# Marker lists used by the cells below to decide which pet_name rows get split.

# Status markers that appear to the right of the name (e.g. 'adoption pending', '(forever foster)').
# 'forever foster' and '(pend' are separate entries; a missing comma previously fused
# them into the single string 'forever foster(pend'.
remove_rightstat_list = ['(for', '-for', '- for', 'for', 'forever foster', '(pend', '-pend', '- pend', '(adop', '-adop', '- adop', '(no', '-no', '- no',
                    'court', 'cross', 'asop', '/pend', '- court', 'med', '/adop']
# Markers for text (usually litter names) that sits to the left of the name
remove_left_petname_list = ['litter', '(missouri) -','kittens', 'pup', "'s", 'border', 'divas', "'mummy'", 'guinea', 'gems',
                           'louisiana', 'tote', '11', 'skittles', 'clause', 'kitten', 'tuna', '-tx 13', '"', 'gecko: ']
# Markers for other text that sits to the right of the name
remove_right_petname_list = ['oklahoma', 'of the tx 13', '(', '- tx 13', 'chi', 'mother']

In [46]:
# Some of the rows contain stray mis-encoded characters; remove them from pet_name wherever
# they occur (regex = True makes the replacement apply to substrings).
# NOTE(review): as written here, the last mapping appears to replace '-' with an identical '-',
# which is a no-op - possibly the key was originally a different dash character; confirm.
muddy_p_df = muddy_p_df.replace({'pet_name': {'â':'', '€':'', '™':'', '-':'-'}}, regex = True)

In [47]:
# Parenthesised tags that were throwing off the splitting loops below, so they are removed up front.
# The patterns are raw strings so that '\(' and '\)' are not invalid string escape sequences.
random_edits = [r'\(kitten\)', r'\(chihuahua\)', r'\(yorkie puppy\)',  r'\(oklahoma lab litter\)']
# Remove every occurrence of any of the tags from pet_name, then trim surrounding whitespace
muddy_p_df.pet_name = muddy_p_df.pet_name.str.replace('|'.join(random_edits), '', regex = True).str.strip()

In [48]:
# Remove statuses that sit to the right of the name to keep.
# Statuses include things such as 'adoption pending' and '(forever foster)'.
# For each marker in remove_rightstat_list: when a pet_name row contains the marker, split it
# on the status delimiters below and keep only the FIRST piece (the text left of the status).
# The pattern is a raw string so that '\(' and '\/' are not invalid string escape sequences.
for marker in remove_rightstat_list:
    muddy_p_df['pet_name'] = muddy_p_df['pet_name'].apply(
        lambda text: re.split(r'\(|//|-adop|- ado|- no|- pend|- cou|\/adop|- for', text)[0] if marker in text else text
    )

In [49]:
# Remove a more general selection of text (such as the litter name) from the right side of the name.
# For each marker in remove_right_petname_list: when a pet_name row contains the marker, split it
# on the delimiters below and keep only the FIRST piece (the text left of the delimiter).
for marker in remove_right_petname_list:
    muddy_p_df['pet_name'] = muddy_p_df['pet_name'].apply(
        lambda text: re.split('of the|,|med| - mo, |tx| "addie"', text)[0] if marker in text else text
    )

In [50]:
# Remove a more general selection of text (usually litter names) from the left side of the name.
# For each marker in remove_left_petname_list: when a pet_name row contains the marker, split it
# on the delimiters below and keep only the LAST piece (the text right of the final delimiter).
for marker in remove_left_petname_list:
    muddy_p_df['pet_name'] = muddy_p_df['pet_name'].apply(
        lambda text: re.split(":|'s|litter -|puppy|litter|pup|tunas -|'mummy'", text)[-1] if marker in text else text
    )

I often had to repeat for loops using different splitting characters because certain characters needed to be removed before others. Otherwise, I would remove things that I did not wish to remove.

In [51]:
# Second pass with remove_left_petname_list, this time with a different delimiter.
# For each marker: when a pet_name row contains the marker, split it on ') -' and keep
# only the LAST piece (the text right of the final delimiter).
# The pattern is a raw string so that '\)' is not an invalid string escape sequence.
for marker in remove_left_petname_list:
    muddy_p_df['pet_name'] = muddy_p_df['pet_name'].apply(
        lambda text: re.split(r'\) -', text)[-1] if marker in text else text
    )

In [52]:
# Trim leading and trailing whitespace from every pet_name value
muddy_p_df['pet_name'] = muddy_p_df['pet_name'].str.strip()

Here, I got really tired of dealing with the very finicky ordering of the for loops and just scrolled through looking for whatever characters were left. It was starting to become a really big time sink, so I sacrificed elegance for speed.

In [53]:
# Leftover strings that the splitting loops did not pick up. Each entry is used as a regular
# expression, so entries containing backslashes are raw strings (avoiding invalid string
# escape sequences); the patterns themselves are unchanged.
last_edits = ['"addie"', '"gus"', "'", '-', '"artie"', r'\(addy\)', r'\(mazi\)', r' \(aussie/border collie\)', r' \(yorkie\)--',
             ' ,no longer taking applications', '. fka lacy', '"cash"', ' pending', 'asoption', 'aka clark', ' " doogie"',
             '/pending', ', ', ' "trooper"', ' longer accepting applications', 'pending adoption', r'\)', '"ollie"',
             '   mother of aladdin', '  medical hold until 9 weeks', ' /', ':  terrier/poodle']
# Remove every occurrence of any of those strings from pet_name, then trim surrounding whitespace
muddy_p_df.pet_name = muddy_p_df.pet_name.str.replace('|'.join(last_edits), '', regex = True).str.strip()

In [54]:
# Hopefully the very last list needed for the pet_name column
last_edits = ['(']

# For each marker: when a pet_name row contains the marker, split it on '(' and keep
# only the FIRST piece (the text before the parenthesis).
# The pattern is a raw string so that '\(' is not an invalid string escape sequence.
for marker in last_edits:
    muddy_p_df['pet_name'] = muddy_p_df['pet_name'].apply(
        lambda text: re.split(r'\(', text)[0] if marker in text else text
    )

In [55]:
# Trim leading and trailing whitespace from every pet_name value
muddy_p_df['pet_name'] = muddy_p_df['pet_name'].str.strip()

I think there's one blank name in there, but it's not worth going back and arranging all the for loops again. This is about as good as the pet_name column is gonna get.

In [56]:
# Trim leading and trailing whitespace from every pet_name value once more
muddy_p_df['pet_name'] = muddy_p_df['pet_name'].str.strip()

Quickly changing the courtesy_post and cross_post columns to binary.

In [57]:
# Change the courtesy_post and cross_post flags to binary: no=0, yes=1.
# The replacement is limited to these two columns. Replacing across the whole dataframe
# would also rewrite any other cell that is exactly 'yes' or 'no' - for example a
# cleaned pet_name, litter_name, or other_name value.
post_flag_columns = ['courtesy_post', 'cross_post']
muddy_p_df[post_flag_columns] = muddy_p_df[post_flag_columns].replace({'no': 0, 'yes': 1})

### End of Part 1

In [58]:
# Save the cleaned dataframe from Part 1 to an Excel workbook
output_workbook = 'part_1_mp_cleaning.xlsx'
muddy_p_df.to_excel(output_workbook)