In [1]:
import sklearn
from sklearn import datasets
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt

In [2]:
intakes = pd.read_csv('data/Austin_Animal_Center_Intakes.csv',parse_dates=['DateTime','MonthYear'])
intakes.columns = intakes.columns.str.replace(" ","_")

In [3]:
outcomes = pd.read_csv('data/Austin_Animal_Center_Outcomes.csv',parse_dates=['DateTime','MonthYear','Date of Birth'])
outcomes.columns = outcomes.columns.str.replace(' ','_')

In [4]:
col_intersect = sorted(list(set(intakes.columns).intersection(set(outcomes.columns))))
col_intersect.remove("DateTime")
col_intersect.remove("MonthYear")

In [5]:
df = pd.merge(intakes, outcomes, on=col_intersect,suffixes=('_in', '_out') )
df.sort_index(axis=1,inplace=True)

In [6]:
print("intakes:    ",len(intakes))
print("outcomes:   ",len(outcomes))
print("total rows: ", len(intakes)+len(outcomes))
print("unique #:   ", len(df))

intakes:     125929
outcomes:    126165
total rows:  252094
unique #:    162656


In [7]:
df

Unnamed: 0,Age_upon_Intake,Age_upon_Outcome,Animal_ID,Animal_Type,Breed,Color,DateTime_in,DateTime_out,Date_of_Birth,Found_Location,Intake_Condition,Intake_Type,MonthYear_in,MonthYear_out,Name,Outcome_Subtype,Outcome_Type,Sex_upon_Intake,Sex_upon_Outcome
0,2 years,2 years,A786884,Dog,Beagle Mix,Tricolor,2019-01-03 16:19:00,2019-01-08 15:11:00,2017-01-03,2501 Magin Meadow Dr in Austin (TX),Normal,Stray,2019-01-03 16:19:00,2019-01-08 15:11:00,*Brock,Partner,Transfer,Neutered Male,Neutered Male
1,8 years,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05 12:59:00,2015-07-05 15:13:00,2007-07-05,9409 Bluegrass Dr in Austin (TX),Normal,Stray,2015-07-05 12:59:00,2015-07-05 15:13:00,Belle,,Return to Owner,Spayed Female,Spayed Female
2,11 months,1 year,A724273,Dog,Basenji Mix,Sable/White,2016-04-14 18:43:00,2016-04-21 17:17:00,2015-04-17,2818 Palomino Trail in Austin (TX),Normal,Stray,2016-04-14 18:43:00,2016-04-21 17:17:00,Runster,,Return to Owner,Intact Male,Neutered Male
3,4 weeks,4 weeks,A665644,Cat,Domestic Shorthair Mix,Calico,2013-10-21 07:59:00,2013-10-21 11:39:00,2013-09-21,Austin (TX),Sick,Stray,2013-10-21 07:59:00,2013-10-21 11:39:00,,Partner,Transfer,Intact Female,Intact Female
4,4 years,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29 10:38:00,2014-07-02 14:16:00,2010-06-29,800 Grove Blvd in Austin (TX),Normal,Stray,2014-06-29 10:38:00,2014-07-02 14:16:00,Rio,,Return to Owner,Neutered Male,Neutered Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162651,2 years,,A834042,Other,Fox,Brown/White,2021-05-09 15:52:00,2021-05-09 16:40:00,2019-05-09,4809 Green Shore Cir in Travis (TX),Normal,Wildlife,2021-05-09 15:52:00,2021-05-09 16:40:00,,Rabies Risk,Euthanasia,Unknown,Unknown
162652,5 months,6 months,A832580,Dog,Labrador Retriever Mix,Black/White,2021-04-15 16:56:00,2021-05-05 17:30:00,2020-10-15,Austin (TX),Normal,Public Assist,2021-04-15 16:56:00,2021-05-05 17:30:00,*Cake,,Adoption,Intact Male,Neutered Male
162653,8 years,8 years,A821181,Dog,Great Pyrenees,White,2020-08-05 16:19:00,2020-08-12 12:58:00,2012-08-05,Travis (TX),Normal,Owner Surrender,2020-08-05 16:19:00,2020-08-12 12:58:00,Paloma,Partner,Transfer,Spayed Female,Spayed Female
162654,16 years,16 years,A814533,Dog,Boxer,White,2021-05-09 16:03:00,2021-05-09 18:28:00,2004-12-16,Manor Road And Overbrook Drive in Austin (TX),Sick,Public Assist,2021-05-09 16:03:00,2021-05-09 18:28:00,Avalanche,,Return to Owner,Intact Male,Intact Male


In [8]:
df.drop(index=df[df.Age_upon_Outcome.isna()==True].index, inplace=True)
#MONTHYEAR IS SAME AS DATETIME
df.drop(columns=['MonthYear_in', 'MonthYear_out'],inplace=True)

In [11]:
df['DateTime_in'][0]

Timestamp('2019-01-03 16:19:00')

In [10]:
# LOOK FOR AND SWITCH WHEN DATETIME_IN IS LATER THAN DATETIME_OUT
idx = (df.DateTime_out<df.DateTime_in)
#IF IN IS LATER THAN OUT, SWTICH
df.loc[idx,['DateTime_out','DateTime_in']] = df.loc[idx,['DateTime_in','DateTime_out']].astype('datetime64').astype(int).astype(float)
#df.loc[idx,['DateTime_out','DateTime_in']] = df.loc[idx,['DateTime_in','DateTime_out']].astype('datetime64').astype(int).astype(float).values
# df.loc[idx,['DateTime_out','DateTime_in']] = df.loc[idx,['DateTime_out','DateTime_in']].astype('datetime64')
df['DateTime_in'] = pd.to_datetime(df['DateTime_in'],errors='coerce')
df['DateTime_out'] = pd.to_datetime(df['DateTime_out'],errors='coerce')

#AGES APPEAR TO BE SWITCHED FOR ALL OF THESE OBSERVATIONS AS WELL. SWITCH THESE TWO COLUMNS
df.loc[idx,['Age_upon_Outcome', 'Age_upon_Intake']] = df.loc[idx,['Age_upon_Intake', 'Age_upon_Outcome']].values

In [12]:
# LOOK FOR WHEN DATETIME_IN IS EARLIER THAN DATE_OF_BIRTH
wrong_dob = (df.Date_of_Birth > df.DateTime_in)
#likely reasoning is DateTime_in is when the mother was brought into the center and the DOB is when the babies were born
#given this assumption, change all DateTime_in to DOB 
df.loc[wrong_dob, ['DateTime_in']] = df.loc[wrong_dob, ['Date_of_Birth']]

In [13]:
# CREATE A TRUE DURATION IN DAYS FOR IN/OUT 
df['Duration_Days']= (df.DateTime_out - df.DateTime_in).dt.days 

In [14]:
#GIVEN AGES AREN'T RELIABLE. 
#CALCULATE AGE UPON INTAKE USING DATE OF BIRTH AND DATETIMEIN
#CALCULATE AGE UPON OUTCOME USING DATE OF BIRTH AND DATETIMEOUT
df['Calc_Age_In'] = round((df['DateTime_in'] - df['Date_of_Birth'])/ np.timedelta64(1, 'Y'),1)
df['Calc_Age_Out'] = round((df['DateTime_out'] - df['Date_of_Birth'])/ np.timedelta64(1, 'Y'),1)

### if datetime isn't working
#df['Calc_Age_Out'] = round((pd.to_datetime(df['DateTime_out']) - pd.to_datetime(df['Date_of_Birth'])).dt.days/365,1)
#df['Calc_Age_In'] = round((pd.to_datetime(df['DateTime_in']) - pd.to_datetime(df['Date_of_Birth'])).dt.days/365,1)

In [15]:
#Create Alternative to Altered Status + Sex in each intake and outcome 
df['Altered'] = df['Sex_upon_Intake']!=df['Sex_upon_Outcome']
df['Sex'] = df.Sex_upon_Intake.str.split(" ").str[-1]
#Can drop sex_upon_ columns 

In [16]:
#Create Is Mixed Column
df['Mixed']= df.Breed.str.contains("Mix|/")

In [17]:
breed = pd.DataFrame(df[['Breed','Mixed']])
breed['Without_Mix'] = breed["Breed"].str.replace("Mix","")

df['Primary_Breed'] = breed["Without_Mix"].str.split("/", n = 2, expand = True)[0].str.rstrip()
df['Secondary_Breed'] = breed["Without_Mix"].str.split("/", n = 2, expand = True)[1].str.rstrip()
df['Tertiary_Breed'] = breed["Without_Mix"].str.split("/", n = 2, expand = True)[2].str.rstrip()


In [18]:
df['Month_In'] = pd.to_datetime(df['DateTime_in']).dt.month_name()
df['Month_Out'] = df['DateTime_out'].dt.month_name()

df['Day_In'] = df['DateTime_in'].dt.day_name()
df['WeekDay_In'] = df['DateTime_in'].dt.weekday

In [19]:
df['Intake_Condition'].unique()

array(['Normal', 'Sick', 'Injured', 'Nursing', 'Aged', 'Medical', 'Other',
       'Feral', 'Pregnant', 'Behavior'], dtype=object)

In [20]:
df['Intake_Type'].unique()

array(['Stray', 'Owner Surrender', 'Public Assist', 'Wildlife',
       'Euthanasia Request', 'Abandoned'], dtype=object)

In [21]:
df.columns

Index(['Age_upon_Intake', 'Age_upon_Outcome', 'Animal_ID', 'Animal_Type',
       'Breed', 'Color', 'DateTime_in', 'DateTime_out', 'Date_of_Birth',
       'Found_Location', 'Intake_Condition', 'Intake_Type', 'Name',
       'Outcome_Subtype', 'Outcome_Type', 'Sex_upon_Intake',
       'Sex_upon_Outcome', 'Duration_Days', 'Calc_Age_In', 'Calc_Age_Out',
       'Altered', 'Sex', 'Mixed', 'Primary_Breed', 'Secondary_Breed',
       'Tertiary_Breed', 'Month_In', 'Month_Out', 'Day_In', 'WeekDay_In'],
      dtype='object')

In [22]:
basic = df[['Animal_Type','Calc_Age_In','Calc_Age_Out','Color','Breed', 'Duration_Days','Altered', 'Sex','Mixed']]

In [23]:
basic

Unnamed: 0,Animal_Type,Calc_Age_In,Calc_Age_Out,Color,Breed,Duration_Days,Altered,Sex,Mixed
0,Dog,2.0,2.0,Tricolor,Beagle Mix,4.0,False,Male,True
1,Dog,8.0,8.0,White/Liver,English Springer Spaniel,0.0,False,Female,False
2,Dog,1.0,1.0,Sable/White,Basenji Mix,6.0,True,Male,True
3,Cat,0.1,0.1,Calico,Domestic Shorthair Mix,0.0,False,Female,True
4,Dog,4.0,4.0,Tan/Gray,Doberman Pinsch/Australian Cattle Dog,3.0,False,Male,True
...,...,...,...,...,...,...,...,...,...
162649,Dog,1.5,1.5,Black/White,American Staffordshire Terrier,0.0,False,Female,False
162650,Dog,1.1,1.1,Tan,Black Mouth Cur,0.0,False,Male,False
162652,Dog,0.5,0.6,Black/White,Labrador Retriever Mix,20.0,True,Male,True
162653,Dog,8.0,8.0,White,Great Pyrenees,6.0,False,Female,False


In [24]:
dog_basic = basic[basic['Animal_Type']=='Dog']

In [25]:
dog_basic.drop(columns='Animal_Type',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [26]:
dog_basic

Unnamed: 0,Calc_Age_In,Calc_Age_Out,Color,Breed,Duration_Days,Altered,Sex,Mixed
0,2.0,2.0,Tricolor,Beagle Mix,4.0,False,Male,True
1,8.0,8.0,White/Liver,English Springer Spaniel,0.0,False,Female,False
2,1.0,1.0,Sable/White,Basenji Mix,6.0,True,Male,True
4,4.0,4.0,Tan/Gray,Doberman Pinsch/Australian Cattle Dog,3.0,False,Male,True
5,2.0,2.0,Chocolate,Labrador Retriever Mix,3.0,False,Male,True
...,...,...,...,...,...,...,...,...
162649,1.5,1.5,Black/White,American Staffordshire Terrier,0.0,False,Female,False
162650,1.1,1.1,Tan,Black Mouth Cur,0.0,False,Male,False
162652,0.5,0.6,Black/White,Labrador Retriever Mix,20.0,True,Male,True
162653,8.0,8.0,White,Great Pyrenees,6.0,False,Female,False


In [27]:
df[['Primary_Breed','Secondary_Breed','Tertiary_Breed']]

Unnamed: 0,Primary_Breed,Secondary_Breed,Tertiary_Breed
0,Beagle,,
1,English Springer Spaniel,,
2,Basenji,,
3,Domestic Shorthair,,
4,Doberman Pinsch,Australian Cattle Dog,
...,...,...,...
162649,American Staffordshire Terrier,,
162650,Black Mouth Cur,,
162652,Labrador Retriever,,
162653,Great Pyrenees,,


In [None]:
df[['Animal_Type','Calc_Age_In','Calc_Age_Out','Color','Breed', 'Duration_Days','Altered', 'Sex','Mixed']]

### All Dog Breeds

In [28]:
df[df.Animal_Type=="Dog"][['Primary_Breed','Secondary_Breed','Tertiary_Breed']]

Unnamed: 0,Primary_Breed,Secondary_Breed,Tertiary_Breed
0,Beagle,,
1,English Springer Spaniel,,
2,Basenji,,
4,Doberman Pinsch,Australian Cattle Dog,
5,Labrador Retriever,,
...,...,...,...
162649,American Staffordshire Terrier,,
162650,Black Mouth Cur,,
162652,Labrador Retriever,,
162653,Great Pyrenees,,


In [29]:
all_dog = df[df.Animal_Type=="Dog"]

In [30]:
all_dog[['Primary_Breed','Secondary_Breed','Tertiary_Breed']]

Unnamed: 0,Primary_Breed,Secondary_Breed,Tertiary_Breed
0,Beagle,,
1,English Springer Spaniel,,
2,Basenji,,
4,Doberman Pinsch,Australian Cattle Dog,
5,Labrador Retriever,,
...,...,...,...
162649,American Staffordshire Terrier,,
162650,Black Mouth Cur,,
162652,Labrador Retriever,,
162653,Great Pyrenees,,


In [32]:
all_dog['Secondary_Breed'].isna().sum()

86433

In [33]:
all_dog['Tertiary_Breed'].isna().sum()

103517

In [34]:
all_dog['Primary_Breed'].isna().sum()

0

In [83]:
sorted(all_dog["Primary_Breed"].unique())

['Affenpinscher',
 'Afghan Hound',
 'Airedale Terrier',
 'Akbash',
 'Akita',
 'Alaskan Husky',
 'Alaskan Klee Kai',
 'Alaskan Malamute',
 'American Bulldog',
 'American Eskimo',
 'American Foxhound',
 'American Pit Bull Terrier',
 'American Staffordshire Terrier',
 'Anatol Shepherd',
 'Australian Cattle Dog',
 'Australian Kelpie',
 'Australian Shepherd',
 'Australian Terrier',
 'Basenji',
 'Basset Hound',
 'Beagle',
 'Bearded Collie',
 'Beauceron',
 'Bedlington Terr',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Belgian Tervuren',
 'Bernese Mountain Dog',
 'Bichon Frise',
 'Black',
 'Black Mouth Cur',
 'Bloodhound',
 'Blue Lacy',
 'Bluetick Hound',
 'Boerboel',
 'Border Collie',
 'Border Terrier',
 'Boston Terrier',
 'Bouv Flandres',
 'Boxer',
 'Boykin Span',
 'Briard',
 'Brittany',
 'Bruss Griffon',
 'Bull Terrier',
 'Bull Terrier Miniature',
 'Bulldog',
 'Bullmastiff',
 'Cairn Terrier',
 'Canaan Dog',
 'Cane Corso',
 'Cardigan Welsh Corgi',
 'Carolina Dog',
 'Catahoula',
 'Cavalier Sp

In [35]:
dogbreeds = sorted(list(set().union(all_dog['Primary_Breed'].dropna().unique(), all_dog['Secondary_Breed'].dropna().unique(), all_dog['Tertiary_Breed'].dropna().unique())))

In [36]:
len(dogbreeds)

209

In [145]:
len(breeds)

206

In [132]:
len(dogbreeds)

209

In [128]:
len(dogbreeds)

209

In [None]:
list(set(chain.from_iterable()))

In [40]:
[i for i in list(all_dog['Tertiary_Breed']) if i not in dogbreeds]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [88]:
dog_basic.drop(index=dog_basic[dog_basic['Sex'].isna()==True].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [44]:
bas_df = dog_basic[['Calc_Age_In','Duration_Days','Sex']]

In [58]:
all_dog.columns

Index(['Age_upon_Intake', 'Age_upon_Outcome', 'Animal_ID', 'Animal_Type',
       'Breed', 'Color', 'DateTime_in', 'DateTime_out', 'Date_of_Birth',
       'Found_Location', 'Intake_Condition', 'Intake_Type', 'Name',
       'Outcome_Subtype', 'Outcome_Type', 'Sex_upon_Intake',
       'Sex_upon_Outcome', 'Duration_Days', 'Calc_Age_In', 'Calc_Age_Out',
       'Altered', 'Sex', 'Mixed', 'Primary_Breed', 'Secondary_Breed',
       'Tertiary_Breed', 'Month_In', 'Month_Out', 'Day_In', 'WeekDay_In'],
      dtype='object')

In [59]:
bas_df = all_dog[['Calc_Age_In','Duration_Days','Sex','Primary_Breed', 'Secondary_Breed', 'Tertiary_Breed']]

In [93]:
bas_df['Calc_Age_In']

0          2.0
1          8.0
2          1.0
4          4.0
5          2.0
          ... 
162649     1.5
162650     1.1
162652     0.5
162653     8.0
162654    16.4
Name: Calc_Age_In, Length: 103573, dtype: float64

In [96]:
bas_df['Duration_Days'].round()

0          4.0
1          0.0
2          6.0
4          3.0
5          3.0
          ... 
162649     0.0
162650     0.0
162652    20.0
162653     6.0
162654     0.0
Name: Duration_Days, Length: 103573, dtype: float64

In [97]:
y = pd.DataFrame(bas_df['Duration_Days'])
X = bas_df.drop("Duration_Days", axis=1)

In [98]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm

In [107]:
lm = LinearRegression()

In [108]:
lm.fit(X,y)

ValueError: could not convert string to float: 'Male'

In [103]:
X_cons = sn.add_constant(X)

In [105]:
lm = sn.OLS(,X_cons.astype(float)).fit()

ValueError: could not convert string to float: 'Male'

In [205]:
y = dog_basic['Duration_Days']
X = dog_basic.drop("Duration_Days", axis=1)

In [208]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm

In [89]:
all_dog['Color_1'] = all_dog["Color"].str.split("/", n = 1, expand = True)[0]
all_dog['Color_2'] = all_dog["Color"].str.split("/", n = 1, expand = True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_dog['Color_1'] = all_dog["Color"].str.split("/", n = 1, expand = True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_dog['Color_2'] = all_dog["Color"].str.split("/", n = 1, expand = True)[1]


In [90]:
all_dog.columns

Index(['Age_upon_Intake', 'Age_upon_Outcome', 'Animal_ID', 'Animal_Type',
       'Breed', 'Color', 'DateTime_in', 'DateTime_out', 'Date_of_Birth',
       'Found_Location', 'Intake_Condition', 'Intake_Type', 'Name',
       'Outcome_Subtype', 'Outcome_Type', 'Sex_upon_Intake',
       'Sex_upon_Outcome', 'Duration_Days', 'Calc_Age_In', 'Calc_Age_Out',
       'Altered', 'Sex', 'Mixed', 'Primary_Breed', 'Secondary_Breed',
       'Tertiary_Breed', 'Month_In', 'Month_Out', 'Day_In', 'WeekDay_In',
       'Color_1', 'Color_2'],
      dtype='object')

In [92]:
all_dog.drop(columns=['Animal_ID', 'Animal_Type', 'Found_Location','Name'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [93]:
all_dog

Unnamed: 0,Age_upon_Intake,Age_upon_Outcome,Breed,Color,DateTime_in,DateTime_out,Date_of_Birth,Intake_Condition,Intake_Type,Outcome_Subtype,...,Mixed,Primary_Breed,Secondary_Breed,Tertiary_Breed,Month_In,Month_Out,Day_In,WeekDay_In,Color_1,Color_2
0,2 years,2 years,Beagle Mix,Tricolor,2019-01-03 16:19:00,2019-01-08 15:11:00,2017-01-03,Normal,Stray,Partner,...,True,Beagle,,,January,January,Thursday,3.0,Tricolor,
1,8 years,8 years,English Springer Spaniel,White/Liver,2015-07-05 12:59:00,2015-07-05 15:13:00,2007-07-05,Normal,Stray,,...,False,English Springer Spaniel,,,July,July,Sunday,6.0,White,Liver
2,11 months,1 year,Basenji Mix,Sable/White,2016-04-14 18:43:00,2016-04-21 17:17:00,2015-04-17,Normal,Stray,,...,True,Basenji,,,April,April,Thursday,3.0,Sable,White
4,4 years,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29 10:38:00,2014-07-02 14:16:00,2010-06-29,Normal,Stray,,...,True,Doberman Pinsch,Australian Cattle Dog,,June,July,Sunday,6.0,Tan,Gray
5,2 years,2 years,Labrador Retriever Mix,Chocolate,2017-02-18 12:46:00,2017-02-21 17:44:00,2015-02-18,Normal,Owner Surrender,,...,True,Labrador Retriever,,,February,February,Saturday,5.0,Chocolate,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162649,1 year,1 year,American Staffordshire Terrier,Black/White,2021-05-09 08:59:00,2021-05-09 11:57:00,2019-11-09,Normal,Stray,,...,False,American Staffordshire Terrier,,,May,May,Sunday,6.0,Black,White
162650,1 year,1 year,Black Mouth Cur,Tan,2021-05-08 17:51:00,2021-05-09 14:39:00,2020-03-21,Normal,Stray,,...,False,Black Mouth Cur,,,May,May,Saturday,5.0,Tan,
162652,5 months,6 months,Labrador Retriever Mix,Black/White,2021-04-15 16:56:00,2021-05-05 17:30:00,2020-10-15,Normal,Public Assist,,...,True,Labrador Retriever,,,April,May,Thursday,3.0,Black,White
162653,8 years,8 years,Great Pyrenees,White,2020-08-05 16:19:00,2020-08-12 12:58:00,2012-08-05,Normal,Owner Surrender,Partner,...,False,Great Pyrenees,,,August,August,Wednesday,2.0,White,


In [176]:
all_dog['Color_A'] = [i.split()[0] for i in all_dog['Color_1']]

In [182]:
all_dog['Color_A'] = all_dog["Color_1"].str.split(n = 1, expand = True)[0]
all_dog['Color_B'] = all_dog["Color_2"].str.split(n = 1, expand = True)[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_dog['Color_A'] = all_dog["Color_1"].str.split(n = 1, expand = True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_dog['Color_B'] = all_dog["Color_2"].str.split(n = 1, expand = True)[0]


In [193]:
basic_dog_colors = set(all_dog['Color_A'].unique()).union(set(all_dog['Color_B'].dropna().unique()))

In [189]:
sorted(all_dog['Color_B'].dropna().unique())

['Apricot',
 'Black',
 'Blue',
 'Brown',
 'Buff',
 'Chocolate',
 'Cream',
 'Fawn',
 'Gold',
 'Gray',
 'Liver',
 'Orange',
 'Pink',
 'Red',
 'Silver',
 'Tan',
 'Tortie',
 'Tricolor',
 'White',
 'Yellow']

In [183]:
all_dog

Unnamed: 0,Age_upon_Intake,Age_upon_Outcome,Animal_ID,Animal_Type,Breed,Color,DateTime_in,DateTime_out,Date_of_Birth,Found_Location,...,Altered,Sex,Mixed,Primary_Breed,Secondary_Breed,Tertiary_Breed,Color_1,Color_2,Color_A,Color_B
0,2 years,2 years,A786884,Dog,Beagle Mix,Tricolor,2019-01-03 16:19:00,2019-01-08 15:11:00,2017-01-03,2501 Magin Meadow Dr in Austin (TX),...,False,Male,True,Beagle,,,Tricolor,,Tricolor,
1,8 years,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05 12:59:00,2015-07-05 15:13:00,2007-07-05,9409 Bluegrass Dr in Austin (TX),...,False,Female,False,English Springer Spaniel,,,White,Liver,White,Liver
2,11 months,1 year,A724273,Dog,Basenji Mix,Sable/White,2016-04-14 18:43:00,2016-04-21 17:17:00,2015-04-17,2818 Palomino Trail in Austin (TX),...,True,Male,True,Basenji,,,Sable,White,Sable,White
4,4 years,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29 10:38:00,2014-07-02 14:16:00,2010-06-29,800 Grove Blvd in Austin (TX),...,False,Male,True,Doberman Pinsch,Australian Cattle Dog,,Tan,Gray,Tan,Gray
5,2 years,2 years,A743852,Dog,Labrador Retriever Mix,Chocolate,2017-02-18 12:46:00,2017-02-21 17:44:00,2015-02-18,Austin (TX),...,False,Male,True,Labrador Retriever,,,Chocolate,,Chocolate,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159723,2 months,3 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-11-14 13:39:00,2021-01-11 16:55:00,2020-09-13,Teri Road And Pleasant Valley Road in Austin (TX),...,True,Male,True,Bull Terrier,,,Brown Brindle,White,Brown,White
159724,3 months,2 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-12-23 13:10:00,2020-11-25 15:23:00,2020-09-13,Austin (TX),...,False,Male,True,Bull Terrier,,,Brown Brindle,White,Brown,White
159725,3 months,3 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-12-23 13:10:00,2021-01-11 16:55:00,2020-09-13,Austin (TX),...,False,Male,True,Bull Terrier,,,Brown Brindle,White,Brown,White
159726,2 years,2 years,A828968,Dog,Labrador Retriever,Black,2021-01-30 08:14:00,2021-01-30 12:11:00,2019-01-30,1111 Bastrop Highway in Austin (TX),...,False,Female,False,Labrador Retriever,,,Black,,Black,


In [161]:
detail_colors = set(all_dog["Color"].str.split("/", n = 1, expand = True)[0].dropna().unique()).union(set(all_dog["Color"].str.split("/", n = 1, expand = True)[1].dropna().unique()))

In [174]:
set([i.split()[0] for i in detail_colors])

['Tricolor',
 'White',
 'Sable',
 'Tan',
 'Chocolate',
 'Black',
 'Black',
 'White',
 'Brown',
 'Tricolor',
 'Tan',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'White',
 'White',
 'Brown',
 'Black',
 'Tan',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'Black',
 'Brown',
 'Tan',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Black',
 'Chocolate',
 'Chocolate',
 'Chocolate',
 'Chocolate',
 'Tan',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'White',
 'Black',
 'Brown',
 'Sable',
 'White',
 'Black',
 'Brown',
 'Brown',
 'Brown',
 'Brown',
 'Br

In [None]:
[i.split()[0] for i in l]

In [None]:
# new data frame with split value columns 
new_col = aus["Color"].str.split("/", n = 1, expand = True) 
# making separate first name column from new data frame 
aus["Color_A"]= new_col[0]  
# making separate last name column from new data frame 
aus["Color_B"]= new_col[1] 

do a column for AM/PM

In [76]:
df[df['Breed'].str.contains("/|-")]

Unnamed: 0,Age_upon_Intake,Age_upon_Outcome,Animal_ID,Animal_Type,Breed,Calc_Age_In,Calc_Age_Out,Color,DateTime_in,DateTime_out,...,Duration_Days,Found_Location,Intake_Condition,Intake_Type,Name,Outcome_Subtype,Outcome_Type,Sex_upon_Intake,Sex_upon_Outcome,Mixed
4,4 years,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,4.0,4.0,Tan/Gray,2014-06-29 10:38:00,2014-07-02 14:16:00,...,3,800 Grove Blvd in Austin (TX),Normal,Stray,Rio,,Return to Owner,Neutered Male,Neutered Male,True
24,2 months,2 months,A697950,Dog,Australian Cattle Dog/Labrador Retriever,0.2,0.2,Tan/White,2015-03-04 11:22:00,2015-03-08 18:55:00,...,4,1501 S Fm 973 in Austin (TX),Normal,Stray,,,Adoption,Intact Female,Spayed Female,True
114,6 months,1 year,A772747,Dog,Pit Bull/Australian Cattle Dog,0.5,1.1,White,2018-05-29 16:53:00,2019-01-07 13:41:00,...,222,Thaxton And Sassman in Austin (TX),Normal,Stray,Lamar,,Adoption,Neutered Male,Neutered Male,True
115,1 year,1 year,A772747,Dog,Pit Bull/Australian Cattle Dog,1.1,1.3,White,2019-01-07 13:41:00,2019-02-28 14:43:00,...,52,Thaxton And Sassman in Austin (TX),Normal,Stray,Lamar,,Adoption,Neutered Male,Neutered Male,True
116,1 year,2 years,A772747,Dog,Pit Bull/Australian Cattle Dog,1.1,2.9,White,2019-01-07 13:41:00,2020-10-04 15:46:00,...,636,Thaxton And Sassman in Austin (TX),Normal,Stray,Lamar,,Rto-Adopt,Neutered Male,Neutered Male,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159672,2 years,2 years,A828706,Dog,Australian Shepherd/Border Collie,2.0,2.0,Black/White,2021-01-24 13:31:00,2021-01-30 13:50:00,...,6,9619 Braeburn Glen in Austin (TX),Normal,Stray,*Belvidere,,Adoption,Intact Male,Neutered Male,True
159673,2 years,2 years,A828709,Dog,Australian Cattle Dog/Great Pyrenees,2.0,2.0,Tan/White,2021-01-24 15:21:00,2021-01-30 14:46:00,...,5,3302 Caleb Drive in Austin (TX),Normal,Stray,*Ripley,,Adoption,Intact Male,Neutered Male,True
159674,2 years,2 years,A824483,Dog,Labrador Retriever/Pit Bull,2.0,2.0,Brown Brindle/White,2020-10-19 15:39:00,2020-10-26 19:31:00,...,7,15405 Long Vista Drive in Austin (TX),Normal,Stray,Ichabod,,Adoption,Intact Male,Neutered Male,True
159695,1 year,1 year,A828879,Dog,Labrador Retriever/Pit Bull,1.0,1.0,Black/Brown,2021-01-27 18:43:00,2021-01-28 15:23:00,...,0,8220 West Sh 71 in Austin (TX),Normal,Stray,Zoe,,Return to Owner,Intact Female,Intact Female,True


In [149]:
# new data frame with split value columns 
new_col = aus["Color"].str.split("/", n = 1, expand = True) 
# making separate first name column from new data frame 
aus["Color_A"]= new_col[0]  
# making separate last name column from new data frame 
aus["Color_B"]= new_col[1] 

NameError: name 'aus' is not defined

In [141]:
df['Color']

0                    Tricolor
1                 White/Liver
2                 Sable/White
3                      Calico
4                    Tan/Gray
                 ...         
159723    Brown Brindle/White
159724    Brown Brindle/White
159725    Brown Brindle/White
159726                  Black
159727            Black/White
Name: Color, Length: 159727, dtype: object

In [34]:
df.sort_index(axis=1,inplace=True)

In [187]:
dog = df[df.Animal_Type=="Dog"]

In [188]:
dog.Outcome_Type.value_counts()

Adoption           47176
Return to Owner    33052
Transfer           17768
Euthanasia          2080
Rto-Adopt           1152
Died                 280
Missing               51
Disposal              39
Name: Outcome_Type, dtype: int64

In [None]:
dog

In [191]:
sorted(dog.Calc_Out_Age.unique())

[-3.1,
 -2.2,
 -1.7,
 -1.4,
 -1.1,
 -1.0,
 -0.8,
 -0.6,
 -0.3,
 0.0,
 0.1,
 0.2,
 0.3,
 0.4,
 0.5,
 0.6,
 0.7,
 0.8,
 0.9,
 1.0,
 1.1,
 1.2,
 1.3,
 1.4,
 1.5,
 1.6,
 1.7,
 1.8,
 1.9,
 2.0,
 2.1,
 2.2,
 2.3,
 2.4,
 2.5,
 2.6,
 2.7,
 2.8,
 2.9,
 3.0,
 3.1,
 3.2,
 3.3,
 3.4,
 3.5,
 3.6,
 3.7,
 3.8,
 3.9,
 4.0,
 4.1,
 4.2,
 4.3,
 4.4,
 4.5,
 4.6,
 4.7,
 4.8,
 4.9,
 5.0,
 5.1,
 5.2,
 5.3,
 5.4,
 5.5,
 5.6,
 5.7,
 5.8,
 5.9,
 6.0,
 6.1,
 6.2,
 6.3,
 6.4,
 6.5,
 6.6,
 6.7,
 6.8,
 6.9,
 7.0,
 7.1,
 7.2,
 7.3,
 7.4,
 7.5,
 7.6,
 7.7,
 7.8,
 7.9,
 8.0,
 8.1,
 8.2,
 8.3,
 8.4,
 8.5,
 8.6,
 8.7,
 8.8,
 8.9,
 9.0,
 9.1,
 9.2,
 9.3,
 9.4,
 9.5,
 9.6,
 9.7,
 9.8,
 9.9,
 10.0,
 10.1,
 10.2,
 10.3,
 10.4,
 10.5,
 10.6,
 10.7,
 10.8,
 10.9,
 11.0,
 11.1,
 11.2,
 11.3,
 11.4,
 11.5,
 11.6,
 11.7,
 11.8,
 11.9,
 12.0,
 12.1,
 12.2,
 12.3,
 12.4,
 12.5,
 12.6,
 12.7,
 12.8,
 12.9,
 13.0,
 13.1,
 13.2,
 13.3,
 13.4,
 13.5,
 13.6,
 13.7,
 13.8,
 13.9,
 14.0,
 14.1,
 14.2,
 14.3,
 14.4,
 14.5,
 14.6,
 14.7,
 

In [141]:
dog.Age_upon_Intake.unique()

array(['2 years', '8 years', '11 months', '4 years', '6 years',
       '5 months', '2 months', '18 years', '1 year', '4 months',
       '1 month', '3 years', '5 years', '6 months', '7 years',
       '10 months', '12 years', '10 years', '1 week', '7 months',
       '9 years', '14 years', '9 months', '8 months', '11 years',
       '4 weeks', '3 months', '3 weeks', '0 years', '15 years', '3 days',
       '13 years', '5 weeks', '17 years', '2 days', '2 weeks', '19 years',
       '1 day', '16 years', '6 days', '5 days', '4 days', '1 weeks',
       '20 years', '-1 years', '-3 years', '23 years', '-2 years',
       '24 years'], dtype=object)

Unnamed: 0,Age_upon_Intake,Age_upon_Outcome,Animal_ID,Animal_Type,Breed,Color,DateTime_in,DateTime_out,Date_of_Birth,Found_Location,Intake_Condition,Intake_Type,Name,Outcome_Subtype,Outcome_Type,Sex_upon_Intake,Sex_upon_Outcome,Duration_Days
0,2 years,2 years,A786884,Dog,Beagle Mix,Tricolor,2019-01-03 16:19:00,2019-01-08 15:11:00,01/03/2017,2501 Magin Meadow Dr in Austin (TX),Normal,Stray,*Brock,Partner,Transfer,Neutered Male,Neutered Male,4
1,8 years,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05 12:59:00,2015-07-05 15:13:00,07/05/2007,9409 Bluegrass Dr in Austin (TX),Normal,Stray,Belle,,Return to Owner,Spayed Female,Spayed Female,0
2,11 months,1 year,A724273,Dog,Basenji Mix,Sable/White,2016-04-14 18:43:00,2016-04-21 17:17:00,04/17/2015,2818 Palomino Trail in Austin (TX),Normal,Stray,Runster,,Return to Owner,Intact Male,Neutered Male,6
4,4 years,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29 10:38:00,2014-07-02 14:16:00,06/29/2010,800 Grove Blvd in Austin (TX),Normal,Stray,Rio,,Return to Owner,Neutered Male,Neutered Male,3
5,2 years,2 years,A743852,Dog,Labrador Retriever Mix,Chocolate,2017-02-18 12:46:00,2017-02-21 17:44:00,02/18/2015,Austin (TX),Normal,Owner Surrender,Odin,,Return to Owner,Neutered Male,Neutered Male,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159723,2 months,3 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-11-14 13:39:00,2021-01-11 16:55:00,09/13/2020,Teri Road And Pleasant Valley Road in Austin (TX),Normal,Stray,Bosco,,Adoption,Intact Male,Neutered Male,58
159724,2 months,3 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-11-25 15:23:00,2020-12-23 13:10:00,09/13/2020,Austin (TX),Normal,Owner Surrender,Bosco,,Adoption,Neutered Male,Neutered Male,27
159725,3 months,3 months,A825881,Dog,Bull Terrier Mix,Brown Brindle/White,2020-12-23 13:10:00,2021-01-11 16:55:00,09/13/2020,Austin (TX),Normal,Owner Surrender,Bosco,,Adoption,Neutered Male,Neutered Male,19
159726,2 years,2 years,A828968,Dog,Labrador Retriever,Black,2021-01-30 08:14:00,2021-01-30 12:11:00,01/30/2019,1111 Bastrop Highway in Austin (TX),Normal,Public Assist,Rosie,,Return to Owner,Intact Female,Intact Female,0


In [179]:
def transform_age(df, age_column):
    df = df.loc[df.loc[:, age_column] != 'NULL']
    
    range_column = age_column + '_Period Range'
    
    df[age_column + '_Periods'], df[range_column] = df.loc[:, age_column].str.split(' ').str[0].fillna(0).astype(int), df.loc[:, age_column].str.split(' ').str[1].fillna(0)
    
    
    df[range_column] = np.where(df[range_column].str.contains('day'), 1, 
                                  np.where(df[range_column].str.contains('week'), 7, 
                                           np.where(df[range_column].str.contains('month'), 30, 
                                                    np.where(df[range_column].str.contains('year'), 365, 0)))).astype(int)

    df[age_column + '_(days)'] = df[range_column] * df[age_column + '_Periods']
    df[age_column + '_(years)'] = df[age_column + '_(days)'] / 365
    
    df[age_column + '_age_group'] = pd.cut(df[age_column + '_(years)'], 10)

    return df