In [63]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from config import db_password
import psycopg2 as ps

In [64]:
df = pd.read_csv('all_records.csv')

In [65]:
df.head()

Unnamed: 0.1,Unnamed: 0,Animal ID,Name_intake,DateTime_intake,MonthYear_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,...,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull,DateTime_length,Days_length
0,0,A730601,,2016-07-07 12:11:00,07/07/2016 12:11:00 PM,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,...,0,0,0,0,0,0,0,0,0 days 20:49:00.000000000,0-7 days
1,1,A683644,*Zoey,2014-07-13 11:02:00,07/13/2014 11:02:00 AM,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,...,0,0,0,0,0,0,0,0,115 days 23:04:00.000000000,12 weeks - 6 months
2,2,A676515,Rico,2014-04-11 08:45:00,04/11/2014 08:45:00 AM,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,...,0,0,0,0,0,0,0,1,3 days 09:53:00.000000000,0-7 days
3,3,A742953,,2017-01-31 13:30:00,01/31/2017 01:30:00 PM,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,...,0,0,0,0,0,0,0,0,4 days 00:47:00.000000000,0-7 days
4,4,A679549,*Gilbert,2014-05-22 15:43:00,05/22/2014 03:43:00 PM,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,...,0,0,0,0,0,0,0,0,24 days 22:11:00.000000000,3-6 weeks


In [66]:
# Check columns
df.columns

Index(['Unnamed: 0', 'Animal ID', 'Name_intake', 'DateTime_intake',
       'MonthYear_intake', 'Found_Location', 'Intake_Type', 'IntakeCondition',
       'Animal_Type_intake', 'Sex', 'Age', 'Breed_intake', 'Color_intake',
       'Name_outcome', 'DateTime_outcome', 'MonthYear_outcome', 'Outcome_Type',
       'Outcome_Subtype', 'Sex_upon_Outcome', 'Age_upon_Outcome',
       'gender_intake', 'gender_outcome', 'fixed_intake', 'fixed_outcome',
       'fixed_changed', 'Age_Bucket', 'retriever', 'shepherd', 'beagle',
       'terrier', 'boxer', 'poodle', 'rottweiler', 'dachshund', 'chihuahua',
       'pit bull', 'DateTime_length', 'Days_length'],
      dtype='object')

In [67]:
# Dataframe null values
df.isnull().sum()

Unnamed: 0                0
Animal ID                 0
Name_intake           19484
DateTime_intake           0
MonthYear_intake          0
Found_Location            0
Intake_Type               0
IntakeCondition           0
Animal_Type_intake        0
Sex                       1
Age                       0
Breed_intake              0
Color_intake              0
Name_outcome          19484
DateTime_outcome          0
MonthYear_outcome         0
Outcome_Type              7
Outcome_Subtype       45254
Sex_upon_Outcome          4
Age_upon_Outcome         21
gender_intake          5608
gender_outcome         5611
fixed_intake              1
fixed_outcome             4
fixed_changed             0
Age_Bucket                0
retriever                 0
shepherd                  0
beagle                    0
terrier                   0
boxer                     0
poodle                    0
rottweiler                0
dachshund                 0
chihuahua                 0
pit bull            

In [68]:
# Drop unnecessary columns
# Drop Name_intake since there is a Intake_type column
# Drop all single breed type column since we need to do our own encoding for breed_intake
df=df.drop(columns=['Unnamed: 0','Name_outcome','MonthYear_intake','MonthYear_outcome',
                    'gender_intake','gender_outcome','fixed_intake','fixed_outcome','Days_length','Outcome_Subtype',
                   'retriever','shepherd','beagle','terrier','boxer','poodle','rottweiler','dachshund','chihuahua',
                   'pit bull','Age','Age_upon_Outcome'])
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0 days 20:49:00.000000000
1,A683644,*Zoey,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115 days 23:04:00.000000000
2,A676515,Rico,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3 days 09:53:00.000000000
3,A742953,,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4 days 00:47:00.000000000
4,A679549,*Gilbert,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24 days 22:11:00.000000000


## Name_intake_column

In [69]:
# Change Name_Intake to Yes or No
df.Name_intake = df.Name_intake.apply(lambda x: 'No' if pd.isnull(x) else 'Yes')
df.Name_intake.value_counts()

Yes    57493
No     19484
Name: Name_intake, dtype: int64

In [70]:
df.isnull().sum()

Animal ID             0
Name_intake           0
DateTime_intake       0
Found_Location        0
Intake_Type           0
IntakeCondition       0
Animal_Type_intake    0
Sex                   1
Breed_intake          0
Color_intake          0
DateTime_outcome      0
Outcome_Type          7
Sex_upon_Outcome      4
fixed_changed         0
Age_Bucket            0
DateTime_length       0
dtype: int64

In [71]:
df=df.dropna()

In [72]:
df.isnull().sum()

Animal ID             0
Name_intake           0
DateTime_intake       0
Found_Location        0
Intake_Type           0
IntakeCondition       0
Animal_Type_intake    0
Sex                   0
Breed_intake          0
Color_intake          0
DateTime_outcome      0
Outcome_Type          0
Sex_upon_Outcome      0
fixed_changed         0
Age_Bucket            0
DateTime_length       0
dtype: int64

In [73]:
# Check for duplicates rows
count=0
for i in df['Animal ID'].duplicated():
    if i is True:
        count+=1
count

19762

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76968 entries, 0 to 76976
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Animal ID           76968 non-null  object
 1   Name_intake         76968 non-null  object
 2   DateTime_intake     76968 non-null  object
 3   Found_Location      76968 non-null  object
 4   Intake_Type         76968 non-null  object
 5   IntakeCondition     76968 non-null  object
 6   Animal_Type_intake  76968 non-null  object
 7   Sex                 76968 non-null  object
 8   Breed_intake        76968 non-null  object
 9   Color_intake        76968 non-null  object
 10  DateTime_outcome    76968 non-null  object
 11  Outcome_Type        76968 non-null  object
 12  Sex_upon_Outcome    76968 non-null  object
 13  fixed_changed       76968 non-null  int64 
 14  Age_Bucket          76968 non-null  object
 15  DateTime_length     76968 non-null  object
dtypes: int64(1), object(15

In [75]:
# Drop duplicates
df=df.drop_duplicates(subset='Animal ID')

In [76]:
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0 days 20:49:00.000000000
1,A683644,Yes,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115 days 23:04:00.000000000
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3 days 09:53:00.000000000
3,A742953,No,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4 days 00:47:00.000000000
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24 days 22:11:00.000000000


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57206 entries, 0 to 76975
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Animal ID           57206 non-null  object
 1   Name_intake         57206 non-null  object
 2   DateTime_intake     57206 non-null  object
 3   Found_Location      57206 non-null  object
 4   Intake_Type         57206 non-null  object
 5   IntakeCondition     57206 non-null  object
 6   Animal_Type_intake  57206 non-null  object
 7   Sex                 57206 non-null  object
 8   Breed_intake        57206 non-null  object
 9   Color_intake        57206 non-null  object
 10  DateTime_outcome    57206 non-null  object
 11  Outcome_Type        57206 non-null  object
 12  Sex_upon_Outcome    57206 non-null  object
 13  fixed_changed       57206 non-null  int64 
 14  Age_Bucket          57206 non-null  object
 15  DateTime_length     57206 non-null  object
dtypes: int64(1), object(15

## Outcome_Type column

In [78]:
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0 days 20:49:00.000000000
1,A683644,Yes,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115 days 23:04:00.000000000
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3 days 09:53:00.000000000
3,A742953,No,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4 days 00:47:00.000000000
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24 days 22:11:00.000000000


In [79]:
df.Outcome_Type.value_counts()

Adoption           22988
Transfer           18460
Return to Owner     9806
Euthanasia          5100
Died                 533
Disposal             254
Missing               37
Rto-Adopt             15
Relocate              13
Name: Outcome_Type, dtype: int64

In [80]:
# Combine Died & Disposal Outcomes
df.loc[df.Outcome_Type == 'Disposal', 'Outcome_Type'] = 'Died'
df.Outcome_Type.value_counts()

Adoption           22988
Transfer           18460
Return to Owner     9806
Euthanasia          5100
Died                 787
Missing               37
Rto-Adopt             15
Relocate              13
Name: Outcome_Type, dtype: int64

In [81]:
# Combine Transfer & Relocate Outcomes
df.loc[df.Outcome_Type == 'Relocate', 'Outcome_Type'] = 'Transfer'
df.Outcome_Type.value_counts()

Adoption           22988
Transfer           18473
Return to Owner     9806
Euthanasia          5100
Died                 787
Missing               37
Rto-Adopt             15
Name: Outcome_Type, dtype: int64

In [82]:
#### Combine Rto-Adopt & Return to Owner
df.loc[df.Outcome_Type == 'Rto-Adopt', 'Outcome_Type'] = 'Return to Owner'
df.Outcome_Type.value_counts()

Adoption           22988
Transfer           18473
Return to Owner     9821
Euthanasia          5100
Died                 787
Missing               37
Name: Outcome_Type, dtype: int64

## DateTime_intake column

In [83]:
# Update DateTime_intake to datatime datatype
df.DateTime_intake = pd.to_datetime(df.DateTime_intake, format='%Y-%m-%d')
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0 days 20:49:00.000000000
1,A683644,Yes,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115 days 23:04:00.000000000
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3 days 09:53:00.000000000
3,A742953,No,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4 days 00:47:00.000000000
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24 days 22:11:00.000000000


## DateTime_outcome column

In [84]:
# Update DateTime_outcome to datatime datatype
df.DateTime_outcome = pd.to_datetime(df.DateTime_outcome, format='%Y-%m-%d')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57206 entries, 0 to 76975
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Animal ID           57206 non-null  object        
 1   Name_intake         57206 non-null  object        
 2   DateTime_intake     57206 non-null  datetime64[ns]
 3   Found_Location      57206 non-null  object        
 4   Intake_Type         57206 non-null  object        
 5   IntakeCondition     57206 non-null  object        
 6   Animal_Type_intake  57206 non-null  object        
 7   Sex                 57206 non-null  object        
 8   Breed_intake        57206 non-null  object        
 9   Color_intake        57206 non-null  object        
 10  DateTime_outcome    57206 non-null  datetime64[ns]
 11  Outcome_Type        57206 non-null  object        
 12  Sex_upon_Outcome    57206 non-null  object        
 13  fixed_changed       57206 non-null  int64     

## DateTime_length column

In [85]:
# Create a new column and calculate the length of stay
df['datetime_length'] = df.DateTime_outcome - df.DateTime_intake
df['datetime_length'] = df.datetime_length / np.timedelta64(1, 'D')

In [86]:
# Check values for datetime_length column
df['datetime_length'].value_counts()

0.009722     72
0.005556     64
0.013194     64
0.031250     64
0.007639     63
             ..
3.518750      1
56.763889     1
8.847222      1
25.955556     1
95.159028     1
Name: datetime_length, Length: 25060, dtype: int64

## delete all negative values

In [87]:
df.drop(df[df['datetime_length'] < 0].index, inplace=True)

In [88]:
# Drop original DateTime_length column
df.drop(['DateTime_length'], axis=1, inplace=True)

In [89]:
# Rename calculated length of stay column
df.rename(columns = {'datetime_length':'DateTime_length'}, inplace = True)

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55806 entries, 0 to 76975
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Animal ID           55806 non-null  object        
 1   Name_intake         55806 non-null  object        
 2   DateTime_intake     55806 non-null  datetime64[ns]
 3   Found_Location      55806 non-null  object        
 4   Intake_Type         55806 non-null  object        
 5   IntakeCondition     55806 non-null  object        
 6   Animal_Type_intake  55806 non-null  object        
 7   Sex                 55806 non-null  object        
 8   Breed_intake        55806 non-null  object        
 9   Color_intake        55806 non-null  object        
 10  DateTime_outcome    55806 non-null  datetime64[ns]
 11  Outcome_Type        55806 non-null  object        
 12  Sex_upon_Outcome    55806 non-null  object        
 13  fixed_changed       55806 non-null  int64     

In [91]:
df.head(10)

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0.867361
1,A683644,Yes,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115.961111
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3.411806
3,A742953,No,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4.032639
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24.924306
5,A683798,Yes,2016-07-21 12:16:00,3118 Windsor Rd in Austin (TX),Stray,Normal,Cat,Spayed Female,Domestic Medium Hair Mix,White/Black,2016-10-18 10:55:00,Adoption,Spayed Female,0,1-3 years,88.94375
9,A683656,No,2014-07-13 13:20:00,8238 Research Blvd in Austin (TX),Stray,Normal,Cat,Intact Male,Snowshoe Mix,Lynx Point,2014-07-17 16:57:00,Adoption,Neutered Male,1,1-6 months,4.150694
10,A709749,Yes,2015-08-12 18:29:00,4800 Weletka Dr in Austin (TX),Stray,Normal,Cat,Intact Female,Domestic Shorthair Mix,Calico,2015-09-22 12:49:00,Transfer,Spayed Female,1,1-3 years,40.763889
11,A692161,Yes,2014-11-15 15:18:00,Avenue G/42Nd in Austin (TX),Owner Surrender,Normal,Dog,Intact Male,Pit Bull Mix,Brown/White,2014-11-21 18:55:00,Adoption,Neutered Male,1,1-6 months,6.150694
12,A733551,Yes,2016-08-23 14:35:00,183 And Cameron in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Brown Tabby/White,2016-09-01 00:00:00,Transfer,Intact Male,0,1-6 months,8.392361


## Intake condition column

In [92]:
df.IntakeCondition.value_counts()

Normal      48329
Injured      3022
Sick         2456
Nursing      1523
Aged          250
Other         112
Feral          72
Pregnant       42
Name: IntakeCondition, dtype: int64

In [93]:
# Combine Aged and Feral with Other for IntakeCondition
df.loc[(df.IntakeCondition == 'Aged') | (df.IntakeCondition == 'Feral'), 'IntakeCondition'] = 'Other'

In [94]:
# Combine Injured and Sick for Intake Condition
df.loc[(df.IntakeCondition == 'Injured') | (df.IntakeCondition == 'Sick'), 'IntakeCondition'] = 'Medical'

In [95]:
# Combine Pregnant and Nursing for  IntakeCondition
df.loc[(df.IntakeCondition == 'Nursing') | (df.IntakeCondition == 'Pregnant'), 'IntakeCondition' ] = 'Maternity'
df.IntakeCondition.value_counts()

Normal       48329
Medical       5478
Maternity     1565
Other          434
Name: IntakeCondition, dtype: int64

## Animal_Type_intake column

In [96]:
df.Animal_Type_intake.value_counts()

Dog          30040
Cat          22059
Other         3449
Bird           250
Livestock        8
Name: Animal_Type_intake, dtype: int64

In [97]:
# Combine Bird and Livestock with Other for Animal_Type_intake
df.loc[(df.Animal_Type_intake == 'Bird') | (df.Animal_Type_intake == 'Livestock'), 'Animal_Type_intake'] = 'Other'
df.Animal_Type_intake.value_counts()

Dog      30040
Cat      22059
Other     3707
Name: Animal_Type_intake, dtype: int64

## Sex column

In [98]:
df.Sex.value_counts()

Intact Male      18780
Intact Female    17711
Neutered Male     7301
Spayed Female     6476
Unknown           5538
Name: Sex, dtype: int64

In [99]:
df.rename(columns={'Sex': 'Sex_Intake'}, inplace=True)

## Sex_upon_Outcome

In [100]:
df.Sex_upon_Outcome.value_counts()

Neutered Male    18906
Spayed Female    17201
Intact Male       7175
Intact Female     6986
Unknown           5538
Name: Sex_upon_Outcome, dtype: int64

## Found_Location column

In [101]:
counts=df['Found_Location'].value_counts()
counts

Austin (TX)                                       8896
Travis (TX)                                        587
7201 Levander Loop in Austin (TX)                  406
Del Valle (TX)                                     275
Outside Jurisdiction                               271
                                                  ... 
E 7Th St & N Pleasant Valley Rd in Austin (TX)       1
Imperial Drive in Austin (TX)                        1
Airport And Manor in Austin (TX)                     1
25715 Cliff Cir in Austin (TX)                       1
2301 Wagon Crossing in Austin (TX)                   1
Name: Found_Location, Length: 27060, dtype: int64

In [102]:
new_location=[]
for i in df['Found_Location']:
    if ' in' in i and i.split()[0].isdigit() == True:
            i=i
    else:
        i=np.nan
    new_location.append(i)

In [103]:
new_location

['1109 Shady Ln in Austin (TX)',
 nan,
 '615 E. Wonsley in Austin (TX)',
 nan,
 '124 W Anderson in Austin (TX)',
 '3118 Windsor Rd in Austin (TX)',
 '8238 Research Blvd in Austin (TX)',
 '4800 Weletka Dr in Austin (TX)',
 nan,
 '183 And Cameron in Austin (TX)',
 '6808 S Ih 35 Frontage Rd in Austin (TX)',
 '8413 Danville Dr in Austin (TX)',
 '11602 Gunsmoke Circle in Austin (TX)',
 '2940 Eckert St in Austin (TX)',
 nan,
 nan,
 nan,
 '1506 Thorneridge Road in Austin (TX)',
 nan,
 '1830 W Rundberg Ln in Austin (TX)',
 nan,
 nan,
 '9308 N Lamar in Austin (TX)',
 nan,
 nan,
 '1128 Spur in Austin (TX)',
 nan,
 '12505 Rampart St in Austin (TX)',
 '4502 Hudson Bend Rd in Travis (TX)',
 nan,
 '609 W Lynn St #1 in Austin (TX)',
 '704 Hyde Park Ct in Austin (TX)',
 '13096 N 183 in Austin (TX)',
 '701 W Longspur in Austin (TX)',
 nan,
 nan,
 '2203 Singletree Ave in Austin (TX)',
 nan,
 '11417 Trails End in Travis (TX)',
 nan,
 '7607 Glen Hill Cove in Austin (TX)',
 '2202 Galway St in Austin (TX)',

In [104]:
df['Found_Location']=new_location
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex_Intake,Breed_intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0.867361
1,A683644,Yes,2014-07-13 11:02:00,,Owner Surrender,Maternity,Dog,Intact Female,Border Collie Mix,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115.961111
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Pit Bull Mix,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3.411806
3,A742953,No,2017-01-31 13:30:00,,Stray,Normal,Dog,Intact Male,Saluki,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4.032639
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Domestic Shorthair Mix,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24.924306


In [105]:
df['Found_Location'].value_counts()

7201 Levander Loop in Austin (TX)            406
4434 Frontier Trl in Austin (TX)             137
124 W Anderson Ln in Austin (TX)             131
12034 Research in Austin (TX)                 85
12034 Research Blvd in Austin (TX)            85
                                            ... 
10017 Wind Dunes Dr in Austin (TX)             1
6600 Antelope Cir in Austin (TX)               1
9100 Waterford Centre Blvd in Austin (TX)      1
14905 Mossycup Ln in Austin (TX)               1
12901 N Ih-35 in Austin (TX)                   1
Name: Found_Location, Length: 18559, dtype: int64

In [106]:
df.isnull().sum()

Animal ID                 0
Name_intake               0
DateTime_intake           0
Found_Location        22200
Intake_Type               0
IntakeCondition           0
Animal_Type_intake        0
Sex_Intake                0
Breed_intake              0
Color_intake              0
DateTime_outcome          0
Outcome_Type              0
Sex_upon_Outcome          0
fixed_changed             0
Age_Bucket                0
DateTime_length           0
dtype: int64

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55806 entries, 0 to 76975
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Animal ID           55806 non-null  object        
 1   Name_intake         55806 non-null  object        
 2   DateTime_intake     55806 non-null  datetime64[ns]
 3   Found_Location      33606 non-null  object        
 4   Intake_Type         55806 non-null  object        
 5   IntakeCondition     55806 non-null  object        
 6   Animal_Type_intake  55806 non-null  object        
 7   Sex_Intake          55806 non-null  object        
 8   Breed_intake        55806 non-null  object        
 9   Color_intake        55806 non-null  object        
 10  DateTime_outcome    55806 non-null  datetime64[ns]
 11  Outcome_Type        55806 non-null  object        
 12  Sex_upon_Outcome    55806 non-null  object        
 13  fixed_changed       55806 non-null  int64     

## Age Bucket column

In [108]:
df.Age_Bucket.value_counts()

1-3 years           22095
1-6 months          14259
1-6 weeks            6352
4-6 years            5507
7+ years             3800
7-12 months          2618
Less than 1 week     1175
Name: Age_Bucket, dtype: int64

## Intake_Type column

In [109]:
df.Intake_Type.value_counts()

Stray                 40557
Owner Surrender        9080
Public Assist          3188
Wildlife               2750
Euthanasia Request      231
Name: Intake_Type, dtype: int64

## Breed_Intake column

In [110]:
count_breed=df.Breed_intake.value_counts()
count_breed

Domestic Shorthair Mix                17520
Pit Bull Mix                           3791
Chihuahua Shorthair Mix                3448
Labrador Retriever Mix                 2951
Domestic Medium Hair Mix               1725
                                      ...  
German Shepherd/Smooth Fox Terrier        1
Rottweiler/English Bulldog                1
Miniature Poodle/Beagle                   1
Australian Kelpie/Whippet                 1
Maltese/Papillon                          1
Name: Breed_intake, Length: 1929, dtype: int64

In [111]:
for i in count_breed.index.tolist():
    print(i)

Domestic Shorthair Mix
Pit Bull Mix
Chihuahua Shorthair Mix
Labrador Retriever Mix
Domestic Medium Hair Mix
German Shepherd Mix
Bat Mix
Domestic Longhair Mix
Siamese Mix
Australian Cattle Dog Mix
Bat
Dachshund Mix
Miniature Poodle Mix
Border Collie Mix
Boxer Mix
Raccoon Mix
Australian Shepherd Mix
Rat Terrier Mix
Jack Russell Terrier Mix
Domestic Shorthair
Catahoula Mix
Yorkshire Terrier Mix
Miniature Schnauzer Mix
Chihuahua Longhair Mix
Beagle Mix
Siberian Husky Mix
Cairn Terrier Mix
Great Pyrenees Mix
Rottweiler Mix
Rabbit Sh Mix
Shih Tzu Mix
Pointer Mix
American Bulldog Mix
Chihuahua Shorthair/Dachshund
Raccoon
Staffordshire Mix
Australian Kelpie Mix
Plott Hound Mix
Black Mouth Cur Mix
Chihuahua Shorthair
Pit Bull
Labrador Retriever
Labrador Retriever/Pit Bull
American Staffordshire Terrier Mix
Anatol Shepherd Mix
Opossum Mix
Dachshund/Chihuahua Shorthair
German Shepherd
Snowshoe Mix
Golden Retriever Mix
Queensland Heeler Mix
Miniature Pinscher Mix
Maltese Mix
Chow Chow Mix
Domestic

Redbone Hound/Boxer
Soft Coated Wheaten Terrier/Chihuahua Shorthair
Yorkshire Terrier/Yorkshire Terrier
Whippet/Whippet
Border Terrier/Chihuahua Longhair
Chihuahua Longhair/Chihuahua Shorthair
Golden Retriever/Akita
Bruss Griffon/Maltese
Dutch/Rabbit Sh
Australian Kelpie/Rat Terrier
Hawk Mix
Boykin Span/Dachshund
German Shepherd/English Coonhound
Australian Cattle Dog/Rat Terrier
Cairn Terrier/Dachshund Wirehair
Redbone Hound/Australian Cattle Dog
Bernese Mountain Dog/Rottweiler
Boxer/Queensland Heeler
Mastiff/German Shepherd
Chihuahua Longhair/Italian Greyhound
Cardigan Welsh Corgi/Rat Terrier
Chinese Sharpei/Beagle
Chihuahua Shorthair/Cirneco
Yorkshire Terrier/Soft Coated Wheaten Terrier
Border Collie/Bull Terrier
Queensland Heeler/Basset Hound
Harrier/Pointer
American Bulldog/Chinese Sharpei
Papillon/Pomeranian
Plott Hound/Bull Terrier
Cavalier Span/Border Collie
Whippet/Anatol Shepherd
Vizsla/German Shepherd
American Eskimo/Alaskan Husky
Afghan Hound Mix
Beagle/Parson Russell Terri

In [112]:
# Find Mix breeds
new_breed=[]
for i in df['Breed_intake']:
    if 'Mix' in i:
        i='Mix'
    elif "/" in i:
        i="Mix"
    elif 'Chihuahua' in i:
        i = 'Chihuahua'
    elif 'Retriever' in i or i=='Chesa Bay Retr':
        i = 'Retriever Breeds'
    elif 'Bull' in i or 'Staffordshire' in i or i=='Chinese Sharpei' or i=='Dogo Argentino' or i=='Boxer':
        i = 'Bully Breeds'
    elif 'Terrier' in i:
        i= 'Terrier Breeds'
    elif i=='Bat' or i=='Raccoon' or i=='Opossum' or i=='Duck' or i=='Fox' or i=='Grackle' or i=='Hawk' or i=='Coyote' or i=='Pigeon' or i=='Dove' or i=='Armadillo' or i=='Owl' or i=='Skunk' or i=='Squirrel' or i=='Mockingbird' or i=='Heron' or i=='Sparrow':
        i = 'Wildlife'
    elif i=='German Shepherd' or i=='Belgian Malinois':
        i = 'German Shepherd'
    elif 'Pointer' in i or 'Span' in i or i=='Vizsla' or i=='Brittany' or i=='Shiba Inu' or i=='Dalmatian' or i=='Standard Poodle' or i=='Carolina Dog' or i=='Weimaraner':
        i = 'Sporting Breeds'
    elif 'hound' in i or 'Hound' in i or i=='Beagle' or i=='Saluki' or 'Coon' in i or i=='Catahoula' or i=='Black Mouth Cur' or i=='Harrier' or i=='Blue Lacy' or i=='Treeing Tennesse Brindle' or i=='Whippet':
        i = 'Hound Breeds'
    elif 'Husky' in i or 'Malamute' in i or 'Eskimo' in i or i=='Finnish Spitz' or i=='Samoyed' or i=='Keeshond' or i=='Jindo':
        i = 'Husky Breeds'
    elif i=='Shih Tzu' or i=='Miniature Poodle' or i=='Miniature Schnauzer' or i=='Maltese' or i=='Pomeranian' or i=='Lhasa Apso' or i=='Toy Poodle' or i=='Pekingese' or i=='Bichon Frise' or i=='Cavalier Span' or i=='West Highland' or i=='Papillon' or i=='Havanese' or i=='Japanese Chin' or i=='Dandie Dinmont' or i=='Bruss Griffon' or i=='Coton De Tulear' or 'Dachshund' in i or i=='French Bulldog' or i=='Pug' or i=='Miniature Pinscher' or i=='Pbgv':
        i = 'Toy Breeds'
    elif 'Australian' in i or 'Collie' in i or 'Heeler' in i or 'Sheepdog' in i or i=='English Shepherd' or i=='Dutch Shepherd' or 'Corgi' in i or i=='Beauceron' or i=='Hovawart':
        i = 'Herding Breeds'
    elif i=='Chinese Sharpei' or i=='Chow Chow' or i=='Standard Schnauzer' or i=='Basenji' or i=='Rottweiler' or i=='Doberman Pinsch':
        i = 'Working Breeds'
    elif i=='Great Pyrenees' or i=='Great Dane' or i=='Anatol Shepherd' or i=='Cane Corso' or i=='Rhod Ridgeback' or i=='Akita' or i=='Mastiff' or i=='Presa Canario' or 'Bernese' in i or i=='Burmese' or 'Bernard' in i or i=='Leonberger' or i=='Greater Swiss Mountain Dog' or i=='Boerboel'or i=='Landseer':
        i = 'X Large Breeds'
    elif i=='Domestic Shorthair':
        i = 'Domestic Shorthair'
    elif i=='Domestic Medium Hair':
        i = 'Domestic Medium Hair'
    elif i=='Persian' or i=='Siamese' or i=='Domestic Longhair' or i=='Persian' or i=='Himalayan' or i=='Russian Blue' or i=='Bengal' or i=='Devon Rex' or i=='Sphynx' or i=='British Shorthair' or i=='Manx':
        i = 'Other Cat'
    else:
        i='Other'
    new_breed.append(i)
new_breed

['Mix',
 'Mix',
 'Mix',
 'Hound Breeds',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Toy Breeds',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Wildlife',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Hound Breeds',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 'Mix',
 '

In [113]:
df["new_breed"]=new_breed

In [114]:
breed_counts=df['new_breed'].value_counts()
breed_counts

Mix                     51933
Wildlife                 1060
Toy Breeds                446
Bully Breeds              318
Domestic Shorthair        300
Other                     186
Terrier Breeds            173
Retriever Breeds          172
Herding Breeds            167
Chihuahua                 166
Hound Breeds              153
German Shepherd           138
Domestic Medium Hair      109
Working Breeds            107
Other Cat                 104
X Large Breeds             99
Sporting Breeds            96
Husky Breeds               79
Name: new_breed, dtype: int64

In [115]:
# drop and rename
df=df.drop(columns=['Breed_intake'])
df=df.rename(columns={"new_breed": "Breed_Type"})
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex_Intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length,Breed_Type
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Blue Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0.867361,Mix
1,A683644,Yes,2014-07-13 11:02:00,,Owner Surrender,Maternity,Dog,Intact Female,Brown/White,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115.961111,Mix
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,White/Brown,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3.411806,Mix
3,A742953,No,2017-01-31 13:30:00,,Stray,Normal,Dog,Intact Male,Sable/Cream,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4.032639,Hound Breeds
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Black/White,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24.924306,Mix


## Color_intake column

In [116]:
color_counts=df['Color_intake'].value_counts()
color_counts

Black/White                5698
Black                      4720
Brown Tabby                3360
Brown                      2583
White                      1964
                           ... 
Torbie/Calico                 1
Chocolate/Gold                1
Cream/Cream                   1
Tortie Point/Lynx Point       1
Blue/Yellow                   1
Name: Color_intake, Length: 478, dtype: int64

In [117]:
for i in color_counts.index:
    print(i)

Black/White
Black
Brown Tabby
Brown
White
Brown Tabby/White
Orange Tabby
Brown/White
Tan/White
Tricolor
White/Black
Blue/White
Tan
Black/Tan
Tortie
White/Brown
Black/Brown
Calico
Blue
Brown/Black
Blue Tabby
Brown Brindle/White
Orange Tabby/White
White/Tan
Red
Torbie
Red/White
Brown Brindle
Blue Tabby/White
Tan/Black
Chocolate/White
Cream Tabby
Yellow
Gray
Sable
Lynx Point
Cream
Buff
Seal Point
Chocolate
Gray/Black
White/Blue
Gray/White
White/Brown Tabby
White/Brown Brindle
Black/Gray
Fawn/White
Sable/White
Cream Tabby/White
White/Gray
Black Tabby
Brown/Tan
Flame Point
Blue Merle
Red/Black
Torbie/White
Fawn
Chocolate/Tan
White/Red
Black Brindle/White
Cream/White
Gold
Yellow/White
White/Tricolor
White/Orange Tabby
Black Smoke
Black/Brown Brindle
Gray Tabby
Blue Merle/White
Buff/White
Black/Tricolor
White/Cream
Lilac Point
White/Chocolate
Red/Tan
Tan/Brown
White/Orange
Brown Merle
Orange/White
Blue/Tan
White/Buff
Black Tabby/White
Tortie Point
Black Brindle
Tortie/White
Orange
Brown Merle

In [118]:
colorNew=[]
for i in df.Color_intake:
    if "/" in i:
        i="Bicolor"
    elif "Tabby" in i:
        i='Tabby'
    elif "Brindle" in i:
        i="Tabby"
    elif "Merle" in i:
        i="Merle"
    elif "Tiger" in i:
        i="Tiger"
    elif "Tortie" in i:
        i="Tabby"
    elif "Calico" in i:
        i="Tabby"
    elif "Torbie" in i:
        i="Tabby"
    elif i=="Apricot" or i=="Gold" or i=="Yellow" or i=="Fawn":
        i="Orange"
    elif "Blue" in i:
        i="Blue"
    elif "Black" in i or i=="Sable":
        i="Black"
    elif i=="Liver":
        i="Brown"
    elif "Point" in i:
        i='Point'
    elif "Tick" in i:
        i="Point"
    else:
        i=i
    colorNew.append(i)
print(set(colorNew))
print(len(set(colorNew)))

{'Bicolor', 'Tabby', 'Red', 'Tan', 'Tricolor', 'Brown', 'Gray', 'Point', 'Agouti', 'Chocolate', 'Tiger', 'Blue', 'Black', 'Merle', 'Orange', 'Pink', 'White', 'Green', 'Silver', 'Cream', 'Buff'}
21


In [119]:
df['Color_intake']=colorNew

In [120]:
color_counts=df['Color_intake'].value_counts()
color_counts

Bicolor      28302
Tabby         9887
Black         5173
Brown         2585
White         1964
Tricolor      1476
Tan           1360
Blue          1092
Point          912
Red            752
Orange         690
Gray           379
Cream          307
Buff           301
Chocolate      281
Merle          245
Pink            29
Silver          28
Green           20
Tiger           14
Agouti           9
Name: Color_intake, dtype: int64

In [121]:
replace_color=color_counts.loc[color_counts<500].index.tolist()
for i in replace_color:
    df.Color_intake = df.Color_intake.replace(i,"Other")

df.Color_intake.value_counts()

Bicolor     28302
Tabby        9887
Black        5173
Brown        2585
White        1964
Other        1613
Tricolor     1476
Tan          1360
Blue         1092
Point         912
Red           752
Orange        690
Name: Color_intake, dtype: int64

In [122]:
df.head()

Unnamed: 0,Animal ID,Name_intake,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex_Intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length,Breed_Type
0,A730601,No,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0.867361,Mix
1,A683644,Yes,2014-07-13 11:02:00,,Owner Surrender,Maternity,Dog,Intact Female,Bicolor,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115.961111,Mix
2,A676515,Yes,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Bicolor,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3.411806,Mix
3,A742953,No,2017-01-31 13:30:00,,Stray,Normal,Dog,Intact Male,Bicolor,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4.032639,Hound Breeds
4,A679549,Yes,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Bicolor,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24.924306,Mix


## Divide to two dataframes (intake and outcome)

In [124]:
intake_df=df[['Animal ID','DateTime_intake','Found_Location','Intake_Type','IntakeCondition',
             'Animal_Type_intake',
              'Name_intake',
             'Sex_Intake',
             'Color_intake',
             'Breed_Type']]

columns_intake=['animal_id','datetime_intake','Found_Location','intake_type','intake_condition','animal_type_intake',
        'name_intake','sex_intake','color_intake','breed_type']

intake_df.columns =columns_intake
intake_df.head()

Unnamed: 0,animal_id,datetime_intake,Found_Location,intake_type,intake_condition,animal_type_intake,name_intake,sex_intake,color_intake,breed_type
0,A730601,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,No,Intact Male,Tabby,Mix
1,A683644,2014-07-13 11:02:00,,Owner Surrender,Maternity,Dog,Yes,Intact Female,Bicolor,Mix
2,A676515,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Yes,Intact Male,Bicolor,Mix
3,A742953,2017-01-31 13:30:00,,Stray,Normal,Dog,No,Intact Male,Bicolor,Hound Breeds
4,A679549,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Yes,Intact Male,Bicolor,Mix


In [125]:
intake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55806 entries, 0 to 76975
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   animal_id           55806 non-null  object        
 1   datetime_intake     55806 non-null  datetime64[ns]
 2   Found_Location      33606 non-null  object        
 3   intake_type         55806 non-null  object        
 4   intake_condition    55806 non-null  object        
 5   animal_type_intake  55806 non-null  object        
 6   name_intake         55806 non-null  object        
 7   sex_intake          55806 non-null  object        
 8   color_intake        55806 non-null  object        
 9   breed_type          55806 non-null  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 4.7+ MB


In [126]:
outcome_df=df[['Animal ID','DateTime_outcome',
              'Outcome_Type',
              'Sex_upon_Outcome','fixed_changed','Age_Bucket','DateTime_length']]

columns_outcome=['animal_id','datetime_outcome','outcome_type',
                 'sex_upon_outcome','fixed_changed','age_bucket','datetime_length']

outcome_df.columns=columns_outcome

outcome_df.head()

Unnamed: 0,animal_id,datetime_outcome,outcome_type,sex_upon_outcome,fixed_changed,age_bucket,datetime_length
0,A730601,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0.867361
1,A683644,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115.961111
2,A676515,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3.411806
3,A742953,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4.032639
4,A679549,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24.924306


In [127]:
outcome_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55806 entries, 0 to 76975
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   animal_id         55806 non-null  object        
 1   datetime_outcome  55806 non-null  datetime64[ns]
 2   outcome_type      55806 non-null  object        
 3   sex_upon_outcome  55806 non-null  object        
 4   fixed_changed     55806 non-null  int64         
 5   age_bucket        55806 non-null  object        
 6   datetime_length   55806 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 3.4+ MB


## Connect to SQL

In [128]:
import psycopg2
connection = psycopg2.connect(
    host = 'projectanimal.c2jqqtcm0i1p.us-east-2.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database='projectanimal'
    )
cursor=connection.cursor()

In [132]:
connection.rollback()

In [134]:
#creating tables
cursor.execute("""CREATE TABLE animal_intake(
 animal_id TEXT PRIMARY KEY,
 datetime_intake TIMESTAMP,
 found_location TEXT,
 intake_type TEXT,
 intake_condition TEXT,
 animal_type_intake TEXT,
 name_intake TEXT,
 sex_intake TEXT,
 color_intake TEXT,
 breed_type TEXT
)""")

cursor.execute("""CREATE TABLE animal_outcome(
 animal_id TEXT PRIMARY KEY,
 datetime_outcome TIMESTAMP,
 outcome_type TEXT,
 sex_upon_outcome TEXT,
 fixed_changed TEXT,
 age_bucket TEXT,
 datetime_length TEXT,
 FOREIGN KEY (animal_id) REFERENCES animal_intake(animal_id)
 )""")

connection.commit()

In [135]:
intake_df.to_csv('animal_intake.csv', index=False, sep=';')

In [136]:
outcome_df.to_csv('animal_outcome.csv', index=False, sep=';')

In [137]:
with open('animal_intake.csv', 'r') as row:
    next(row)# Skip the header row.
    cursor.copy_from(row, 'animal_intake', sep=';')
    
connection.commit()

In [138]:
with open('animal_outcome.csv', 'r') as row:
    next(row)# Skip the header row.
    cursor.copy_from(row, 'animal_outcome', sep=';')
    
connection.commit()

In [139]:
sql = '''
select * from animal_intake'''

pd.read_sql(sql,con=connection)

Unnamed: 0,animal_id,datetime_intake,found_location,intake_type,intake_condition,animal_type_intake,name_intake,sex_intake,color_intake,breed_type
0,A730601,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,No,Intact Male,Tabby,Mix
1,A683644,2014-07-13 11:02:00,,Owner Surrender,Maternity,Dog,Yes,Intact Female,Bicolor,Mix
2,A676515,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Yes,Intact Male,Bicolor,Mix
3,A742953,2017-01-31 13:30:00,,Stray,Normal,Dog,No,Intact Male,Bicolor,Hound Breeds
4,A679549,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Yes,Intact Male,Bicolor,Mix
...,...,...,...,...,...,...,...,...,...,...
55801,A746679,2017-04-07 09:44:00,,Stray,Normal,Cat,Yes,Spayed Female,Tabby,Mix
55802,A746725,2017-04-08 11:28:00,,Stray,Normal,Cat,No,Unknown,Bicolor,Mix
55803,A746689,2017-04-07 12:36:00,,Stray,Normal,Dog,Yes,Spayed Female,Bicolor,Mix
55804,A746466,2017-04-03 15:02:00,4858 Yager Ln in Travis (TX),Stray,Normal,Dog,Yes,Intact Male,Bicolor,Mix


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55806 entries, 0 to 76975
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Animal ID           55806 non-null  object        
 1   Name_intake         55806 non-null  object        
 2   DateTime_intake     55806 non-null  datetime64[ns]
 3   Found_Location      33606 non-null  object        
 4   Intake_Type         55806 non-null  object        
 5   IntakeCondition     55806 non-null  object        
 6   Animal_Type_intake  55806 non-null  object        
 7   Sex_Intake          55806 non-null  object        
 8   Color_intake        55806 non-null  object        
 9   DateTime_outcome    55806 non-null  datetime64[ns]
 10  Outcome_Type        55806 non-null  object        
 11  Sex_upon_Outcome    55806 non-null  object        
 12  fixed_changed       55806 non-null  int64         
 13  Age_Bucket          55806 non-null  object    

In [141]:
df.to_csv('results.csv', index=False, sep=',')