In [1]:
# standard libraries for dataset
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw "best in show" CSV (relative path) into a DataFrame
dataset_path = './best_in_show.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Preview the first five rows (5 is head()'s default) to sanity-check the load
df.head()

Unnamed: 0,Dog breed,Unnamed: 1,category,Unnamed: 3,datadog score,POPULARITY IN US,POPULARITY IN US.1,"LIFETIME COST, $",5 LIFETIME COST,Unnamed: 9,...,Unnamed: 59,"Other regular costs, total per lifetime, $","total per year, $","total, per year, £","toys, presents, treats, per year, £","pet sitters, per year, £","grooming, per year, £","vet fees per year, £","kennels per year, £","one offs, $"
0,Additional info,,American Kennel Club group,,,"1- 173, 1 = most poular, US. American Kennel C...",ranking within breeds with full data only,,"in descending rank (higher score = better), n...",,...,,"if no average lifespan data, then 'no data'",converted to US $ using Google Finance,,averaged across all dog breeds. pet food exclu...,,,,,"Initial outlay: leash, collar, ID tag; food di..."
1,Border Collie,,herding,,3.64,45,39,"$20,143",48%,,...,,13095,1046,784.0,121,126.0,244.0,177.0,116.0,200
2,Border Terrier,,terrier,,3.61,80,61,"$22,638",14%,,...,,14643,1046,784.0,121,126.0,244.0,177.0,116.0,200
3,Brittany,,sporting,,3.54,30,30,"$22,589",16%,,...,,13514,1046,784.0,121,126.0,244.0,177.0,116.0,200
4,Cairn Terrier,,terrier,,3.53,59,48,"$21,992",22%,,...,,14476,1046,784.0,121,126.0,244.0,177.0,116.0,200


In [4]:
# Summarise column labels, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 69 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Dog breed                                                         175 non-null    object 
 1   Unnamed: 1                                                        0 non-null      float64
 2   category                                                          173 non-null    object 
 3   Unnamed: 3                                                        0 non-null      float64
 4   datadog score                                                     87 non-null     float64
 5   POPULARITY IN US                                                  173 non-null    object 
 6   POPULARITY IN US.1                                                88 non-null     object 
 7   LIFETIME COST, $                   

#### Dropping useless columns

In [5]:
# Collect every column whose label contains 'Unnamed' (candidates for removal).
# Uses the `in` operator rather than calling the __contains__ dunder directly.
unnamed_col = [col for col in df.columns if 'Unnamed' in col]
print(unnamed_col)

['Unnamed: 1', 'Unnamed: 3', 'Unnamed: 9', 'Unnamed: 24', 'Unnamed: 30', 'Unnamed: 36', 'Unnamed: 46', 'Unnamed: 59']


In [6]:
# Remove the 'Unnamed' columns. Rebinding `df` instead of mutating it in place, together
# with errors='ignore', keeps this cell safe to re-run once the columns are already gone
# (the in-place version raised KeyError on a second run).
df = df.drop(columns=unnamed_col, errors='ignore')

In [7]:
# Re-check the schema after removing the 'Unnamed' columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 61 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Dog breed                                                         175 non-null    object 
 1   category                                                          173 non-null    object 
 2   datadog score                                                     87 non-null     float64
 3   POPULARITY IN US                                                  173 non-null    object 
 4   POPULARITY IN US.1                                                88 non-null     object 
 5   LIFETIME COST, $                                                  172 non-null    object 
 6   5 LIFETIME COST                                                   88 non-null     object 
 7   1 INTELLIGENCE (TRAINABILITY) ranki

In [8]:
# Several columns look like near-duplicates: show each suspected pair side by side.
# 'LIFETIME COST, $' appears twice on purpose, once per pair it belongs to.
df[[
    'POPULARITY IN US', 'POPULARITY IN US.1',
    'LIFETIME COST, $', '5 LIFETIME COST',
    '1 INTELLIGENCE (TRAINABILITY) ranking', 'INTELLIGENCE (TRAINABILITY) ranking',
    '2 LONGEVITY', 'LONGEVITY',
    '3 NO. OF GENETIC AILMENTS', 'GENETIC AILMENTS',
    '4a average purchase price, US$', 'average purchase price, US$',
    '4b food costs per year, US$', 'food costs per year, US$',
    '4 LIFETIME COST, $', 'LIFETIME COST, $',
]]



Unnamed: 0,POPULARITY IN US,POPULARITY IN US.1,"LIFETIME COST, $",5 LIFETIME COST,1 INTELLIGENCE (TRAINABILITY) ranking,INTELLIGENCE (TRAINABILITY) ranking,2 LONGEVITY,LONGEVITY,3 NO. OF GENETIC AILMENTS,GENETIC AILMENTS,"4a average purchase price, US$","average purchase price, US$","4b food costs per year, US$","food costs per year, US$","4 LIFETIME COST, $","LIFETIME COST, $.1"
0,"1- 173, 1 = most poular, US. American Kennel C...",ranking within breeds with full data only,,"in descending rank (higher score = better), n...",1 = easiest to train / learn new commands,"in descending rank (higher score = better), n...","years, weighted average - see note",in ascending rank order (higher score = better...,serious only,0 = 100% ... 9 = 0%,"mean, from puppyfind.com",reverse ranked percentile,,reverse ranked percentile,,
1,45,39,"$20,143",48%,1,100%,12.52,79%,2,78%,$623,71%,$324,68%,17404,"$20,143"
2,80,61,"$22,638",14%,30,70%,14.00,98%,0,100%,$833,44%,$324,68%,19575,"$22,638"
3,30,30,"$22,589",16%,19,80%,12.92,89%,0,100%,$618,72%,$466,23%,19503,"$22,589"
4,59,48,"$21,992",22%,35,61%,13.84,95%,2,78%,$435,95%,$324,68%,18965,"$21,992"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,32,,no data,,21,,no data,,1,,$562,,,,no data,no data
171,99,,no data,,53,,no data,,0,,$843,,,,no data,no data
172,100,,no data,,51,,13.17,,0,,$668,,,,no data,no data
173,92,,no data,,46,,8.80,,0,,$755,,,,no data,no data


#### These column pairs appear to contain the same information expressed in different ways (raw value vs. rank/percentile). To keep the dataframe clear, we will keep only one column from each pair

In [9]:
# Build `df_clean` by removing one column from each near-duplicate pair; working on a
# copy leaves the original `df` untouched.
# NOTE(review): every other pair keeps the raw-value column, but for food costs the
# '4b food costs per year, US$' column is dropped and 'food costs per year, US$' is kept.
# The preview of that kept column shows percentages, not dollar amounts; confirm intent.
df_clean = df.drop(
    columns=[
        'POPULARITY IN US.1',
        '5 LIFETIME COST',
        'INTELLIGENCE (TRAINABILITY) ranking',
        'LONGEVITY',
        'GENETIC AILMENTS',
        'average purchase price, US$',
        '4b food costs per year, US$',
    ]
).copy()
        

In [10]:
# Drop the un-numbered grooming / children columns (note the trailing space in both
# labels); the '5a …' and '5b …' columns remain. Rebinding with errors='ignore' instead of
# inplace=True makes the cell re-runnable without a KeyError.
df_clean = df_clean.drop(
    columns=['grooming required ', 'suitability for children '],
    errors='ignore',
)

In [11]:
# Preview the cleaned frame after the duplicate columns were removed
df_clean.head(5)

Unnamed: 0,Dog breed,category,datadog score,POPULARITY IN US,"LIFETIME COST, $",1 INTELLIGENCE (TRAINABILITY) ranking,2 LONGEVITY,3 NO. OF GENETIC AILMENTS,"4a average purchase price, US$","food costs per year, US$",...,"food per lifetime, $","Other regular costs, total per lifetime, $","total per year, $","total, per year, £","toys, presents, treats, per year, £","pet sitters, per year, £","grooming, per year, £","vet fees per year, £","kennels per year, £","one offs, $"
0,Additional info,American Kennel Club group,,"1- 173, 1 = most poular, US. American Kennel C...",,1 = easiest to train / learn new commands,"years, weighted average - see note",serious only,"mean, from puppyfind.com",reverse ranked percentile,...,food per week x 52.1775 x average lifespan (co...,"if no average lifespan data, then 'no data'",converted to US $ using Google Finance,,averaged across all dog breeds. pet food exclu...,,,,,"Initial outlay: leash, collar, ID tag; food di..."
1,Border Collie,herding,3.64,45,"$20,143",1,12.52,2,$623,68%,...,3486,13095,1046,784.0,121,126.0,244.0,177.0,116.0,200
2,Border Terrier,terrier,3.61,80,"$22,638",30,14.00,0,$833,68%,...,3898,14643,1046,784.0,121,126.0,244.0,177.0,116.0,200
3,Brittany,sporting,3.54,30,"$22,589",19,12.92,0,$618,23%,...,5171,13514,1046,784.0,121,126.0,244.0,177.0,116.0,200
4,Cairn Terrier,terrier,3.53,59,"$21,992",35,13.84,2,$435,68%,...,3854,14476,1046,784.0,121,126.0,244.0,177.0,116.0,200


In [12]:
# List the column labels that remain in the cleaned frame
df_clean.columns

Index(['Dog breed', 'category', 'datadog score', 'POPULARITY IN US',
       'LIFETIME COST, $', '1 INTELLIGENCE (TRAINABILITY) ranking',
       '2 LONGEVITY', '3 NO. OF GENETIC AILMENTS',
       '4a average purchase price, US$', 'food costs per year, US$',
       '5a grooming required ', '5b suitability for children ',
       'ULTIMATE TOP DOG MEGA SCORE (with kids)',
       'ULTIMATE TOP DATA DOG MEGA RANKING - without kids',
       'ULTIMATE TOP DATA DOG MEGA RANKING - with kids',
       'ULTIMATE TOP DOG MEGA SCORE', 'POPULAR RATING', 'size category',
       'weight (kg)', 'weight (lbs)', 'shoulder height (cm)',
       'shoulder height (in)', 'note', 'intelligence category',
       'repetitions to understand new commands',
       'obey first command (% of time)',
       'longevity sample size - survey sample size',
       'GENETIC DISEASES - congenital ailments: summary / areas affected',
       'congenital ailments: summary / areas affected', 'exclude?',
       'most abandoned?', '

In [13]:
# Show the raw frame again for comparison with `df_clean`
df.head(5)

Unnamed: 0,Dog breed,category,datadog score,POPULARITY IN US,POPULARITY IN US.1,"LIFETIME COST, $",5 LIFETIME COST,1 INTELLIGENCE (TRAINABILITY) ranking,INTELLIGENCE (TRAINABILITY) ranking,2 LONGEVITY,...,"food per lifetime, $","Other regular costs, total per lifetime, $","total per year, $","total, per year, £","toys, presents, treats, per year, £","pet sitters, per year, £","grooming, per year, £","vet fees per year, £","kennels per year, £","one offs, $"
0,Additional info,American Kennel Club group,,"1- 173, 1 = most poular, US. American Kennel C...",ranking within breeds with full data only,,"in descending rank (higher score = better), n...",1 = easiest to train / learn new commands,"in descending rank (higher score = better), n...","years, weighted average - see note",...,food per week x 52.1775 x average lifespan (co...,"if no average lifespan data, then 'no data'",converted to US $ using Google Finance,,averaged across all dog breeds. pet food exclu...,,,,,"Initial outlay: leash, collar, ID tag; food di..."
1,Border Collie,herding,3.64,45,39,"$20,143",48%,1,100%,12.52,...,3486,13095,1046,784.0,121,126.0,244.0,177.0,116.0,200
2,Border Terrier,terrier,3.61,80,61,"$22,638",14%,30,70%,14.00,...,3898,14643,1046,784.0,121,126.0,244.0,177.0,116.0,200
3,Brittany,sporting,3.54,30,30,"$22,589",16%,19,80%,12.92,...,5171,13514,1046,784.0,121,126.0,244.0,177.0,116.0,200
4,Cairn Terrier,terrier,3.53,59,48,"$21,992",22%,35,61%,13.84,...,3854,14476,1046,784.0,121,126.0,244.0,177.0,116.0,200


#### There are also several columns that express the same value in both $ and £. We will keep just one of each to simplify the dataframe

In [14]:
# Re-list the remaining column labels before looking for currency duplicates
df_clean.columns

Index(['Dog breed', 'category', 'datadog score', 'POPULARITY IN US',
       'LIFETIME COST, $', '1 INTELLIGENCE (TRAINABILITY) ranking',
       '2 LONGEVITY', '3 NO. OF GENETIC AILMENTS',
       '4a average purchase price, US$', 'food costs per year, US$',
       '5a grooming required ', '5b suitability for children ',
       'ULTIMATE TOP DOG MEGA SCORE (with kids)',
       'ULTIMATE TOP DATA DOG MEGA RANKING - without kids',
       'ULTIMATE TOP DATA DOG MEGA RANKING - with kids',
       'ULTIMATE TOP DOG MEGA SCORE', 'POPULAR RATING', 'size category',
       'weight (kg)', 'weight (lbs)', 'shoulder height (cm)',
       'shoulder height (in)', 'note', 'intelligence category',
       'repetitions to understand new commands',
       'obey first command (% of time)',
       'longevity sample size - survey sample size',
       'GENETIC DISEASES - congenital ailments: summary / areas affected',
       'congenital ailments: summary / areas affected', 'exclude?',
       'most abandoned?', '

In [15]:
# List every column whose label mentions a currency symbol ($ or £).
# Uses the `in` operator rather than calling the __contains__ dunder directly.
[col for col in df_clean.columns if '$' in col or '£' in col]

['LIFETIME COST, $',
 '4a average purchase price, US$',
 'food costs per year, US$',
 '4 LIFETIME COST, $',
 'average purchase price (US $)',
 'lowest (US$)',
 'highest (US$)',
 'food per week, average, £',
 'min, £',
 'max, £',
 'food per week, $',
 'food per year, $',
 'food per lifetime, $',
 'Other regular costs, total per lifetime, $',
 'total  per year, $',
 'total, per year, £',
 'toys, presents, treats, per year, £',
 'pet sitters, per year, £',
 'grooming, per year, £',
 'vet fees per year, £',
 'kennels per year, £',
 'one offs, $']

In [16]:
# Drop two £ columns (the currency listing shows '$' counterparts for food per week and
# total per year). Rebinding with errors='ignore' instead of inplace=True keeps the cell
# re-runnable without a KeyError.
df_clean = df_clean.drop(
    columns=['food per week, average, £', 'total, per year, £'],
    errors='ignore',
)

#### The first row is a description of each column. We'll save it for reference, but drop it from the dataframe itself

In [17]:
# Row label 0 holds the per-column descriptions; keep it as a Series for reference
columns_desc = df_clean.loc[0]

In [18]:
# Remove the description row (index label 0) from the data. Rebinding with
# errors='ignore' instead of inplace=True means a second run is a harmless no-op
# rather than a KeyError.
df_clean = df_clean.drop(index=0, errors='ignore')

In [19]:
# Confirm the description row is gone: the first rows are now real breeds
df_clean.head()

Unnamed: 0,Dog breed,category,datadog score,POPULARITY IN US,"LIFETIME COST, $",1 INTELLIGENCE (TRAINABILITY) ranking,2 LONGEVITY,3 NO. OF GENETIC AILMENTS,"4a average purchase price, US$","food costs per year, US$",...,"food per year, $","food per lifetime, $","Other regular costs, total per lifetime, $","total per year, $","toys, presents, treats, per year, £","pet sitters, per year, £","grooming, per year, £","vet fees per year, £","kennels per year, £","one offs, $"
1,Border Collie,herding,3.64,45,"$20,143",1,12.52,2,$623,68%,...,278,3486,13095,1046,121,126.0,244.0,177.0,116.0,200
2,Border Terrier,terrier,3.61,80,"$22,638",30,14.0,0,$833,68%,...,278,3898,14643,1046,121,126.0,244.0,177.0,116.0,200
3,Brittany,sporting,3.54,30,"$22,589",19,12.92,0,$618,23%,...,400,5171,13514,1046,121,126.0,244.0,177.0,116.0,200
4,Cairn Terrier,terrier,3.53,59,"$21,992",35,13.84,2,$435,68%,...,278,3854,14476,1046,121,126.0,244.0,177.0,116.0,200
5,Welsh Springer Spaniel,sporting,3.34,130,"$20,224",31,12.49,1,$750,68%,...,278,3478,13064,1046,121,126.0,244.0,177.0,116.0,200


#### we convert the remaining columns to the same currency (dollars)

In [23]:
# £-denominated per-year cost columns that still need converting to dollars
columns_to_conv = [
    'toys, presents, treats, per year, £',
    'pet sitters, per year, £',
    'grooming, per year, £',
    'vet fees per year, £',
    'kennels per year, £',
]

In [41]:
# Cast the toys/presents/treats column to float so it can be multiplied by the
# exchange rate like the other cost columns
df_clean = df_clean.astype({'toys, presents, treats, per year, £': 'float'})

In [48]:
# Exchange rate used to express the £ cost columns in US dollars
GBP_TO_USD = 1.3611

# Convert only the columns that still carry a '£' label and relabel them to '$' in the
# same step. Re-running this cell therefore never applies the rate twice, and values are
# never left in dollars under a '£' label. A later rename of the same labels is a no-op.
gbp_cols = [col for col in columns_to_conv if col in df_clean.columns]
if gbp_cols:
    # NOTE(review): fillna(0) records a missing cost as $0; confirm that is intended.
    df_clean[gbp_cols] = df_clean[gbp_cols].fillna(0).mul(GBP_TO_USD).round(2)
    df_clean = df_clean.rename(columns={col: col.replace('£', '$') for col in gbp_cols})

In [54]:
# Relabel the converted cost columns so the currency symbol matches their dollar values
df_clean = df_clean.rename(
    columns={
        'toys, presents, treats, per year, £': 'toys, presents, treats, per year, $',
        'pet sitters, per year, £': 'pet sitters, per year, $',
        'grooming, per year, £': 'grooming, per year, $',
        'vet fees per year, £': 'vet fees per year, $',
        'kennels per year, £': 'kennels per year, $',
    }
)

In [55]:
# Check the converted, relabelled cost columns
df_clean.head(5)

Unnamed: 0,Dog breed,category,datadog score,POPULARITY IN US,"LIFETIME COST, $",1 INTELLIGENCE (TRAINABILITY) ranking,2 LONGEVITY,3 NO. OF GENETIC AILMENTS,"4a average purchase price, US$","food costs per year, US$",...,"food per year, $","food per lifetime, $","Other regular costs, total per lifetime, $","total per year, $","toys, presents, treats, per year, $","pet sitters, per year, $","grooming, per year, $","vet fees per year, $","kennels per year, $","one offs, $"
1,Border Collie,herding,3.64,45,"$20,143",1,12.52,2,$623,68%,...,278,3486,13095,1046,164.69,171.5,332.11,240.91,157.89,200
2,Border Terrier,terrier,3.61,80,"$22,638",30,14.0,0,$833,68%,...,278,3898,14643,1046,164.69,171.5,332.11,240.91,157.89,200
3,Brittany,sporting,3.54,30,"$22,589",19,12.92,0,$618,23%,...,400,5171,13514,1046,164.69,171.5,332.11,240.91,157.89,200
4,Cairn Terrier,terrier,3.53,59,"$21,992",35,13.84,2,$435,68%,...,278,3854,14476,1046,164.69,171.5,332.11,240.91,157.89,200
5,Welsh Springer Spaniel,sporting,3.34,130,"$20,224",31,12.49,1,$750,68%,...,278,3478,13064,1046,164.69,171.5,332.11,240.91,157.89,200
