In [1]:
import numpy as np
import pandas as pd
import re
import requests
from collections import Counter
import dogfunctions as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

In [None]:
# Lift pandas' display truncation so frames render with every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Location of the NYC dog licensing CSV that the next cell loads with pandas.
# NOTE(review): raw.github.com looks like the legacy raw-content host; presumably
# this relies on a redirect to raw.githubusercontent.com - confirm it still resolves.
url = "https://raw.github.com/katyakraft/project_1/main/NYC_Dog_Licensing_Dataset_20240923.csv"

In [None]:

# Read the CSV at `url` into the raw licence frame `df`; every later cleaning
# step in this notebook derives from this frame.
df = pd.read_csv(url)

In [None]:
# Normalise the headers: spaces become underscores and everything is lower-cased.
df.columns = df.columns.str.replace(" ", "_", regex=False).str.lower()


In [None]:
# Placeholder values that stand in for a missing dog name.
names_to_drop = ["UNKNOWN", "NAME NOT PROVIDED"]

# Keep only the rows whose animal name is not one of those placeholders.
is_placeholder_name = df['animalname'].isin(names_to_drop)
df_cleaned = df[~is_placeholder_name]


In [None]:
def _lower_if_str(value):
    """Return `value` lower-cased when it is exactly a `str`, otherwise unchanged."""
    return value.lower() if type(value) is str else value


# Lower-case every string cell of the frame, then give the names an initial capital.
df_lowercase = df_cleaned.map(_lower_if_str)
df_lowercase["animalname"] = df_lowercase["animalname"].str.capitalize()


In [None]:
# Keep the first occurrence of each fully identical row.
# NOTE(review): identical rows are treated as duplicate records here - confirm
# that two distinct dogs cannot legitimately share every column value.
df_no_duplicates = df_lowercase[~df_lowercase.duplicated()]


In [None]:
# Calculate name lengths: number of characters in each name, 0 when the name is missing.
# `assign` builds a new frame instead of writing a column onto the result of
# `drop_duplicates()`, which can raise SettingWithCopyWarning on pandas versions
# without copy-on-write; `.str.len()` is the vectorised form of the per-row len().
df_no_duplicates = df_no_duplicates.assign(
    namelength=df_no_duplicates['animalname'].str.len().fillna(0).astype(int)
)

In [None]:
# Discard the licence-date and extract-year columns, which later cells do not use.
unused_columns = ["licenseexpireddate", "licenseissueddate", "extract_year"]
new_df = df_no_duplicates.drop(columns=unused_columns)


In [None]:
# Trim surrounding whitespace from the breed names, then lower-case them.
breedname_trimmed = new_df['breedname'].str.strip()
new_df['breedname'] = breedname_trimmed.str.lower()

In [None]:
# Map every licence's breed name to a breed group with the project helper
# `dd.breed_groups` (presumably returns None/NaN for breeds it does not
# recognise - the merge cell later falls back to the raw breed name for those).
# The breed table itself is fetched once, in the following cell, so it is not
# requested here as well.
new_df['breedgroup'] = new_df['breedname'].apply(dd.breed_groups)

In [None]:

# Fetch the breed reference table through the project helper module.
# NOTE(review): presumably this performs a network request to a dog-breed API
# and returns a DataFrame with one row per breed - confirm against dogfunctions.
df_breeds = dd.get_breed_data()


In [None]:
# Normalise the breed-table headers: lower-case, with spaces and underscores removed.
df_breeds.columns = [
    header.replace(" ", "").replace("_", "").lower()
    for header in df_breeds.columns
]

# Count the characteristics found in the breed table and hand the result to
# `dd.add_columns` (presumably adds the trait columns to df_breeds in place,
# since its return value is not used - confirm against dogfunctions).
result_characteristics = dd.count_characteristics(df_breeds)
dd.add_columns(df_breeds, result_characteristics)

# Group breeds together using the same helper that is applied to the licence data.
df_breeds['breedgroup'] = df_breeds['breedname'].apply(dd.breed_groups)

# Breeds without a group keep their own breed name as the group.
has_no_group = df_breeds['breedgroup'].isna()
df_breeds.loc[has_no_group, 'breedgroup'] = df_breeds['breedname']

# Keep one row (the first) per breed group.
df_breeds = df_breeds.drop_duplicates(subset="breedgroup")



In [None]:
# Merge data sources: attach the breed-table attributes to every licence row,
# matching on the breed group (left join, so all licence rows are kept).
merged_df = pd.merge(new_df, df_breeds, on='breedgroup', how='left')

# Licence rows without a breed group fall back to their own breed name.
# NOTE(review): this fallback runs after the merge, so those rows cannot pick up
# breed-table attributes through the fallback value - confirm this is intended.
group_missing = merged_df['breedgroup'].isna()
merged_df.loc[group_missing, 'breedgroup'] = merged_df['breedname_x']

# Drop the breed-table copy of the breed name and switch to snake_case headers.
merged_df = (
    merged_df
    .drop(columns=["breedname_y"])
    .rename(columns={'animalname': 'animal_name',
                     'animalgender': 'animal_gender',
                     'animalbirthyear': 'animal_birthyear',
                     'breedname_x': 'breed_name',
                     'namelength': 'name_length',
                     'breedgroup': 'breed_group',
                     'bredfor': 'bred_for'
                     })
)

# Adding data for Chihuahua and Dachshund
# Hand-written attribute values that overwrite whatever the merge produced for
# these two breed groups.
dog_data = {
    'chihuahua': {
        'breed_group': 'chihuahua',
        'lifespan': '12-20 years',
        'temperament': 'Devoted, Lively, Alert',
        'weight': '1-6 pounds (0.5-2.7 kg)',
        'height': '5-8 inches (13-20 cm)',
        'bred_for': 'Companionship',
        'origin': 'Mexico',
        'intelligent': True,
        'affectionate': True,
        'alert': True,
        'friendly': True,
        'loyal': True
    },
    'dachshund': {
        'breed_group': 'dachshund',
        'lifespan': '12-16 years',
        'temperament': 'Clever, Stubborn, Devoted',
        'weight': '16-32 pounds (7-14.5 kg)',
        'height': '8-9 inches (20-23 cm)',
        'bred_for': 'Scent hound, hunting badgers',
        'origin': 'Germany',
        'intelligent': True,
        'affectionate': True,
        'alert': True,
        'friendly': False,
        'loyal': True
    }
}

# Update the DataFrame
# NOTE(review): the trait values are booleans; if the trait columns from the
# breed table use another dtype, the assignment may upcast them - confirm.
for breed, data in dog_data.items():
    # The row mask is computed once per breed; the writes below never change
    # which rows belong to the breed.
    breed_rows = merged_df['breed_group'] == breed
    if not breed_rows.any():
        # Breed not present in the data: nothing to overwrite.
        continue
    for column, value in data.items():
        merged_df.loc[breed_rows, column] = value

# Capitalize names
merged_df['animal_name'] = merged_df['animal_name'].str.capitalize()

# Drop the descriptive columns that the remaining analysis does not use.
merged_df = merged_df.drop(
    columns=['lifespan', 'temperament', 'height', 'bred_for', 'origin', 'zipcode']
)








In [None]:
# Count gender distribution: number of rows per `animal_gender` value,
# ordered from most to least frequent (value_counts default ordering).
total_gender=merged_df['animal_gender'].value_counts()


In [None]:

# Pie chart of the share of dogs per gender value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    total_gender,
    labels=total_gender.index,
    autopct='%1.1f%%',
    colors=plt.cm.tab20.colors,
)
ax.set_title('Total Dogs by gender')
plt.show()

In [None]:
# Exclude the rows whose breed group is the literal 'unknown'.
merged_df = merged_df[merged_df['breed_group'].ne('unknown')]

In [None]:
# The ten breed groups with the most rows.
top_10_breeds = merged_df['breed_group'].value_counts().nlargest(10)

# Bar chart of those counts on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
top_10_breeds.plot(kind='bar', color='skyblue', ax=ax)

ax.set_title('Top 10 Dog Breeds', fontsize=16)
ax.set_xlabel('Breed', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()

plt.show()

In [None]:
# Gender split within the five most common breed groups.
top_5_breeds = merged_df['breed_group'].value_counts().nlargest(5).index

# Restrict the data to those five breed groups.
in_top_5 = merged_df['breed_group'].isin(top_5_breeds)
top_5_breeds_df = merged_df[in_top_5]

# One row per (breed group, gender) pair with its number of dogs.
breed_gender_counts = (
    top_5_breeds_df
    .groupby(['breed_group', 'animal_gender'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: one cluster per breed, one bar per gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=breed_gender_counts,
    x='breed_group',
    y='Count',
    hue='animal_gender',
    palette='Set2',
    ax=ax,
)

ax.set_title('Gender Distribution in Top 5 Dog Breeds', fontsize=16)
ax.set_xlabel('Breed', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()

plt.show()

In [None]:
# Name values that should not be treated as real names.
unwanted_names = ["Unknow","Name","A",".","Q","M","B"]

# Keep the rows that have a name and whose name is not in the unwanted list.
name_is_present = merged_df['animal_name'].notna()
name_is_unwanted = merged_df['animal_name'].isin(unwanted_names)
merged_df = merged_df[name_is_present & ~name_is_unwanted]

In [None]:
# Count occurrences of each dog name within each breed group.
breed_name_counts = merged_df.groupby(['breed_group', 'animal_name']).size().reset_index(name='Count')

# Identify the top 5 breed groups by total number of dogs.
top_5_breeds = breed_name_counts.groupby('breed_group')['Count'].sum().nlargest(5).index

# Keep only the name counts that belong to those five breed groups.
top_5_breed_data = breed_name_counts[breed_name_counts['breed_group'].isin(top_5_breeds)]

# For each breed group keep its 3 most frequent names. A stable descending sort
# followed by groupby().head(3) keeps the `breed_group` column in the result
# without calling groupby.apply on the grouping column (deprecated behaviour);
# ties keep their original relative order.
top_3_names_per_breed = (
    top_5_breed_data
    .sort_values('Count', ascending=False, kind='stable')
    .groupby('breed_group')
    .head(3)
    .reset_index(drop=True)
)

# Pivot to one row per breed group and one column per name; names that are not
# in a breed's top 3 get a count of 0.
stacked_data = top_3_names_per_breed.pivot(index='breed_group', columns='animal_name', values='Count').fillna(0)

# Plotting: create the figure explicitly and pass its axes to pandas, otherwise
# DataFrame.plot opens its own figure and the requested size is not applied.
fig, ax = plt.subplots(figsize=(14, 10))
stacked_data.plot(kind='barh', stacked=True, color=sns.color_palette('Set2'), ax=ax)

# Annotate each coloured segment with its dog name, centred within the segment.
for bar_position, (_, name_counts) in enumerate(stacked_data.iterrows()):
    cumulative_width = 0  # running x-offset of the segments already placed on this bar
    for name, count in name_counts.items():
        if count > 0:  # only segments that are actually drawn
            ax.text(cumulative_width + count / 2, bar_position, name,
                    ha='center', va='center', color='white', fontsize=10)
            cumulative_width += count

ax.set_title('Top 3 Dog Names for the Top 5 Dog Breeds ', fontsize=18, weight='bold')
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Dog Breeds', fontsize=12)
ax.legend(title='Dog Names')
fig.tight_layout()
plt.show()

In [None]:
# Average name length per breed group, as a flat two-column frame.
breed_name_length_avg = merged_df.groupby('breed_group', as_index=False)['name_length'].mean()



In [None]:
# The ten most frequent dog names.
top_10_names = merged_df['animal_name'].value_counts().nlargest(10)

# Bar chart of those counts on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
top_10_names.plot(kind='bar', color='skyblue', ax=ax)

ax.set_title('Top 10 Dog Names', fontsize=16)
ax.set_xlabel('Name', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()

plt.show()

In [None]:
# The ten most common name lengths (in characters) and how many dogs have each.
top_10_name_length = merged_df['name_length'].value_counts().nlargest(10)


plt.figure(figsize=(10,6))
top_10_name_length.plot(kind='bar', color='skyblue')


plt.title('Top 10 Dog name length', fontsize=16)
# The x-axis carries name lengths, not names.
plt.xlabel('Name length (characters)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()


plt.show()

In [None]:
# Analyse personality traits: keep only the name and the five trait columns.
trait_columns = ['intelligent', 'affectionate', 'alert', 'friendly', 'loyal']
new_merged_df = merged_df[["animal_name", *trait_columns]]

# Discard the rows in which every one of the trait columns is missing.
cleaned_df = new_merged_df.dropna(subset=trait_columns, how='all')

In [None]:
# The ten most frequent names among the rows that have trait data.
top_names = cleaned_df['animal_name'].value_counts().head(10)

# Restrict the trait data to those names and show the per-name trait totals.
has_popular_name = cleaned_df['animal_name'].isin(top_names.index)
df_popular_names = cleaned_df[has_popular_name]
df_popular_names.groupby('animal_name').sum()

In [None]:
# Bar chart of the 'intelligent' trait for each popular name (seaborn's default
# aggregation per name).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_popular_names, x='animal_name', y='intelligent', ax=ax)
ax.set_title('Intelligent Trait by Dog Name')
plt.show()

In [None]:
# Bar chart of the 'friendly' trait for each popular name (seaborn's default
# aggregation per name).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_popular_names, x='animal_name', y='friendly', ax=ax)
ax.set_title('Friendly Trait by Dog Name')
plt.show()

In [None]:
# Bar chart of the 'loyal' trait for each popular name (seaborn's default
# aggregation per name).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_popular_names, x='animal_name', y='loyal', ax=ax)
ax.set_title('Loyal Trait by Dog Name')
plt.show()

In [None]:
# Mean of every trait column for each popular name.
name_traits = df_popular_names.groupby('animal_name').mean()

# Stacked bars: one bar per name, one segment per trait.
ax = name_traits.plot.bar(stacked=True, figsize=(12, 8))
ax.set_title('Distribution of Traits by Dog Name')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Mean of every trait column for each popular name.
name_traits = df_popular_names.groupby('animal_name').mean()

# Get the color palette from Seaborn
colors = sns.color_palette('Set2')

# Stacked bar chart with one segment per trait. DataFrame.plot opens its own
# figure sized by `figsize`, so no separate plt.figure() call is made here
# (that would only leave an empty extra figure in the output).
ax = name_traits.plot(kind='bar', stacked=True, figsize=(12, 8), color=colors)

# Add title and rotate x-axis labels for better readability
ax.set_title('Distribution of Traits by Dog Name')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()  # Adjust layout to fit everything
plt.show()

In [None]:
# Reshape to long form: one row per (name, trait) pair with its value.
traits_long = pd.melt(
    df_popular_names,
    id_vars='animal_name',
    var_name='trait',
    value_name='presence',
)

# Keep only the pairs where the trait value equals 1.
df_melted = traits_long[traits_long['presence'] == 1]

# Count the rows per trait and name, with traits as rows and names as columns.
trait_name_sizes = df_melted.groupby(['trait', 'animal_name']).size()
name_per_trait = trait_name_sizes.unstack(fill_value=0)

# Stacked bar chart: one bar per trait, one segment per name.
ax = name_per_trait.plot.bar(stacked=True, figsize=(12, 8))
ax.set_title('Most Popular Dog Names per Personality Trait')
ax.set_xlabel('Personality Trait')
ax.set_ylabel('Count of Names')
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend outside the plotting area.
ax.legend(title="Dog Name", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:

# Traits per dog name: call the project helper once for 'Bella' and once for
# 'Charlie' on the popular-names trait frame.
# NOTE(review): presumably each call draws a radar chart of that name's trait
# values - confirm against dogfunctions.plot_radar.
dd.plot_radar(df_popular_names,'Bella')
dd.plot_radar(df_popular_names,'Charlie')

In [None]:

# NOTE(review): this rebinds `top_names` (a value_counts Series in the
# popular-names cell above) to a plain list, and the list is not passed to the
# call below - confirm whether dd.popularity_overtime was meant to receive it.
top_names = ["Bella", "Max", "Charlie", "Luna", "Coco"] 
# Hand the merged licence data to the project helper; presumably it plots name
# popularity over time - confirm against dogfunctions.
dd.popularity_overtime(merged_df)

In [None]:
# Remove invalid birth years: values that cannot be parsed as numbers become NaN.
# `assign` returns a new frame rather than writing a column onto the filtered
# `merged_df`, which can raise SettingWithCopyWarning.
merged_df = merged_df.assign(
    animal_birthyear=pd.to_numeric(merged_df['animal_birthyear'], errors='coerce')
)

# Copy restricted to the rows with a valid birth year, stored as integers.
only_with_birthyear_df = (
    merged_df
    .dropna(subset=['animal_birthyear'])
    .astype({'animal_birthyear': int})
)

# NOTE(review): presumably reports the names whose popularity rose the most
# across birth years - confirm against dogfunctions.
dd.top_increased_popularity(only_with_birthyear_df)

In [None]:
# Hand the merged licence data to the project helper.
# NOTE(review): presumably analyses names shared across genders - confirm
# against dogfunctions.gender_natural_names.
dd.gender_natural_names(merged_df)

In [None]:
# Hand the merged licence data to the project helper.
# NOTE(review): presumably relates dog names to the breed `weight` column kept
# in merged_df - confirm against dogfunctions.name_vs_weight.
dd.name_vs_weight(merged_df)