In [6]:
import pandas as pd
import os
import sqlite3

# Location of the raw AKC dog-traits CSV, relative to this notebook
file_path = os.path.join("..", "data", "raw", "akc_dog_traits.csv")

# Read the raw traits table into a DataFrame
df = pd.read_csv(file_path)

# Quick look at the table size and its first rows
print("Dataset loaded! Shape:", df.shape)
display(df.head())

# Column dtypes and non-null counts
print("\nDataset Info:")
df.info()

# Per-column count of missing entries
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Distinct values found in the columns at positions 1 through 5
print("\nSample Unique Trait Values:")
for col, values in df.iloc[:, 1:6].items():
    print(f"{col}: {values.unique()}")


Dataset loaded! Shape: (195, 17)


Unnamed: 0,Breed,Affectionate With Family,Good With Young Children,Good With Other Dogs,Shedding Level,Coat Grooming Frequency,Drooling Level,Coat Type,Coat Length,Openness To Strangers,Playfulness Level,Watchdog/Protective Nature,Adaptability Level,Trainability Level,Energy Level,Barking Level,Mental Stimulation Needs
0,Retrievers (Labrador),5,5,5,4,2,2,Double,Short,5,5,3,5,5,5,3,4
1,French Bulldogs,5,5,4,3,1,3,Smooth,Short,5,5,3,5,4,3,1,3
2,German Shepherd Dogs,5,5,3,4,2,2,Double,Medium,3,4,5,5,5,5,3,5
3,Retrievers (Golden),5,5,5,4,2,2,Double,Medium,5,4,3,5,5,3,1,4
4,Bulldogs,4,3,3,3,3,3,Smooth,Short,4,4,3,3,4,3,2,3



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Breed                       195 non-null    object
 1   Affectionate With Family    195 non-null    int64 
 2   Good With Young Children    195 non-null    int64 
 3   Good With Other Dogs        195 non-null    int64 
 4   Shedding Level              195 non-null    int64 
 5   Coat Grooming Frequency     195 non-null    int64 
 6   Drooling Level              195 non-null    int64 
 7   Coat Type                   195 non-null    object
 8   Coat Length                 195 non-null    object
 9   Openness To Strangers       195 non-null    int64 
 10  Playfulness Level           195 non-null    int64 
 11  Watchdog/Protective Nature  195 non-null    int64 
 12  Adaptability Level          195 non-null    int64 
 13  Trainability Level          195 non

In [7]:
def assign_dog_type(row):
    """Return the personality-type label for one breed row.

    `row` is indexed by trait name (a DataFrame row or any mapping works) and
    holds numeric trait scores. Rules are checked in the order listed below and
    the first rule whose conditions all hold decides the label; when none
    match, the row is labelled 'Balanced All-Rounder'.
    """
    # Ordered (label, predicate) pairs. Order matters: earlier rules take
    # priority. Each predicate keeps the `and` chain so traits are read lazily.
    rules = (
        (
            'Friendly Family Dog',
            lambda r: r['Affectionate With Family'] >= 4
            and r['Good With Young Children'] >= 4
            and r['Good With Other Dogs'] >= 4,
        ),
        (
            'Watchdog/Protector',
            lambda r: r['Watchdog/Protective Nature'] >= 4
            and r['Trainability Level'] >= 3,
        ),
        (
            'Energetic Adventurer',
            lambda r: r['Energy Level'] >= 4
            and r['Playfulness Level'] >= 4,
        ),
        (
            'Independent Thinker',
            lambda r: r['Affectionate With Family'] <= 3
            and r['Adaptability Level'] <= 3,
        ),
        (
            'Brainy Pup',
            lambda r: r['Trainability Level'] >= 4
            and r['Mental Stimulation Needs'] >= 3,
        ),
        (
            'Low-Key Companion',
            lambda r: r['Barking Level'] <= 2
            and r['Energy Level'] <= 2
            and r['Affectionate With Family'] >= 4,
        ),
    )

    for label, matches in rules:
        if matches(row):
            return label

    # No rule matched
    return 'Balanced All-Rounder'

# Label every breed by running the classifier over each row
dog_type_labels = df.apply(assign_dog_type, axis=1)
df['dog_type'] = dog_type_labels

# Show the first few breeds next to their assigned type
display(df.loc[:, ['Breed', 'dog_type']].head())

# Count how many breeds landed in each type
print("\n Dog Type Distribution:")
dog_type_counts = df['dog_type'].value_counts()
print(dog_type_counts)


Unnamed: 0,Breed,dog_type
0,Retrievers (Labrador),Friendly Family Dog
1,French Bulldogs,Friendly Family Dog
2,German Shepherd Dogs,Watchdog/Protector
3,Retrievers (Golden),Friendly Family Dog
4,Bulldogs,Brainy Pup



 Dog Type Distribution:
dog_type
Watchdog/Protector      92
Friendly Family Dog     42
Energetic Adventurer    19
Brainy Pup              18
Independent Thinker     12
Balanced All-Rounder    12
Name: count, dtype: int64


In [8]:
# OCEAN = Big Five personality traits:
# O = Openness (creativity, curiosity)
# C = Conscientiousness (organization, responsibility)
# E = Extraversion (sociability, assertiveness)
# A = Agreeableness (kindness, cooperation)
# N = Neuroticism (emotional sensitivity, anxiety)

# Average OCEAN scores for each dog_type on a 1–5 scale.
# Each list is ordered [O, C, E, A, N] to match the column names used below.
dog_type_traits = {
    'Friendly Family Dog':     [3, 3, 4, 5, 2],
    'Watchdog/Protector':      [2, 4, 2, 2, 4],
    'Energetic Adventurer':    [4, 2, 5, 3, 3],
    'Brainy Pup':              [5, 4, 3, 3, 2],
    'Independent Thinker':     [3, 2, 1, 1, 2],
    'Balanced All-Rounder':    [3, 3, 3, 3, 3]
}

# Guard: assign_dog_type can also return 'Low-Key Companion', which has no
# profile in the table above. Fail loudly if any breed currently carries a
# label without a profile, rather than letting it silently go unmatched later.
unprofiled_types = sorted(set(df['dog_type']) - set(dog_type_traits))
assert not unprofiled_types, (
    f"No OCEAN profile defined for dog types: {unprofiled_types}"
)

# Turn the mapping into a table: one row per dog_type, one column per trait.
# The dictionary keys become the index, which is then moved into a regular
# 'dog_type' column.
dog_type_df = pd.DataFrame.from_dict(
    dog_type_traits,
    orient='index',
    columns=['O', 'C', 'E', 'A', 'N']
).reset_index().rename(columns={'index': 'dog_type'})

# Preview the result
display(dog_type_df)



Unnamed: 0,dog_type,O,C,E,A,N
0,Friendly Family Dog,3,3,4,5,2
1,Watchdog/Protector,2,4,2,2,4
2,Energetic Adventurer,4,2,5,3,3
3,Brainy Pup,5,4,3,3,2
4,Independent Thinker,3,2,1,1,2
5,Balanced All-Rounder,3,3,3,3,3


In [9]:
# Save the dog type trait profiles.
# The path is built with os.path.join, matching the other paths in this
# notebook, and the output folder is created first so the write does not fail
# on a fresh checkout where data/processed does not exist yet.
traits_csv_path = os.path.join("..", "data", "processed", "dog_type_traits.csv")
os.makedirs(os.path.dirname(traits_csv_path), exist_ok=True)
dog_type_df.to_csv(traits_csv_path, index=False)

print("Saved to data/processed/dog_type_traits.csv")


Saved to data/processed/dog_type_traits.csv


Saving to SQLite Database

In [10]:

# Path to SQLite database
db_path = os.path.join("..", "data", "processed", "dog_personality.db")

# sqlite3.connect cannot create missing parent folders, so make sure the
# output directory exists before opening the database file.
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# Connect and save to SQL. The breed table (including the dog_type column)
# replaces any existing "akc_breeds" table. try/finally guarantees the
# connection is closed even if the write raises.
conn = sqlite3.connect(db_path)
try:
    df.to_sql("akc_breeds", conn, if_exists="replace", index=False)
finally:
    conn.close()

print("Cleaned AKC dog breed traits saved to SQLite as 'akc_breeds'")

Cleaned AKC dog breed traits saved to SQLite as 'akc_breeds'
