In [4]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import (
    OneHotEncoder, 
    StandardScaler, 
    OrdinalEncoder
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors


#LOAD THE DATA
df = pd.read_csv("akc-data-latest.csv")  
print("Data loaded. Shape:", df.shape)

# Check columns
print("Columns in the dataset:", df.columns.tolist())
df.head()

Data loaded. Shape: (277, 21)
Columns in the dataset: ['Unnamed: 0', 'description', 'temperament', 'popularity', 'min_height', 'max_height', 'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy', 'group', 'grooming_frequency_value', 'grooming_frequency_category', 'shedding_value', 'shedding_category', 'energy_level_value', 'energy_level_category', 'trainability_value', 'trainability_category', 'demeanor_value', 'demeanor_category']


Unnamed: 0.1,Unnamed: 0,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [5]:
# Compute average weight and height and expectancy
df["weight_avg"] = (df["min_weight"] + df["max_weight"]) / 2
df["height_avg"] = (df["min_height"] + df["max_height"]) / 2
df["life_expectancy_avg"] = (df["min_expectancy"] + df["max_expectancy"]) / 2

print(df[["min_weight", "max_weight", "weight_avg"]].head())

   min_weight  max_weight  weight_avg
0    3.175147    4.535924    3.855535
1   22.679619   27.215542   24.947580
2   22.679619   31.751466   27.215542
3   31.751466   58.967008   45.359237
4   34.019428   38.555351   36.287390


In [8]:
numeric_features = [
    "weight_avg", 
    "height_avg", 
    "life_expectancy_avg", 
    "popularity",                   
    "grooming_frequency_value",     
    "shedding_value",              
    "energy_level_value",           
    "trainability_value",           
    "demeanor_value"                
]

In [9]:
categorical_features = [
    "group",              
    "grooming_frequency_category",
    "shedding_category",
    "energy_level_category",
    "trainability_category",
    "demeanor_category",
    
    
]

In [14]:
#CLEAN MISSING DATA
all_features_needed = numeric_features + categorical_features
df_clean = df.dropna(subset=all_features_needed).copy()
df_clean = df_clean[df_clean["popularity"] != "of"]

In [None]:
for col in numeric_features:
    # Look for non-numeric entries in this column
    non_numeric_mask = pd.to_numeric(df_clean[col], errors="coerce").isna()
    if non_numeric_mask.any():
        print(f"Column '{col}' has non-numeric values:")
        print(df_clean.loc[non_numeric_mask, col].unique())

In [15]:
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

In [16]:
X = preprocessor.fit_transform(df_clean)

In [18]:
nn_model = NearestNeighbors(n_neighbors=5, metric="euclidean")
nn_model.fit(X)
print("Nearest Neighbors model is fitted.")

Nearest Neighbors model is fitted.


In [22]:
def recommend_dogs(preferences, k=5):
    """
    return the top-k nearest rows from df_clean.
    """
    
    # Turn 'preferences' into a single-row DataFrame then Fill in any missing features with a default or the mean/mode
    row_dict = {}
    for col in all_features_needed:
        if col in preferences:
            row_dict[col] = [preferences[col]]
        else:
           
            if col in numeric_features:
                row_dict[col] = [df_clean[col].mean()]
            else:
                # for categorical, pick the most frequent group
                row_dict[col] = [df_clean[col].mode()[0]]
    
    single_row_df = pd.DataFrame(row_dict)
    
    X_single = preprocessor.transform(single_row_df)
    
    #nearest neighbors
    distances, indices = nn_model.kneighbors(X_single, n_neighbors=k)
    

    recs = df_clean.iloc[indices[0]].copy()
    recs["KNN_distance"] = distances[0]
    return recs

Using

In [23]:
adopter_preferences = {
    "weight_avg": 20,                  # they'd like around 20 (units: lbs or kg depending on your dataset)
    "height_avg": 30,                  # they'd like around 30 cm/in
    "life_expectancy_avg": 12,         # they'd like a dog who lives ~12 years
    "popularity": 50,                  # just an example usage
    "grooming_frequency_value": 2,     # they'd prefer lower grooming needs
    "energy_level_value": 3,           # moderate energy
    "group": "Sporting"                # they want a sporting group dog
    
}

recommendations = recommend_dogs(adopter_preferences, k=30)
print("Recommended dogs for these preferences:")
print(recommendations)

Recommended dogs for these preferences:
                             Unnamed: 0  \
57                             Brittany   
46                        Border Collie   
136                        Irish Setter   
216                     Russell Terrier   
180  Nova Scotia Duck Tolling Retriever   
271         Wirehaired Pointing Griffon   
184              Parson Russell Terrier   
108                       Finnish Spitz   
120                    Golden Retriever   
122        Grand Basset Griffon Vendéen   
264                          Weimaraner   
28                            Beauceron   
121                       Gordon Setter   
138                 Irish Water Spaniel   
115          German Shorthaired Pointer   
129                             Harrier   
109               Flat-Coated Retriever   
203                Portuguese Water Dog   
124                      Great Pyrenees   
243                   Spanish Water Dog   
117           German Wirehaired Pointer   
153           