In [1]:
# import necessary modules
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import classification_report, roc_auc_score, r2_score, mean_absolute_error

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import hstack
from model_components import DenseCT, TextFeaturizer

In [13]:
# read dataset
df = pd.read_csv("dataset/processed/synthetic-dog-breed-health-data-clean.csv")

# preprocess some fields
# Binary health flag: "yes" -> 1, "no" -> 0 (case-insensitive). Labels are
# stripped first so padded values such as "Yes " are not silently mapped to
# NaN; any other value (or a missing one) still becomes NaN.
df["Healthy_bin"] = df["Healthy"].str.strip().str.lower().map({"yes": 1, "no": 0})

In [3]:
# 3) Combine text fields into a single field
def combine_text(row):
    """Join the non-empty free-text fields of one record into a single string.

    Reads the "Medications" and "Seizures" entries of ``row`` (anything that
    offers ``.get``, e.g. a DataFrame row or a dict), skips missing values,
    strips surrounding whitespace, drops entries that end up empty, and joins
    what is left with " | ". Returns "" when nothing usable remains.
    """
    stripped = (
        str(value).strip()
        for value in (row.get(column, None) for column in ("Medications", "Seizures"))
        if pd.notna(value)
    )
    return " | ".join(text for text in stripped if text)

In [4]:
# build the 'notes' column by running combine_text over every row
df["notes"] = df.apply(combine_text, axis=1)

# make sure a 'user_text' column exists (empty strings when the dataset has none)
if "user_text" not in df.columns:
    df["user_text"] = ""

# numeric feature columns shared by every task
common_num = [
    "Age",
    "Weight (lbs)",
    "Daily Walk Distance (miles)",
    "Hours of Sleep",
    "Annual Vet Visits",
    "Average Temperature (F)",
]

# categorical feature columns shared by every task; a candidate is kept only
# when the dataset actually has that column
common_cat = [
    name
    for name in (
        "Breed",
        "Breed Size",
        "Sex",
        "Spay/Neuter Status",
        "Owner Activity Level",
        "Other Pets in Household",
    )
    if name in df.columns
]

# define transformers

# numeric columns: fill gaps with the column median, then standardize
num_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time and
# min_frequency=5 is passed to the encoder for rare categories
cat_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=5)),
    ]
)

In [5]:

# make preprocessor function
def make_preprocessor(num_cols, cat_cols):
    """Create the (tabular, text) preprocessor pair for one task.

    Parameters
    ----------
    num_cols : iterable of str
        Candidate numeric columns; those missing from the module-level ``df``
        are skipped.
    cat_cols : iterable of str
        Candidate categorical columns; filtered against ``df`` the same way.

    Returns
    -------
    tuple
        ``(ColumnTransformer, TfidfVectorizer)``, both unfitted. The
        ColumnTransformer routes numeric columns through ``num_tf`` and
        categorical columns through ``cat_tf`` and drops every other column.
        The vectorizer uses unigrams and bigrams, capped at 20000 features.
    """
    available = df.columns
    numeric_present = [col for col in num_cols if col in available]
    categorical_present = [col for col in cat_cols if col in available]

    tabular = ColumnTransformer(
        transformers=[
            ("num", num_tf, numeric_present),
            ("cat", cat_tf, categorical_present),
        ],
        remainder="drop",
    )
    text_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
    return tabular, text_vectorizer

In [6]:
# Task 1: Daily Activity Level (multi-class classification)
X = df.copy()
# Free-text input = `notes` plus the optional `user_text` column. DataFrame.get
# needs a Series fallback here: a plain "" default has no .fillna(), so a frame
# without `user_text` would raise AttributeError.
X["__joined_text__"] = (
    X["notes"].fillna("")
    + " "
    + X.get("user_text", pd.Series("", index=X.index)).fillna("")
)
y_act = X["Daily Activity Level"].astype(str)

pre_ct, tfidf = make_preprocessor(common_num, common_cat)
# DenseCT / TextFeaturizer are project wrappers from model_components;
# presumably TextFeaturizer reads `__joined_text__` — confirm there.
ct_feat   = DenseCT(pre_ct)
text_feat = TextFeaturizer(tfidf)

# Stratified 80/20 split; both featurizers are fitted on the training part only.
X_train, X_val, y_train, y_val = train_test_split(X, y_act, test_size=0.2, stratify=y_act, random_state=42)
ct_feat.fit(X_train, y_train)
text_feat.fit(X_train, y_train)

le_act = LabelEncoder().fit(y_train)
# Tabular and text features are stacked side by side into one matrix.
Xtr = hstack([ct_feat.transform(X_train), text_feat.transform(X_train)])
Xva = hstack([ct_feat.transform(X_val),   text_feat.transform(X_val)])

# saga shuffles the data while fitting, so it is seeded to keep the reported
# metrics reproducible across runs.
clf_act = LogisticRegression(max_iter=2000, class_weight="balanced", solver="saga", random_state=42)
clf_act.fit(Xtr, le_act.transform(y_train))
pred_act = le_act.inverse_transform(clf_act.predict(Xva))
print("=== Activity ===")
print(classification_report(y_val, pred_act, zero_division=0))

=== Activity ===
              precision    recall  f1-score   support

      Active       0.33      0.32      0.33       121
         Low       0.26      0.22      0.24       121
    Moderate       0.25      0.22      0.23       114
 Very Active       0.26      0.34      0.29       110

    accuracy                           0.27       466
   macro avg       0.27      0.28      0.27       466
weighted avg       0.27      0.27      0.27       466



In [7]:
# Task 2: Diet (multi-class classification)
X2 = df.copy()
# Free-text input = `notes` plus the optional `user_text` column. DataFrame.get
# needs a Series fallback here: a plain "" default has no .fillna(), so a frame
# without `user_text` would raise AttributeError.
X2["__joined_text__"] = (
    X2["notes"].fillna("")
    + " "
    + X2.get("user_text", pd.Series("", index=X2.index)).fillna("")
)
y_diet = X2["Diet"].astype(str)

# The target column is excluded from the categorical features as a safeguard.
ct2, tfidf2 = make_preprocessor(common_num, [c for c in common_cat if c != "Diet"])
ct2_feat = DenseCT(ct2)
txt2_feat = TextFeaturizer(tfidf2)

# Stratified 80/20 split; both featurizers are fitted on the training part only.
X2_tr, X2_va, y2_tr, y2_va = train_test_split(X2, y_diet, test_size=0.2, stratify=y_diet, random_state=42)
ct2_feat.fit(X2_tr, y2_tr)
txt2_feat.fit(X2_tr, y2_tr)

le_diet = LabelEncoder().fit(y2_tr)
# Tabular and text features are stacked side by side into one matrix.
X2tr = hstack([ct2_feat.transform(X2_tr), txt2_feat.transform(X2_tr)])
X2va = hstack([ct2_feat.transform(X2_va), txt2_feat.transform(X2_va)])

# saga shuffles the data while fitting, so it is seeded to keep the reported
# metrics reproducible across runs.
clf_diet = LogisticRegression(
    max_iter=2000, class_weight="balanced", solver="saga", random_state=42
).fit(X2tr, le_diet.transform(y2_tr))
pred_diet = le_diet.inverse_transform(clf_diet.predict(X2va))
print("=== Diet ===")
print(classification_report(y2_va, pred_diet, zero_division=0))

=== Diet ===
              precision    recall  f1-score   support

   Hard food       0.23      0.25      0.24       111
 Home cooked       0.21      0.21      0.21       118
Special diet       0.22      0.29      0.25       111
    Wet food       0.29      0.19      0.23       126

    accuracy                           0.23       466
   macro avg       0.24      0.24      0.23       466
weighted avg       0.24      0.23      0.23       466



In [8]:
# Task 3: Play Time (hrs) (regression)
y_play = df["Play Time (hrs)"].astype(float)
X3 = df.copy()
# Free-text input = `notes` plus the optional `user_text` column. DataFrame.get
# needs a Series fallback here: a plain "" default has no .fillna(), so a frame
# without `user_text` would raise AttributeError.
X3["__joined_text__"] = (
    X3["notes"].fillna("")
    + " "
    + X3.get("user_text", pd.Series("", index=X3.index)).fillna("")
)

ct3, tfidf3 = make_preprocessor(common_num, common_cat)
ct3_feat = DenseCT(ct3)
txt3_feat = TextFeaturizer(tfidf3)

# Plain (unstratified) 80/20 split; both featurizers are fitted on the
# training part only.
X3_tr, X3_va, y3_tr, y3_va = train_test_split(X3, y_play, test_size=0.2, random_state=42)
ct3_feat.fit(X3_tr, y3_tr)
txt3_feat.fit(X3_tr, y3_tr)

X3tr_sparse = hstack([ct3_feat.transform(X3_tr), txt3_feat.transform(X3_tr)]).tocsr()
X3va_sparse = hstack([ct3_feat.transform(X3_va), txt3_feat.transform(X3_va)]).tocsr()

# The stacked matrices are densified before being handed to the regressor.
# random_state pins the estimator's internal randomness (binning subsample,
# early-stopping split) so repeated runs give the same model.
reg_play = HistGradientBoostingRegressor(random_state=42).fit(X3tr_sparse.toarray(), y3_tr)
y3_pred = reg_play.predict(X3va_sparse.toarray())

print("=== Play Time ===")
print("R2=", r2_score(y3_va, y3_pred), "MAE=", mean_absolute_error(y3_va, y3_pred))

=== Play Time ===
R2= -0.1375381216701299 MAE= 0.9031197343148666


In [9]:
# Task 4: Healthy (binary classification)
# NOTE: astype(int) raises if `Healthy_bin` contains NaN, i.e. if any `Healthy`
# value was not mapped to 0/1 when the column was created.
y_h = df["Healthy_bin"].astype(int)
X4 = df.copy()
# Free-text input = `notes` plus the optional `user_text` column. DataFrame.get
# needs a Series fallback here: a plain "" default has no .fillna(), so a frame
# without `user_text` would raise AttributeError.
X4["__joined_text__"] = (
    X4["notes"].fillna("")
    + " "
    + X4.get("user_text", pd.Series("", index=X4.index)).fillna("")
)

ct4, tfidf4 = make_preprocessor(common_num, common_cat)
ct4_feat = DenseCT(ct4)
txt4_feat = TextFeaturizer(tfidf4)

# Stratified 80/20 split; both featurizers are fitted on the training part only.
X4_tr, X4_va, y4_tr, y4_va = train_test_split(X4, y_h, test_size=0.2, stratify=y_h, random_state=42)
ct4_feat.fit(X4_tr, y4_tr)
txt4_feat.fit(X4_tr, y4_tr)

# saga shuffles the data while fitting, so it is seeded to keep the reported
# metric reproducible across runs.
clf_h = LogisticRegression(max_iter=2000, class_weight="balanced", solver="saga", random_state=42)
clf_h.fit(hstack([ct4_feat.transform(X4_tr), txt4_feat.transform(X4_tr)]), y4_tr)
# Column 1 of predict_proba is the probability of class 1 (healthy).
proba_h = clf_h.predict_proba(hstack([ct4_feat.transform(X4_va), txt4_feat.transform(X4_va)]))[:, 1]
print("=== Healthy ===")
print("ROC-AUC=", roc_auc_score(y4_va, proba_h))

=== Healthy ===
ROC-AUC= 0.8526914300677065


In [11]:
# save all components
# One entry per task, each holding that task's fitted featurizers and model
# (plus the label encoder for the two multi-class tasks).
bundle = {}
bundle["activity"] = {
    "ct": ct_feat,
    "tfidf": text_feat,
    "clf": clf_act,
    "label_encoder": le_act,
}
bundle["diet"] = {
    "ct": ct2_feat,
    "tfidf": txt2_feat,
    "clf": clf_diet,
    "label_encoder": le_diet,
}
bundle["play"] = {
    "ct": ct3_feat,
    "tfidf": txt3_feat,
    "reg": reg_play,
}
bundle["healthy"] = {
    "ct": ct4_feat,
    "tfidf": txt4_feat,
    "clf": clf_h,
}

# Everything is persisted as a single joblib file.
joblib.dump(bundle, "wooftalk_multi_tasks.joblib")
print("Saved -> wooftalk_multi_tasks.joblib")

Saved -> wooftalk_multi_tasks.joblib
