In [66]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
# Load the raw breed table and expose its first column under the name 'Breed'
# (later cells look breeds up through df['Breed']).
df = pd.read_csv("dogdata.csv")
df['Breed'] = df.iloc[:, 0]

# Keep only rows that have a breed, a description and a temperament. The explicit
# copy makes df1 an independent frame, so adding a column below never writes
# into (or warns about) a slice of df.
df1 = df[['Breed', 'description', 'temperament']].dropna().copy()

# Build the text that gets vectorised. Both operands come from the filtered
# frame df1; previously the temperament was read from the unfiltered df and the
# result only matched up through implicit index alignment.
df1['combined'] = df1['description'] + ' ' + df1['temperament']

# Two-column frame (breed name, combined text) used by the rest of the notebook.
breed_desc_data = df1[['Breed', 'combined']]

In [85]:
# Tokenizer for the TF-IDF vectorizer: strips punctuation, then splits on whitespace
def custom_tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` with punctuation removed.

    Every character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted before splitting, so "it's" becomes the single token
    "its". Case is left untouched.
    """
    punctuation = re.compile(r'[^\w\s]')
    without_punctuation = punctuation.sub('', text)
    tokens = without_punctuation.split()
    return tokens

# Fit a TF-IDF model on the combined breed texts, splitting words with
# custom_tokenizer. token_pattern=None states explicitly that the default regex
# tokenizer is not in use, which avoids scikit-learn's "token_pattern will not
# be used since tokenizer is not None" warning.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
    stop_words="english",
)
# One row per breed text, one column per vocabulary term: (m breeds x n terms)
tfidf_matrix = vectorizer.fit_transform(breed_desc_data['combined'])

def _sparsify_weights(weights, top_k=4, min_weight=0.1):
    """Keep the ``top_k`` largest weights, drop those below ``min_weight``, renormalise.

    Works on a copy, so the caller's array (e.g. the optimiser's ``result.x``)
    is never modified. After each filtering step the surviving weights are
    rescaled to sum to 1, unless everything was filtered out, in which case
    the all-zero vector is returned.
    """
    w = np.array(weights, dtype=float)  # independent copy, never an alias

    # Step 1: zero everything except the top_k largest entries, then rescale.
    keep = np.zeros(w.shape, dtype=bool)
    keep[np.argsort(w)[-top_k:]] = True
    w[~keep] = 0
    total = np.sum(w)
    if total > 0:
        w /= total

    # Step 2: discard weights that are still too small, then rescale again.
    w[w < min_weight] = 0
    total = np.sum(w)
    if total > 0:
        w /= total
    return w


def estimate_dog_breedtfidf(tfidf_matrix, y, top_k=4, min_weight=0.1, seed=None):
    """Estimate a breed mixture for the free-text description ``y``.

    The description is vectorised with the module-level ``vectorizer`` and
    approximated as a convex combination of the breed TF-IDF vectors: weights
    ``w`` (each in [0, 1], summing to 1) are chosen with SLSQP to minimise the
    sum of squared errors between ``X @ w`` and the description vector. The
    raw weights are then reduced to at most ``top_k`` breeds, weights below
    ``min_weight`` are dropped, and the rest is renormalised.

    Parameters
    ----------
    tfidf_matrix : sparse matrix, shape (n_breeds, n_terms)
        TF-IDF matrix of the breed texts (one row per breed).
    y : str
        Description of the dog to classify.
    top_k : int, default 4
        Maximum number of breeds kept in the mixture. Must be >= 1.
    min_weight : float, default 0.1
        Weights below this value (after the top-k renormalisation) are dropped.
    seed : int or None, default None
        Seed for the random starting point. ``None`` keeps the original
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    numpy.ndarray, shape (n_breeds,)
        Mixture weights; the non-zero entries sum to 1. All zeros if every
        weight was filtered out.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Vectorise the description with the already-fitted vocabulary.
    target_vector = vectorizer.transform([y])  # (1 x n_terms)

    # Dense arrays for the optimiser. Note the transpose: columns are breeds.
    X = tfidf_matrix.toarray().T  # (n_terms x n_breeds)
    v_x = target_vector.toarray().flatten()  # (n_terms,)
    n_breeds = X.shape[1]

    # Loss to minimise: sum of squared errors between the mixture and the target.
    def lossSSE(w):
        residual = X @ w - v_x
        return (residual ** 2).sum()

    # Equality constraint: the weights must sum to 1.
    constraint = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

    # Random starting point in [0, 1), normalised so it satisfies the constraint.
    if seed is None:
        w0 = np.random.uniform(0, 1, size=n_breeds)
    else:
        w0 = np.random.default_rng(seed).uniform(0, 1, size=n_breeds)
    w0 /= np.sum(w0)

    # Every weight is bounded to [0, 1].
    bounds = [(0, 1) for _ in range(n_breeds)]

    # SLSQP supports both the bounds and the equality constraint.
    result = minimize(lossSSE, w0, method='SLSQP', bounds=bounds, constraints=[constraint])

    # Post-process a copy of the solution. The original code got the same
    # top-k-then-threshold result only because it mutated result.x in place.
    return _sparsify_weights(result.x, top_k=top_k, min_weight=min_weight)

# Defining some functions that allow us to analyse and display results:

def get_breed0(w_adjusted):
    """Return the index of the only non-zero weight, or None.

    None is returned both when no weight is non-zero and when more than one is,
    i.e. whenever the estimate is not a single pure breed.
    """
    nonzero_positions = [pos for pos, weight in enumerate(w_adjusted) if weight != 0]
    return nonzero_positions[0] if len(nonzero_positions) == 1 else None

def get_breeds(w_adjusted):
    """Return the list of indices whose weight is non-zero, or None if there are none."""
    nonzero_positions = [pos for pos, weight in enumerate(w_adjusted) if weight != 0]
    # An empty list is falsy, so this yields None when nothing is non-zero.
    return nonzero_positions or None
    
def print_breed_guess(w_adjusted):
    """Print the estimated breed mixture and the care profile of the dominant breed.

    Parameters
    ----------
    w_adjusted : sequence of float
        Mixture weights, one per row of the module-level ``breed_desc_data``.
        Non-zero entries are reported as percentages.

    If no weight is non-zero, a short "unable to determine" message is printed
    and nothing else happens. Otherwise each breed is listed with its
    percentage, followed by selected category columns of the breed with the
    largest percentage, looked up in the module-level ``df``.
    """
    nonzero = [i for i in range(len(w_adjusted)) if w_adjusted[i] != 0]

    # Handle the empty case first: np.argmax raises ValueError on an empty
    # sequence, which used to make this message unreachable.
    if not nonzero:
        print("We were unable to determine a breed mixture for your dog")
        return

    percents = [round(w_adjusted[i] * 100, 0) for i in nonzero]
    breeds = [breed_desc_data.iloc[i, 0] for i in nonzero]

    print('We estimate your dog to be:')
    for percent, breed in zip(percents, breeds):
        print(percent, '% ', breed)

    # The dominant breed is the one with the largest (rounded) percentage;
    # ties resolve to the first such breed.
    major_breed = breeds[np.argmax(percents)]
    print(f'The {major_breed} has the following traits and demands:')
    row = df[df['Breed'] == major_breed]
    metrics = ['grooming_frequency_category','shedding_category','energy_level_category','trainability_category','demeanor_category']
    for metric in metrics:
        print(f'{metric} = {row[metric].iloc[0]}')

### Tests

In [None]:
# Drop the label column so only the text is handed to the estimator.
data = breed_desc_data.drop(columns=['Breed'])

# Estimate once and reuse the result. The optimiser starts from a random point,
# so running it separately for the assertion and for the printout could display
# a different mixture from the one that was checked, and doubles the runtime.
first_breed_weights = estimate_dog_breedtfidf(tfidf_matrix, data.iloc[0]['combined'])
assert get_breed0(first_breed_weights) == 0, "Should be classed as breed 0, i.e Affenpinscher"
print_breed_guess(first_breed_weights)

We estimate your dog to be:
100.0 %  Affenpinscher


### Taking User Inputs

In [88]:
# Ask for a free-text description, estimate the breed mixture, and report it.
y = input(
    'Please describe the dog as best as you can. This may include anything from the dogs physical appearance to the dogs percieved temperament or character...'
)
user_breed_weights = estimate_dog_breedtfidf(tfidf_matrix, y)
print_breed_guess(user_breed_weights)

Please describe the dog as best as you can. This may include anything from the dogs physical appearance to the dogs percieved temperament or character... playful boy who is eager to find a loving and patient home where he can thrive. He enjoys attention but isn't always used to it, and sometimes can find all the fuss a bit overwhelming. Ted will need a home where his new family understands his need for gentle interaction and patience as he adjusts.  Ted will need a private secure garden so he can have regular off lead exercise.


We estimate your dog to be:
17.0 %  Bulldog
34.0 %  Chinook
30.0 %  German Wirehaired Pointer
20.0 %  Stabyhoun
The Chinook has the following traits and demands:
grooming_frequency_category = Weekly Brushing
shedding_category = Seasonal
energy_level_category = Energetic
trainability_category = Eager to Please
demeanor_category = Friendly


### Accuracy

In [None]:
# Accuracy of the TF-IDF + SSE estimator on the pure-breed descriptions:
# a breed counts as correct only when its own text is classed as exactly that
# single breed (get_breed0 returns its row position).
count = 0
for i in range(len(data)):
    predicted = get_breed0(estimate_dog_breedtfidf(tfidf_matrix, data.iloc[i]['combined']))
    count += int(predicted == i)
    # Progress report on every fifth breed.
    if not i % 5:
        print("Breed", i, "complete.")
accuracy_SSE = count * 100 / len(data)
print("Accuracy using SSE loss on on Tfidf processed descriptions from Pure Breed Data = ", accuracy_SSE, "%")
# Result recorded by the author: accuracy ~ 99%