In [1]:
# Imports 

import sys 
import os 

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from recommender import recommender_functions as rf
import pandas as pd 
import numpy as np

In [2]:
data = pd.read_csv('../data/data_reduced.csv')
raw_data = pd.read_csv('../data/data_notscaled.csv')
traits = data[['Factor1', 'Factor2', 'Factor3', 'Factor4']]

Now that I have my 4 factors I can build my recommender. On the user side of things, I would like to give the user the option to see recommended cats that are most or least like their own cat in each factor. For example, someone may want a cat that is similar to their current cat in terms of dominance and intelligence, but the opposite in terms of activity level. I would also like to give them the choice to filter for a specific cat sex, otherwise they will see both sexes by default. Lastly, for their ease of access each new user entry should still input values on a scale of 1-7 for their factors. 

Procedure:

1. Design a mapping function that maps user inputs of 1-7 into the same number space as the factor scores

2. Create a custom distance that is minimized if the user chose to see recommendations that are the most similar in a trait, and maximized if they wish to see the opposite

3. Building the recommender with the following parameters:

   - new_user: Fac tor scores for the new user
   - data: database of cats
   - same_traits: list of traits they wish to see that are similar to their current cat 
   - different_traits: list of traits they wish to see that are different to their current cat
   - n_recs: number of recommendations, default 10
   - factors: factors they wish to compare, default all factors but user could choose to only compare select factors
   - sex: allows users to filter for a specific cat sex, defaulted to both sexes   

### 1. Mapping Function

In [3]:
traits.describe()

Unnamed: 0,Factor1,Factor2,Factor3,Factor4
count,2764.0,2764.0,2764.0,2764.0
mean,-28.606068,34.033113,29.409346,2.482399
std,24.044415,16.210881,7.359911,6.892255
min,-83.750412,-8.338428,-3.164901,-21.349545
25%,-47.213599,22.168206,24.848824,-1.953306
50%,-30.650118,33.298019,29.784935,2.551457
75%,-11.223868,45.074779,34.177347,6.881129
max,53.657184,84.070904,50.657065,27.338846


In [12]:
means = np.array([-28.61, 34.03, 29.41, 2.44])
stds = np.array([24.04, 16.21, 7.36, 6.89])

def input_to_factor(user_input):
    """
    Transforms a new user's 1–7 input into the same scale as factor scores.
    """
    # Convert 1–7 scale to z-score-like values in range [-3, 3]
    user_input = np.array(user_input)
    z_approx = (user_input - 4) * (6 / 6)  # i.e., 1 -> -3, 4 -> 0, 7 -> +3

    # Map z_approx into the actual distribution of factor scores
    transformed = z_approx * stds + means
    return transformed

### 2. Custom Distance

In [13]:
# Defining the maxmimun distance squared for each trait

max_dists_sq = (traits.max() - traits.min()) ** 2

In [14]:
def distance(user1, user2, same_traits=None, different_traits=None, factors=None):

    if factors is not None:
        user1 = pd.Series(user1, index=factors)
        user2 = pd.Series(user2, index=factors)
    else:
        user1 = pd.Series(user1)
        user2 = pd.Series(user2)
        
    diff = user1 - user2 
    dist = 0.0
    n = len(user1)

    # if no traits are specified to be similar or different, the default is to recommend based on similarities in all traits 

    if same_traits is None and different_traits is None:
        same_traits = user1.index
        different_traits = []
        
    elif same_traits is None:
        same_traits = [i for i in user1.index if i not in different_traits]
        
    elif different_traits is None:
        different_traits = [i for i in user1.index if i not in same_traits]
        
    for i in same_traits:
        dist += diff.loc[i] ** 2

    for i in different_traits:
        max_val = max_dists_sq.loc[i]
        dist += (max_val - diff.loc[i] ** 2)

    return dist

### 3. Recommender 

In [15]:
def recommend(new_user, data, same_traits=None, different_traits=None, n_recs=10, factors=None, sex='all'):
    
    distances = []

    # Factor names must match data columns
    all_factors = ['Factor1', 'Factor2', 'Factor3', 'Factor4']

    if factors is None:
        factors = all_factors

    # Convert user input to Series with labels matching data
    new_user = pd.Series(input_to_factor(new_user), index=all_factors)

    # Filter both user and dataset to selected factors
    new_user = new_user[factors] 
    data_filtered = data[factors + ['Cat_sex']]

    for i, row in data_filtered.iterrows(): 
        user2 = np.array(row[factors], dtype=float)
        d = distance(new_user, user2, same_traits, different_traits, factors)
        distances.append((i, d))
    
    all_scores = [dist for _, dist in distances]  # all distances computed
    global_min = min(all_scores)
    global_max = max(all_scores)
    score_range = global_max - global_min if global_max != global_min else 1
    
    if sex != 'all':
        valid_indices = data[data['Cat_sex'] == sex].index
        distances = [(i, d) for i, d in distances if i in valid_indices]
    
    distances.sort(key=lambda x: x[1])
    top_indices = [idx for idx, _ in distances[:n_recs]]
    top_distances = [dist for _, dist in distances[:n_recs]]

    top_rows = data_filtered.loc[top_indices].copy()
    top_rows['match_score'] = top_distances
    top_rows['match_score_normalized'] = [
    (score - global_min) / score_range for score in top_distances
]
    return top_rows

### Testing the recommender

In [8]:
# default params 

In [11]:
data.mean()

Factor1   -28.606068
Factor2    34.033113
Factor3    29.409346
Factor4     2.482399
Cat_sex     0.498191
dtype: float64

In [16]:
user1 = [5, 7, 4, 1]
recommend(user1, data)

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex,match_score,match_score_normalized
735,4.523968,70.473896,29.267831,-8.159772,0,332.631092,0.0
2342,-5.297292,68.603108,18.080727,-11.44097,0,372.56853,0.003363
2340,-6.758517,71.411412,28.916654,-2.391366,0,382.42605,0.004193
2444,-12.351986,84.070904,34.241138,0.37914,1,432.189941,0.008384
2673,0.443577,71.207044,36.898174,-3.360675,0,433.475727,0.008492
1617,-5.099366,73.773906,33.479651,0.422291,0,443.712919,0.009354
171,9.21473,70.863912,30.932767,-7.54695,1,445.612863,0.009514
1179,-0.635393,75.04639,35.585238,0.507805,1,462.687092,0.010952
644,-0.94907,71.772296,17.018696,-4.726829,1,467.53328,0.01136
2360,-9.761901,68.167675,32.180858,-3.116892,1,473.066999,0.011826


In [None]:
# same_traits, different_traits

In [17]:
user2 = [1, 2, 5, 7]
same_traits = ['Factor1', 'Factor2']
different_traits = ['Factor3', 'Factor4']

In [18]:
recommend(user2, data, same_traits=same_traits, different_traits=different_traits, n_recs=20)

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex,match_score,match_score_normalized
2668,-80.762166,9.093899,35.06622,-4.830124,1,4938.533128,0.0
452,-83.750412,11.178013,39.448703,-1.116527,1,5053.116586,0.004048
756,-78.996429,3.296271,36.150363,-0.210805,0,5198.31113,0.009176
646,-70.276722,3.247173,29.44348,-6.693485,1,5255.520273,0.011197
1050,-74.944608,10.208708,36.154815,-2.207527,0,5364.831956,0.015059
1561,-60.18672,8.380523,12.106715,-7.903752,0,5386.830507,0.015836
1372,-71.832823,6.381142,33.990181,-3.64357,0,5401.693134,0.016361
2190,-78.310562,8.383573,40.72235,3.223419,1,5404.778731,0.01647
1418,-72.018765,17.022642,28.973591,-5.148464,0,5469.923161,0.018771
2122,-79.781653,20.168074,35.179704,0.522137,1,5537.858193,0.02117


In [None]:
## Only specific factors 

In [21]:
test = [1, 1, 1, 1]
recommend(test, data)

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex,match_score,match_score_normalized
646,-70.276722,3.247173,29.44348,-6.693485,1,1868.020899,0.0
2668,-80.762166,9.093899,35.06622,-4.830124,1,1908.969819,0.001333
756,-78.996429,3.296271,36.150363,-0.210805,0,1947.92933,0.002601
508,-74.516524,1.411705,31.938798,3.661667,1,2028.359006,0.005219
154,-61.583435,-6.415974,24.422509,-2.379345,1,2142.82896,0.008946
1372,-71.832823,6.381142,33.990181,-3.64357,0,2198.784364,0.010767
1992,-63.63596,2.593219,23.065494,-0.389236,1,2237.473198,0.012026
452,-83.750412,11.178013,39.448703,-1.116527,1,2277.294349,0.013323
172,-63.611452,-1.392689,24.814003,2.419267,0,2284.302254,0.013551
1561,-60.18672,8.380523,12.106715,-7.903752,0,2301.310395,0.014104


In [None]:
ss = ['Factor1', 'Factor2']
sd = ['Factor3', 'Factor4']

In [None]:
ss.append(sd)

In [None]:
ss + sd

In [19]:
user3 = [1, 4, 6, 4]
factors3 = ['Factor1', 'Factor2']

In [20]:
recommend(user3, data, factors=factors3)

Unnamed: 0,Factor1,Factor2,Cat_sex,match_score,match_score_normalized
2099,-82.718573,27.611534,1,365.608202,0.0
537,-80.740541,36.226757,1,404.404223,0.001545
660,-79.920969,29.234997,1,456.007816,0.003601
2116,-78.144993,33.708531,1,510.185863,0.005758
1547,-78.215192,36.437749,1,512.713816,0.005859
2028,-77.897509,28.911576,1,547.520894,0.007245
2122,-79.781653,20.168074,1,630.986227,0.01057
457,-75.465185,35.14304,0,639.549741,0.010911
12,-75.416558,34.968068,1,641.650339,0.010995
815,-75.346197,36.168453,0,648.910443,0.011284


In [None]:
# Specific factors and opposite pairings 

In [None]:
user4 = [3, 5, 1, 7]
factors4 = ['Factor2', 'Factor3']

In [None]:
recommend(user4, data, factors=factors4, different_traits=['Factor2'])

In [None]:
# Specific cat sex

In [None]:
user5 = [2, 2, 4, 5]

In [None]:
recommend(user5, data, sex=0)

In [None]:
# billy 

In [None]:
billy = [7, 1, 3, 6]
billyop = ['Factor1']

In [None]:
recommend(billy, data, different_traits=billyop)

In [None]:
recommend(billy, data)

In [None]:
recommend(dior, data)

In [None]:
dior = [5, 5, 2, 3]

In [None]:
distance(billy, dior)

In [None]:
data.describe()

In [None]:
distance(dior, billy)

In [None]:
input_to_factor(dior)