In [72]:
# Imports 

from sklearn.metrics import pairwise_distances
import pandas as pd 
import numpy as np
import os 

In [73]:
data = pd.read_csv('../data/data_reduced.csv')
raw_data = pd.read_csv('../data/data_notscaled.csv')
traits = data[['Factor1', 'Factor2', 'Factor3', 'Factor4']]

In [70]:
import pickle

with open('../objects/scaler.pkl','rb') as f:
    sc = pickle.load(f)

I want to give the user the option to see recommended cats that are most or least like their own cat in each trait. For example, someone may want a cat that is similar to their current cat in terms of dominance and intelligence, but the opposite in terms of activity level. In order to do so I will define a custom distance. For each trait, the distance must be minimized if the user selected to see recommendations that are the most similar in that trait, and maximized if they wish to see the opposite. 

In [74]:
# Defining the maxmimun distance squared for each trait

max_dists_sq = (traits.max() - traits.min()) ** 2

In [75]:
def distance(user1, user2, same_traits=None, different_traits=None):

    user1 = np.array(user1, dtype=float).flatten()
    user2 = np.array(user2, dtype=float).flatten()
    diff = user1 - user2 
    dist = 0.0
    n = len(user1)

    # if no traits are specified to be similar or different, the default is to recommend based on similarities in all traits 
    if same_traits is None and different_traits is None:
        same_traits = range(n)
        different_traits = []

    elif same_traits is None:
        same_traits = [i for i in range(n) if i not in different_traits]
        
    elif different_traits is None:
        different_traits = [i for i in range(n) if i not in same_traits]
        
    for i in same_traits:
        dist += diff[i] ** 2

    for i in different_traits:
        max_val = max_dists_sq.iloc[i]
        dist += (max_val - diff[i] ** 2)

    return dist

In [76]:
def recommend(new_user, data, same_traits=None, different_traits=None, n_recs=10, factors=None):

    new_user = np.array(new_user, dtype=float)
    distances = []
    if factors is None:
        factors=data.columns.tolist()

    for i, row in traits.iterrows(): 
        user2 = np.array(row[factors], dtype=float)

        d = distance(new_user, user2, same_traits, different_traits)
        distances.append((i, d))
    
    # Sort by ascending distance (best matches first)
    distances.sort(key=lambda x: x[1])
    
    return distances[:n_recs]
    

#### Testing the recommender

In [77]:
data[10:11]

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex
10,-11.242259,32.644139,-22.569194,9.048222,1


In [60]:
test1 = [5.658829, -0.810108, 5.02464, 3.281885]
test2 = [0.087177, 0.925191, -1.526079, -3.750073]

same_traits = [0, 2]
diff_traits = [1, 3]

In [78]:
first = recommend(test1, traits)

In [79]:
df1 = pd.DataFrame(data=first)

In [80]:
data.loc[df1[0]]

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex
1208,-4.00667,-1.179039,-6.984522,11.466535,0
1136,0.426436,11.573072,-9.481515,15.719044,0
2504,8.955519,18.409817,-13.452492,-4.661919,1
2347,-2.774636,2.486532,-20.042275,-5.8366,0
985,16.824174,9.634805,-18.140563,-2.132316,1
512,-7.951109,5.371148,-17.381144,-7.09882,1
311,-1.54135,4.688077,-22.258119,-3.638317,0
305,-1.956988,23.379172,-10.752929,3.514511,1
2466,-0.590065,13.012312,-20.410536,-2.731278,1
1348,-17.102112,7.399967,-13.200233,3.695606,0


In [71]:
from sklearn.preprocessing import StandardScaler

test2 = np.array([2, 6, 7, 1])

scaler = StandardScaler()
test3 = sc.transform(test2.reshape(1, -1))



ValueError: X has 4 features, but StandardScaler is expecting 51 features as input.

In [63]:
second = recommend(test3, traits, same_traits=same_traits, different_traits=diff_traits)

In [64]:
df2 = pd.DataFrame(data=second)

In [66]:
test3

array([[-0.78446454],
       [ 0.78446454],
       [ 1.17669681],
       [-1.17669681]])

In [65]:
data.loc[df2[0]]

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Cat_sex
258,-0.647627,13.850228,-2.908044,4.635665,1
2341,-0.794489,-13.064053,7.194531,4.063943,0
2444,2.816758,15.191407,-2.651132,-1.414312,1
2364,1.224524,13.532865,5.232263,4.94549,1
1865,-1.665936,14.189521,-1.950845,0.808019,1
507,-2.202655,-12.157959,1.268655,1.26901,1
498,-2.1444,13.254115,-1.464134,3.458568,0
996,0.950734,11.538597,0.688967,6.154217,1
958,-2.466524,13.351979,-1.674624,2.552169,0
601,-2.50089,-11.95605,0.673,-1.484812,0


In [103]:
recommend(test4, data)

[(1377, 0.5653610501056178),
 (1742, 0.6253745217185432),
 (1733, 0.6746091782413869),
 (2370, 1.049443787696961),
 (1535, 1.5042388336411647),
 (2199, 1.8389207553460123),
 (1783, 2.2837995509971996),
 (1056, 2.365998475544991),
 (478, 2.735586405991848),
 (1650, 3.021892567706973)]

In [105]:
test4

array([[-0.78446454],
       [ 0.78446454],
       [ 1.17669681],
       [-1.17669681]])

In [113]:
data = data.round(5)
data.describe()

Unnamed: 0,Factor1,Factor2,Factor3,Factor4
count,2764.0,2764.0,2764.0,2764.0
mean,9.044863e-08,2.532562e-08,-1.447178e-08,-2.170767e-08
std,6.536558,4.870924,2.912282,2.568366
min,-15.47531,-14.52431,-7.9289,-8.32992
25%,-5.09949,-3.473265,-1.972415,-1.72528
50%,-0.45581,-0.20756,-0.16884,-0.068865
75%,4.603195,3.230165,1.746265,1.585002
max,23.24204,15.19141,12.44072,10.93867


In [93]:
test4

array([[-0.78446454],
       [ 0.78446454],
       [ 1.17669681],
       [-1.17669681]])

In [26]:
# Example: recommend top 3 matches for each user
test = [-2.678498, 5.816443, -3.505566, -0.007615]

recommend(test, data, same_traits, diff_traits)

ValueError: operands could not be broadcast together with shapes (4,) (5,) 

In [100]:
np.array(data).flatten()

array([-1.09788112,  3.04265881,  0.44111017, ..., -1.57584742,
       -0.20959295, -0.2375467 ])