### Sample script for user-based collaborative filtering  

#### Import libraries

In [1]:
import numpy as np
import pandas as pd

#### Parameters  

In [2]:
# Input file holding the user x item rating table.
csv_in = 'df.csv'

# A user in the DB is compared with the target user only when they share
# at least this many rated items.
min_common_items = 2

# An item is scored only when at least this many of the similar users
# have rated it.
min_eval_users = 2

# Similarity measure used to compare users:
#   'pearson'  -> Pearson correlation coefficient as is
#   'pearson2' -> Pearson correlation coefficient rescaled by (r + 1) / 2
similarity = 'pearson2'

# Let pandas print up to 999 rows / columns before truncating the display.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

#### Read CSV file  

In [3]:
# Load the space-separated rating table; the first line is used as the header.
df = pd.read_csv(csv_in, sep=' ', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.head())

(20, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 1 to 20
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       20 non-null     int64
 1   b       20 non-null     int64
 2   c       20 non-null     int64
 3   d       20 non-null     int64
 4   e       20 non-null     int64
 5   f       20 non-null     int64
 6   g       20 non-null     int64
 7   h       20 non-null     int64
 8   i       20 non-null     int64
 9   j       20 non-null     int64
 10  k       20 non-null     int64
 11  l       20 non-null     int64
 12  m       20 non-null     int64
 13  n       20 non-null     int64
 14  o       20 non-null     int64
dtypes: int64(15)
memory usage: 2.5 KB


None

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o
1,0,3,3,0,1,4,4,0,1,0,4,0,5,1,1
2,0,0,0,0,0,2,0,0,0,0,5,0,0,0,0
3,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,2,0,0,5,0,0,0,0,2,0
5,0,1,0,0,0,0,0,1,0,0,0,0,0,2,5


#### Replace 0 with NaN (so that those entries are not used during calculation)  

In [4]:
# Turn every 0 entry into NaN so that pandas skips it in the mean and
# correlation computations below.
df = df.replace(to_replace=0, value=np.nan)
display(df.head())

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o
1,,3.0,3.0,,1.0,4.0,4.0,,1.0,,4.0,,5.0,1.0,1.0
2,,,,,,2.0,,,,,5.0,,,,
3,3.0,,,,,,,,,1.0,,,,,
4,,,,,,2.0,,,5.0,,,,,2.0,
5,,1.0,,,,,,1.0,,,,,,2.0,5.0


#### Return Series of similarity of users  
by using Pearson correlation coefficient (-1 .. 1).  

In [5]:
def get_sim_ser_by_pearson(df_users, ser_target):
    """Return the Pearson correlation of each user row with the target user.

    Parameters
    ----------
    df_users : pandas.DataFrame
        One row per user, one column per item.
    ser_target : pandas.Series
        Scores of the target user, indexed by item.

    Returns
    -------
    pandas.Series
        Correlation coefficient per user (indexed like ``df_users``); users
        whose coefficient is NaN are left out.
    """
    per_user_corr = df_users.corrwith(ser_target, axis=1)
    # Keep only the users for which a coefficient could be computed.
    return per_user_corr[per_user_corr.notna()]

In [6]:
def get_sim_ser_by_pearson2(df_users, ser_target):
    """Return the rescaled Pearson correlation of each user row with the target.

    The coefficient ``r`` of every user is mapped to ``(r + 1) / 2``.

    Parameters
    ----------
    df_users : pandas.DataFrame
        One row per user, one column per item.
    ser_target : pandas.Series
        Scores of the target user, indexed by item.

    Returns
    -------
    pandas.Series
        Rescaled coefficient per user (indexed like ``df_users``); users
        whose coefficient is NaN are left out.
    """
    raw_corr = df_users.corrwith(ser_target, axis=1)
    usable = raw_corr.dropna()
    # Shift by one and halve: -1 becomes 0, +1 stays 1.
    return usable.add(1).div(2)

In [7]:
def predict_scores(df_sim, ser_sim, ser_target, min_users=None):
    """Predict the target user's score for every item the target has not rated.

    For each candidate item the prediction is::

        mean(target) + sum(sim_u * (score_u - mean_u)) / sum(|sim_u|)

    where the sums run over the users in ``df_sim`` that rated the item and
    ``mean_u`` is the mean of all scores in that user's row.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Scores of the similar users (rows: users, columns: items). It is not
        modified by this function.
    ser_sim : pandas.Series
        Similarity of each user to the target, indexed like ``df_sim``.
    ser_target : pandas.Series
        Scores of the target user, indexed by item; these items are skipped.
    min_users : int, optional
        Minimum number of users that must have rated an item for it to be
        scored. Defaults to the notebook-level ``min_eval_users``.

    Returns
    -------
    pandas.Series
        Predicted score per item, sorted from highest to lowest. Items whose
        raters all have zero similarity are omitted because no weighted
        average can be formed for them.
    """
    if min_users is None:
        min_users = min_eval_users

    user_means = df_sim.mean(axis=1)
    target_mean = ser_target.mean()

    predictions = {}
    for item in df_sim.columns:
        if item in ser_target.index:
            continue
        scores = df_sim[item]
        if scores.notnull().sum() < min_users:
            continue
        # Build a new Series rather than using `-=`, which would write the
        # mean-centred values back into the caller's DataFrame.
        centered = scores - user_means
        rated = centered.notnull()
        deviations = centered[rated]
        weights = ser_sim[rated]
        total_weight = np.abs(weights).sum()
        if total_weight == 0:
            # Every rater has zero similarity: 0/0 would give a NaN score.
            continue
        predictions[item] = target_mean + (deviations * weights).sum() / total_weight

    # Explicit dtype keeps an empty result a float Series.
    ser_ret = pd.Series(predictions, dtype=float)

    return ser_ret.sort_values(ascending=False)

In [8]:
def get_recomm_by_user_sim(df, target_dic):
    """Recommend items to a target user from the scores of similar users.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table (rows: users, columns: items; NaN where there is no score).
    target_dic : dict
        Scores of the target user, mapping item name to score.

    Returns
    -------
    pandas.Series
        Predicted scores for items not in ``target_dic``, sorted from highest
        to lowest (the result of ``predict_scores``).

    Raises
    ------
    ValueError
        If the notebook-level ``similarity`` setting is not ``'pearson'`` or
        ``'pearson2'``.
    """
    ser_target = pd.Series(target_dic)
    # Restrict the table to the items the target user has scored.
    df_scores = df[ ser_target.index ]
    # Keep only users sharing at least `min_common_items` of those items.
    n_same_items = df_scores.notnull().sum(axis=1)
    df_common = df_scores[ n_same_items>=min_common_items ]

    if similarity == 'pearson':
        ser_sim = get_sim_ser_by_pearson(df_common, ser_target)
    elif similarity == 'pearson2':
        ser_sim = get_sim_ser_by_pearson2(df_common, ser_target)
    else:
        # Without this branch an unknown setting left `ser_sim` unbound and
        # failed below with an UnboundLocalError.
        raise ValueError(
            "unknown similarity %r (expected 'pearson' or 'pearson2')" % (similarity,))

    # Full score rows (all items) of the users that received a similarity.
    df_sim = df.loc[ ser_sim.index ]
    recomm = predict_scores(df_sim, ser_sim, ser_target)

    return recomm

#### Do recommendation  

In [9]:
# Scores of the target user for the items they have already evaluated are
# passed as a dict; the result holds predicted scores for the other items.
recomm = get_recomm_by_user_sim(
    df,
    target_dic={'a':1, 'c':1, 'd':3, 'f':4, 'g':4},
)
print('Number of items calculated:', len(recomm))
print('Recommendation:')
# Show only the highest-scored items.
print(recomm.head())


Number of items calculated: 6
Recommendation:
m    4.140462
k    3.096296
b    2.349509
i    0.976651
e    0.960000
dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]
