# PIP (Proximity–Impact–Popularity)

In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import csv
import collections

### Rating (1~5) 일 때, Rmin, Rmax, Rmed 

In [2]:
Rmin=1
Rmax=5
Rmed = (Rmin+Rmax)/2
print("Rmin: {}, Rmax: {}, Rmed: {}".format(Rmin,Rmax,Rmed))

Rmin: 1, Rmax: 5, Rmed: 3.0


### 가게 store id - store name mapping

In [3]:
business = pd.read_csv("business_clean.csv")

In [4]:
stores = {}

In [5]:
id = 0
with open('business_clean.csv','r',encoding="UTF-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        #print(row['name'])
        stores[id] = row['name']
        id+=1

In [6]:
type(stores)

dict

### ui matrix

In [7]:
ui_csv = pd.read_csv("review_top.csv")

In [8]:
ui_csv.head()

Unnamed: 0,user,store,stars
0,Chad,China King Super Buffet,3.0
1,Chad,Panda Express,3.0
2,Margret,Gap,3.0
3,Shannon,Encore Beach Club,2.0
4,Jennifer,Wright Bar,4.0


In [9]:
ui = {}
with open("review_top.csv","r",encoding="UTF-8") as csvfile2:
    reader2 = csv.DictReader(csvfile2)
    for row in reader2:
        user = row['user']
        store = row["store"]
        rating = row['stars']
        ui.setdefault(user,{})
        ui[user][store]=float(rating)

In [10]:
type(ui)

dict

### user_list 를 뽑아 후에 indexing을 쉽게할 수 있음
- ex) user_list[0] = 'Chad'

In [11]:
user_list = [i for i in ui]

In [12]:
user_list[:5]

['Chad', 'Margret', 'Shannon', 'Jennifer', 'Nicole']

### PIP 구현에 있어 필요한 요소들

### -1. Agreement 

In [13]:
def Agreement(r1,r2):
    if (r1>Rmed and r2<Rmed) or (r1<Rmed and r2>Rmed):
        return False
    else:
        return True

### -2. Distance 

In [14]:
def Distance(r1,r2):
    agreement = Agreement(r1,r2)
    if agreement == True:
        return abs(r1-r2)
    else:
        return 2*abs(r1-r2)

### -3. Proximity

In [15]:
def Proximity(r1,r2):
    distance = Distance(r1,r2)
    return ((2*(Rmax - Rmin)+1) - distance)**2

### -4. Impact

In [16]:
def Impact(r1,r2):
    agreement = Agreement(r1,r2)
    if agreement == True:
        return (abs(r1-Rmed)+1)*(abs(r2-Rmed)+1)
    else:
        return 1/((abs(r1-Rmed)+1)*(abs(r2-Rmed)+1))

### -5. avg

In [17]:
def avg(my_list):
    total = 0.0
    for rating in my_list:
        total += rating
    return total/len(my_list)

### -6. Popularity

In [18]:
def Popularity(r1,r2,item):
    allRating = []
    for i in item_all_rating[item]:
        allRating.append(item_all_rating[item][i])
    mean = avg(allRating)
    if (r1>mean and r2>mean) or (r1<mean and r2<mean):
        return 1+(((r1+r2)/2)-mean)**2
    else:
        return 1

### -7. co-rated 찾아서 PIP Similarity 구현

In [19]:
# item 별 user 평점
result = collections.defaultdict(dict)
for person in ui:
    for item in ui[person]:
        # Flip item and person
        result[item][person] =ui[person][item]
item_all_rating = result

In [20]:
def PIP(user1,user2):
    co_rating = {}
    for item in ui[user1]:
        if item in ui[user2]:
            co_rating[item] = 1
    if len(co_rating)==0: return 0
    
    sim = 0
    for item in ui[user1]:
        if item in ui[user2]:
            r1 = ui[user1][item]
            r2 = ui[user2][item]
            sim += Proximity(r1,r2)*Impact(r1,r2)*Popularity(r1,r2,item)
            return(sim)
    #print("finsih")
    #rint("{} and {} sim is {}".format(user1,user2,sim))

### -8. PIP 단일 계산

In [21]:
PIP('Chad','Yi')

171.55555555555551

### -9. Similarity matrix 구현

In [22]:
NumUsers= len(ui)
Sim_mat = np.full((NumUsers,NumUsers),0.0)
Sim_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
for u in range(0,NumUsers):
    for v in range(u,NumUsers):
        a = user_list[u]
        b = user_list[v]
        cal = PIP(a,b)
        Sim_mat[u,v] = cal
        Sim_mat[v,u] = Sim_mat[u,v]
print(Sim_mat)

[[ 101.25        128.          128.         ...    1.5
     0.            0.        ]
 [ 128.          562.04081633  180.89795918 ...    0.
     0.            0.        ]
 [ 128.          180.89795918 1165.         ...    0.
     0.            0.        ]
 ...
 [   1.5           0.            0.         ...   81.
     0.            0.        ]
 [   0.            0.            0.         ...    0.
  1327.71972656    0.        ]
 [   0.            0.            0.         ...    0.
     0.           81.        ]]


# -10. Predicted rating 구현 및 RMSE 로 Evaluation

In [24]:
def basic_CF(mat, Sim, k):
    predicted_rating = np.array([[0.0 for col in range(np.size(mat, 1))] for row in range(np.size(mat, 0))])
        
    k_neighbors = np.argsort(-Sim)
    k_neighbors = np.delete(k_neighbors,np.s_[k:],1)
    
    NumUsers = np.size(mat,axis=0)
    
    for u in range(NumUsers):
        list_sim = Sim[u,k_neighbors[u,]]
        list_rating = mat[k_neighbors[u,],].astype('float64')
        
        predicted_rating[u,] = np.sum(list_sim.reshape(-1,1)*list_rating,axis=0)/np.sum(list_sim)
        
    return predicted_rating

In [30]:
ui_matrix=np.load("ui_matrix.npy")
ui_matrix[np.isnan(ui_matrix)] = 0
pre=basic_CF(ui_matrix, Sim_mat, 3)
print(pre)

[[0.         0.         0.         ... 0.         0.         1.54229857]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [31]:
def RMSE(ui, pre):
    a_total=0
    for i in range(np.size(ui, 0)):
        for j in range(np.size(ui, 1)):
            a=(ui_matrix[i][j]-pre[i][j])**2
            a_total+=a
    total=a_total/(380*21601)
    total=np.sqrt(total)
    return total

In [32]:
print(RMSE(ui_matrix, pre))

0.33208159802355747


# 다른 similarity 측정 방법과 비교

In [33]:
cos=np.load("COS_similarity.npy")
pcc=np.load("PCC_similiarity.npy")
jac=np.load("JAC_similarity.npy")

In [34]:
cos_pre=basic_CF(ui_matrix, cos, 10)
pcc_pre=basic_CF(ui_matrix, pcc, 10)
jac_pre=basic_CF(ui_matrix, jac, 10)

In [35]:
print(RMSE(ui_matrix, pcc_pre))
print(RMSE(ui_matrix, cos_pre))
print(RMSE(ui_matrix, jac_pre))

0.2907707842254746
0.15984225940824584
0.1120672480612189


# 결론:

전통적으로 많이 사용되는 유사도 측정법과 비교했을 때, PIP는 Cosine, PCC, Jaccard보다 더 높은 수치의 RMSE 결과가 나왔다. 이는 다른 방법론들에 비해 PIP의 측정률이 다소 떨어짐을 의미하며 PIP를 통해 추천시스템을 구현하기 위해서는 좀 더 많은 실험과 개선이 필요하다.