## Fluid_Algorithm

Parés F., Garcia-Gasulla D. et al. “Fluid Communities: A Competitive and Highly Scalable Community Detection Algorithm”.

In [2]:
from networkx.algorithms import bipartite
import networkx as nx
from networkx import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

In [3]:
# 80000row
user_train = pd.read_csv('ml-100k/u1.base', sep='\t',names=["userID","itemID","rating","timestamp"],header=None, na_filter=False)
user_train = user_train[['userID','itemID','rating']]

# 100000row
user_total = pd.read_csv('ml-100k/u.data', sep='\t',names=["userID","itemID","rating","timestamp"],header=None, na_filter=False)
user_total = user_total[['userID','itemID','rating']]

# 20000row
user_test = pd.read_csv('ml-100k/u1.test', sep='\t',names=["userID","itemID","rating","timestamp"],header=None, na_filter=False)
user_test = user_test[['userID','itemID','rating']]


In [4]:
'''
min_max_scaler = MinMaxScaler()

x_scaled = min_max_scaler.fit_transform(user_train[['rating']])
user_train[['rating']] = x_scaled

x_scaled = min_max_scaler.fit_transform(user_total[['rating']])
user_total[['rating']] = x_scaled

x_scaled = min_max_scaler.fit_transform(user_test[['rating']])
user_test [['rating']] = x_scaled
'''

"\nmin_max_scaler = MinMaxScaler()\n\nx_scaled = min_max_scaler.fit_transform(user_train[['rating']])\nuser_train[['rating']] = x_scaled\n\nx_scaled = min_max_scaler.fit_transform(user_total[['rating']])\nuser_total[['rating']] = x_scaled\n\nx_scaled = min_max_scaler.fit_transform(user_test[['rating']])\nuser_test [['rating']] = x_scaled\n"

In [5]:
user_item_total = user_total.pivot_table('rating', index = 'userID',columns = 'itemID').fillna(0)
user_item_train = user_train.pivot_table('rating', index = 'userID',columns = 'itemID').fillna(0)
user_item_test = user_test.pivot_table('rating', index = 'userID',columns = 'itemID').fillna(0)

In [6]:
## user-item node 이름 설정
matrix = pd.read_csv('ml-100k/u.data', sep='\t',names=["userID","itemID","rating","timestamp"],header=None, na_filter=False)
matrix[['userID']] = 'u' + matrix[['userID']].astype(str)
matrix[['itemID']] = 'i' + matrix[['itemID']].astype(str)
matrix

Unnamed: 0,userID,itemID,rating,timestamp
0,u196,i242,3,881250949
1,u186,i302,3,891717742
2,u22,i377,1,878887116
3,u244,i51,2,880606923
4,u166,i346,1,886397596
...,...,...,...,...
99995,u880,i476,3,880175444
99996,u716,i204,5,879795543
99997,u276,i1090,1,874795795
99998,u13,i225,2,882399156


### 그래프 생성

#### G : user로만 이루어진 그래프  /  B : user, item으로 이루어진 그래프

In [7]:
user_node = matrix[['userID']].values
user_node = np.array(user_node).flatten().tolist()
#user_node

In [8]:
G = nx.Graph()
G.add_nodes_from(user_node)
info(G)

'Graph with 943 nodes and 0 edges'

In [9]:
edgelist = []

for i in matrix.values:
    edgelist.append((i[0],i[1]))
    
B = nx.Graph() # >300
B.add_nodes_from(matrix.userID, bipartite=0) # user
B.add_nodes_from(matrix.itemID, bipartite=1) # item(movie)
B.add_edges_from(edgelist)

info(B)

'Graph with 2625 nodes and 100000 edges'

###  Link Prediction으로 상위 0.5 % 가능성을 가진  edge 생성

common_neighbor_centrality(2020) 알고리즘 사용

* uesr graph 연결될 수 있는 최대 edge 개수 444153 [ = 943*942/2 ]
* total graph 연결될 수 있는 최대 edge 개수 3444000 [ = 2625*2624/2 ]

In [10]:
for i in range(10):
    
    user_edgelist = []
    total_edgelist = []
    
    pred_link = list(nx.common_neighbor_centrality(B, alpha=0.8))
    
    link_likelihood = pd.DataFrame(pred_link).iloc[:,2].values
    link_99 = np.percentile(link_likelihood,  99.5, interpolation='linear')
    
    for p in pred_link:
        edge = list(p)
        linklihood = int(edge[2])
        
        if linklihood > link_99:
            if edge[0][:1] ==  edge[1][:1] == 'u': # user R
                user_edgelist.append((edge[0], edge[1]))
                total_edgelist.append((edge[0], edge[1]))
            else :
                total_edgelist.append((edge[0], edge[1]))
                
    B.add_edges_from(total_edgelist)
    G.add_edges_from(user_edgelist)
    
    print('total_graph connected : ', is_connected(B),' user graph info : ', info(B))
    print('user_graph connected : ', is_connected(G),' user graph info : ', info(G))
    
    ## total graph가 다 연결될 때 까지 반복
    if is_connected(B):
        break
    
#print('total graph: ', info(B))
#print('user_graph connected : ', is_connected(G),' user graph info : ', info(G))

total_graph connected :  True  user graph info :  Graph with 2625 nodes and 116425 edges
user_graph connected :  False  user graph info :  Graph with 943 nodes and 8702 edges


###  Graph Clustering


변수명 설명
* cluster : 각 그룹 넘버와 그룹에 해당되는 user, item 리스트
* [return] user_c : 각 user 당 해당하는 cluster number


In [11]:
## network clustering
from networkx.algorithms.community import asyn_fluidc
from networkx.algorithms.community import coverage, performance

coverage_list, performance_list = [], [] 

def graph_clustering(cluster_num):
    cluster = [0] * cluster_num
    total_graph = B
    
    community = list(asyn_fluidc(total_graph, k=cluster_num))
    
    for i, comms in enumerate(community):
        cluster[i] = comms
        
    coverage_list.append(coverage(total_graph, community))
    performance_list.append(performance(total_graph, community))

                
    ## user cluster 정보
    cluster_id = pd.read_csv('ml-100k/u.user', sep='|',names=["userID","age","gender","occupation","zip code"],header=None,na_filter=False)
    cluster_id = cluster_id[['userID']]
    cluster_id= cluster_id.set_index('userID')
    cluster_id['cluster'] = 999

    cluster_cnt = [] # 각 cluster에 속한 user의 수 
    
    # 각 user와 cluster matching
    for i in range(cluster_num):
        cnt = 0
        
        for j in list(cluster[i]):
            if j[0] == 'u':
                cnt += 1
                cluster_id.iloc[int(j[1:])-1] = i 
                
        cluster_cnt.append(cnt)
            
    # 각 클러스터당 user의 인원 수
    #print('cluster num : ', cluster_num, " -> ",cluster_cnt) 
    #print(user_c)
    
    return (cluster_id)
        

###  Group Recommender System

In [20]:
import numpy as np
from sklearn.metrics import ndcg_score

def grs_ndcg(num, total_matrix, train_matrix):
    total_matrix  # user_item_total 
    train_matrix  # user_item_train
    test_matrix = user_item_test
    
    ## 1. fluid algorithm으로 그룹 clustering(total 대상)
    cluster_id = graph_clustering(num)
    
    # 각 클러스터에 해당하는 개수
    length = [1]*num
    for i in range(num):
        length[i] += len(cluster_id[cluster_id.cluster==i]) 
    
    
    # train, test 에 cluster 정보 추가
    user_item_train_cl = pd.concat([train_matrix, cluster_id], axis=1, join='inner')
    user_item_test_cl = pd.concat([test_matrix, cluster_id], axis=1, join='inner')
    
    ## 2. 클러스터 별로 각 item의 mean 값 구함 (train 대상)
    mean_rating = pd.DataFrame(columns = user_item_train_cl.columns)
    mean_rating.set_index('cluster')
    
    for i in range(num):
        mean_rating = mean_rating.append(user_item_train_cl[user_item_train_cl.cluster == i].mean(axis=0), ignore_index=True)
    
    mean_rating = mean_rating.set_index('cluster')
    mean_rating
    
    ## 3. train-test set의 columns(item id) 맞추기 (miss matching 제거)
    for c in user_item_train_cl.columns:
        if c not in user_item_test_cl.columns:
            del mean_rating[c]
        
    for c in user_item_test_cl.columns:
        if c not in user_item_train_cl.columns:
            del user_item_test_cl[c] 
            
    y_pred = mean_rating 
    y_true = user_item_test_cl
    
    result = [0]*num # 결과값 저장 리스트
    
    ## 4. 각 결과 값에 nDCG 더해줌
    for idx in test_matrix.index:
        cluster_num = int(y_true.loc[idx].cluster)
        result[cluster_num] += ndcg_score([y_true.loc[idx][:-1]], [y_pred.loc[cluster_num]])
        #result[cluster] += ndcg_score([user_item_test_cl.loc[idx][:-1]], [mean_rating.loc[cluster]], k=4)
    
    ## 5. 최종적으로 각 nDCG값 / 각 cluster의 요소 개수
    for i in range(num):
        result[i] = result[i]/length[i]
        
    print(length)
    
    #print("cluster수:",len(length),"/ NDCG:",sum(result)/len(length))  
    print('%.5f'%(sum(result)/(len(length))))
    

In [21]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import numpy as np

cluster_num = range(2,31)
#cluster_num = [2]

for i in cluster_num:
    grs_ndcg(i, user_item_total, user_item_train)

[651, 294]
0.24735
[520, 149, 277]
0.25562
[548, 19, 297, 83]
0.25119
[14, 402, 256, 273, 3]
0.24426
[369, 31, 437, 71, 38, 3]
0.26463
[20, 370, 40, 64, 211, 2, 243]
0.24201
[134, 247, 222, 299, 10, 4, 33, 2]
0.22872
[140, 252, 280, 3, 225, 10, 29, 11, 2]
0.24533
[7, 51, 259, 145, 9, 229, 218, 3, 29, 3]
0.23591
[161, 8, 354, 93, 75, 213, 14, 7, 18, 10, 1]
0.19816
[42, 345, 43, 9, 2, 185, 65, 215, 14, 30, 3, 2]
0.23841
[73, 134, 25, 91, 24, 238, 10, 2, 185, 166, 4, 2, 2]
0.21094
[327, 206, 3, 2, 14, 29, 2, 191, 67, 13, 51, 33, 12, 7]
0.24415
[168, 27, 312, 11, 51, 13, 2, 7, 210, 11, 72, 43, 17, 10, 4]
0.21532
[196, 95, 20, 280, 3, 5, 63, 182, 6, 2, 46, 20, 27, 5, 4, 5]
0.21290
[248, 148, 9, 161, 97, 7, 2, 54, 2, 72, 55, 41, 6, 23, 28, 5, 2]
0.20892
[22, 267, 9, 154, 2, 87, 21, 96, 18, 54, 24, 15, 167, 8, 4, 3, 3, 7]
0.21445
[24, 8, 10, 6, 2, 29, 16, 269, 154, 2, 179, 27, 63, 15, 51, 11, 83, 7, 6]
0.21715
[314, 39, 28, 34, 61, 10, 150, 3, 29, 10, 24, 169, 5, 16, 9, 6, 33, 17, 4, 2]
0.223

In [None]:
plt.figure(figsize=(9,6))
plt.plot(coverage_list, 'ob--', label='coverage')
plt.plot(performance_list, 'or--', label='performance')
plt.legend(fontsize=15)
plt.show()

## precision & recall : binary (2.5 이상 1, 나머지 0)

In [None]:
## precision & recall : binary (2.5 이상 1, 나머지 0)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def groupRS_pre_rec(num, total_matrix, train_matrix):
    total_matrix  # user_item_total 
    train_matrix  # user_item_train
    test_matrix = user_item_test
    
    ## 1. fluid algorithm으로 그룹 clustering(total 대상)
    cluster_id = graph_clustering(num)
    
    # 각 클러스터에 해당하는 개수
    length = [1]*num
    for i in range(num):
        length[i] += len(cluster_id[cluster_id.cluster==i]) 
    
    
    # train, test 에 cluster 정보 추가
    user_item_train_cl = pd.concat([train_matrix, cluster_id], axis=1, join='inner')
    user_item_test_cl = pd.concat([test_matrix, cluster_id], axis=1, join='inner')
    
    ## 2. 클러스터 별로 각 item의 mean 값 구함 (train 대상)
    mean_rating = pd.DataFrame(columns = user_item_train_cl.columns)
    mean_rating.set_index('cluster')
    
    for i in range(num):
        mean_rating = mean_rating.append(user_item_train_cl[user_item_train_cl.cluster == i].mean(axis=0), ignore_index=True)
    
    mean_rating = mean_rating.set_index('cluster')
    mean_rating
    
    ## 3. train-test set의 columns(item id) 맞추기 (miss matching 제거)
    for c in user_item_train_cl.columns:
        if c not in user_item_test_cl.columns:
            del mean_rating[c]
        
    for c in user_item_test_cl.columns:
        if c not in user_item_train_cl.columns:
            del user_item_test_cl[c] 
            
    y_pred = mean_rating 
    y_true = user_item_test_cl
    
    ## binary로 만들기 위해 2개의 분류로 생성
    y_pred[y_pred< 2.5] = 0
    y_pred[y_pred >= 2.5] = 1
    
    y_true[y_true< 2.5] = 0
    y_true[y_true >= 2.5] = 1
    
    pre_result = [0]*num # 결과값 저장 리스트
    rec_result = [0]*num
    
    ## 4. 각 결과 값에 precision, recall더해줌
    for idx in test_matrix.index:
        cluster_num = int(y_true.loc[idx].cluster)
        pre_result[cluster_num] += precision_score(list(y_true.loc[idx][:-1].values) ,list(y_pred.loc[cluster_num].values ), average = 'binary')
        rec_result[cluster_num] += recall_score(list(y_true.loc[idx][:-1].values) ,list(y_pred.loc[cluster_num].values ), average = 'binary')
    
    print(pre_result)
    print(rec_result)
    #print(sum(pre_result)/num)
    
    #print(sum(rec_result)/num)
    ## 5. 최종적으로 각 precision, recall값 / 각 cluster의 요소 개수
    #for i in range(num):
    #    result[i] = result[i]/length[i]

    
    #print("cluster수:",len(length),"/ NDCG:",sum(result)/len(length))  
    print('%.5f'%(sum(pre_result)/num), ", ",  '%.5f'%(sum(rec_result)/num))

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

cluster_num = range(2,31)
#cluster_num=[2]

for i in cluster_num:
    #reduced_total = SVD.fit_transform(user_item_total)
    #reduced_total = pd.DataFrame(reduced_total)
    #reduced_total.index = total_user_idx
    
    #user_item_train = pd.DataFrame(user_item_train)
    
    groupRS_pre_rec(i, user_item_total, user_item_train)

## precision & recall : multiclass(반올림)

In [None]:
## precision & recall : multiclass(반올림)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import math

def groupRS_pre_rec2(num, total_matrix, train_matrix):
    total_matrix  # user_item_total 
    train_matrix  # user_item_train
    test_matrix = user_item_test
    
    
   ## 1. fluid algorithm으로 그룹 clustering(total 대상)
    cluster_id = graph_clustering(num)
    
    # 각 클러스터에 해당하는 개수
    length = [1]*num
    for i in range(num):
        length[i] += len(cluster_id[cluster_id.cluster==i]) 
    
    
    # train, test 에 cluster 정보 추가
    user_item_train_cl = pd.concat([train_matrix, cluster_id], axis=1, join='inner')
    user_item_test_cl = pd.concat([test_matrix, cluster_id], axis=1, join='inner')
    
    ## 2. 클러스터 별로 각 item의 mean 값 구함 (train 대상)
    mean_rating = pd.DataFrame(columns = user_item_train_cl.columns)
    mean_rating.set_index('cluster')
    
    for i in range(num):
        mean_rating = mean_rating.append(user_item_train_cl[user_item_train_cl.cluster == i].mean(axis=0), ignore_index=True)
    
    mean_rating = mean_rating.set_index('cluster')
    mean_rating
    
    ## multiuclass 1~5점으로 반올림하여 다중 분류 생성
    for i in mean_rating.index:
        for j in mean_rating.columns:
             mean_rating.loc[i][j] = round(mean_rating.loc[i][j])

    user_item_test_cl = user_item_test_cl.round(0)
    

    ## 3. train-test set의 columns(item id) 맞추기 (miss matching 제거)
    for c in user_item_train_cl.columns:
        if c not in user_item_test_cl.columns:
            del mean_rating[c]
        
    for c in user_item_test_cl.columns:
        if c not in user_item_train_cl.columns:
            del user_item_test_cl[c] 
            
    y_pred = mean_rating 
    y_true = user_item_test_cl
    
    
    pre_result = [0]*num # 결과값 저장 리스트
    rec_result = [0]*num
    
    ## 4. 각 결과 값에 nDCG 더해줌
    for idx in test_matrix.index:
        cluster_num = int(y_true.loc[idx].cluster)
        pre_result[cluster_num] += precision_score(list(y_true.loc[idx][:-1].values) ,list(y_pred.loc[cluster_num].values ), average = 'macro')
        rec_result[cluster_num] += recall_score(list(y_true.loc[idx][:-1].values) ,list(y_pred.loc[cluster_num].values ), average = 'macro')
    
    print(pre_result)
    print(rec_result)
    #print(sum(pre_result)/num)
    
    #print(sum(rec_result)/num)
    ## 5. 최종적으로 각 precision, recall값 / 각 cluster의 요소 개수
    #for i in range(num):
    #    result[i] = result[i]/length[i]

    
    #print("cluster수:",len(length),"/ NDCG:",sum(result)/len(length))  
    print('%.5f'%(sum(pre_result)/num), ", ",  '%.5f'%(sum(rec_result)/num))

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

cluster_num = range(2,31)
#cluster_num=[2]

for i in cluster_num:
    #reduced_total = SVD.fit_transform(user_item_total)
    #reduced_total = pd.DataFrame(reduced_total)
    #reduced_total.index = total_user_idx
    
    #user_item_train = pd.DataFrame(user_item_train)
    
    groupRS_pre_rec2(i, user_item_total, user_item_train)