## Fluid_Algorithm

Parés F., Garcia-Gasulla D. et al. “Fluid Communities: A Competitive and Highly Scalable Community Detection Algorithm”.

In [1]:
from networkx.algorithms import bipartite
import networkx as nx
from networkx import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Every ratings file is tab-separated with no header row; only the
# userID / itemID / rating columns are kept (timestamp is discarded).

# u1.base (noted by the original author as 80,000 rows)
user_train = (
    pd.read_csv(
        'ml-100k/u1.base',
        sep='\t',
        names=["userID", "itemID", "rating", "timestamp"],
        header=None,
        na_filter=False,
    )[['userID', 'itemID', 'rating']]
)

# u.data (noted by the original author as 100,000 rows)
user_total = (
    pd.read_csv(
        'ml-100k/u.data',
        sep='\t',
        names=["userID", "itemID", "rating", "timestamp"],
        header=None,
        na_filter=False,
    )[['userID', 'itemID', 'rating']]
)

# u1.test (noted by the original author as 20,000 rows)
user_test = (
    pd.read_csv(
        'ml-100k/u1.test',
        sep='\t',
        names=["userID", "itemID", "rating", "timestamp"],
        header=None,
        na_filter=False,
    )[['userID', 'itemID', 'rating']]
)


In [3]:
# Disabled experiment: the triple-quoted block below is a bare string literal,
# so none of it executes. It would min-max scale the `rating` column of each of
# the three ratings frames, refitting the scaler separately for every frame.
'''
min_max_scaler = MinMaxScaler()

x_scaled = min_max_scaler.fit_transform(user_train[['rating']])
user_train[['rating']] = x_scaled

x_scaled = min_max_scaler.fit_transform(user_total[['rating']])
user_total[['rating']] = x_scaled

x_scaled = min_max_scaler.fit_transform(user_test[['rating']])
user_test [['rating']] = x_scaled
'''

"\nmin_max_scaler = MinMaxScaler()\n\nx_scaled = min_max_scaler.fit_transform(user_train[['rating']])\nuser_train[['rating']] = x_scaled\n\nx_scaled = min_max_scaler.fit_transform(user_total[['rating']])\nuser_total[['rating']] = x_scaled\n\nx_scaled = min_max_scaler.fit_transform(user_test[['rating']])\nuser_test [['rating']] = x_scaled\n"

In [4]:
# Dense user x item rating tables (rows: userID, columns: itemID).
# User/item pairs without a rating are filled with 0.
user_item_train = user_train.pivot_table(
    values='rating', index='userID', columns='itemID'
).fillna(0)
user_item_test = user_test.pivot_table(
    values='rating', index='userID', columns='itemID'
).fillna(0)
user_item_total = user_total.pivot_table(
    values='rating', index='userID', columns='itemID'
).fillna(0)

# Row labels (user IDs) of the full table.
total_user_idx = user_item_total.index

In [5]:
# Build graph node names: prefix user IDs with 'u' and item IDs with 'i' so the
# two ID ranges cannot collide once both live in the same graph.
matrix = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=["userID", "itemID", "rating", "timestamp"],
    header=None,
    na_filter=False,
)
matrix['userID'] = 'u' + matrix['userID'].astype(str)
matrix['itemID'] = 'i' + matrix['itemID'].astype(str)
matrix

Unnamed: 0,userID,itemID,rating,timestamp
0,u196,i242,3,881250949
1,u186,i302,3,891717742
2,u22,i377,1,878887116
3,u244,i51,2,880606923
4,u166,i346,1,886397596
...,...,...,...,...
99995,u880,i476,3,880175444
99996,u716,i204,5,879795543
99997,u276,i1090,1,874795795
99998,u13,i225,2,882399156


### 그래프 생성

#### G : user로만 이루어진 그래프  /  B : user, item으로 이루어진 그래프

In [6]:
# Flat Python list of user node names, one entry per rating row (so names
# repeat; the graph de-duplicates them when they are added as nodes).
user_node = matrix['userID'].tolist()

In [7]:
# User-only graph. It starts with nodes only; user-user edges are added later
# by the link-prediction step.
G = nx.Graph()
G.add_nodes_from(user_node)

# `info()` is deprecated and removed in NetworkX 3.0; str(G) yields the same
# "Graph with N nodes and M edges" summary without the deprecation warning.
str(G)


  info(G)


'Graph with 943 nodes and 0 edges'

In [8]:
# One (user, item) edge per rating row.
edgelist = list(zip(matrix.userID, matrix.itemID))

# User + item graph. The `bipartite` node attribute marks the side:
# 0 = user, 1 = item (movie).
B = nx.Graph()
B.add_nodes_from(matrix.userID, bipartite=0)
B.add_nodes_from(matrix.itemID, bipartite=1)
B.add_edges_from(edgelist)

# `info()` is deprecated and removed in NetworkX 3.0; str(B) yields the same
# "Graph with N nodes and M edges" summary without the deprecation warning.
str(B)


  info(B)


'Graph with 2625 nodes and 100000 edges'

###  Link Prediction으로 상위 0.5 % 가능성을 가진  edge 생성

common_neighbor_centrality(2020) 알고리즘 사용

* user graph 연결될 수 있는 최대 edge 개수 444153 [ = 943*942/2 ]
* total graph 연결될 수 있는 최대 edge 개수 3444000 [ = 2625*2624/2 ]

In [None]:
# Densify the graphs by link prediction. Each round scores the node pairs of B
# that are not yet connected, keeps the pairs whose score is strictly above the
# 99.5th percentile (the top 0.5 %), and adds them as edges. Rounds repeat
# until B is connected, for at most 10 rounds.
for i in range(10):

    user_edgelist = []   # predicted user-user pairs (added to both B and G)
    total_edgelist = []  # every predicted pair (added to B)

    # (u, v, score) triples for the candidate pairs in the current B.
    pred_link = list(nx.common_neighbor_centrality(B, alpha=0.8))

    # Score threshold for this round. The default percentile method is linear
    # interpolation, so the deprecated `interpolation=` keyword is not needed.
    link_likelihood = np.array([score for _, _, score in pred_link], dtype=float)
    link_99 = np.percentile(link_likelihood, 99.5)

    for u, v, score in pred_link:
        # Compare the raw float score: truncating it with int() first would
        # discard pairs that actually lie above the percentile threshold.
        if score > link_99:
            total_edgelist.append((u, v))
            if u[:1] == v[:1] == 'u':  # both endpoints are users
                user_edgelist.append((u, v))

    B.add_edges_from(total_edgelist)
    G.add_edges_from(user_edgelist)

    total_connected = nx.is_connected(B)
    print('total_graph connected : ', total_connected, ' total graph info : ', str(B))
    print('user_graph connected : ', nx.is_connected(G), ' user graph info : ', str(G))

    # Stop as soon as the user+item graph is fully connected.
    if total_connected:
        break

###  Graph Clustering


변수명 설명
* cluster : 각 그룹 넘버와 그룹에 해당되는 user, item 리스트
* [return] user_c : 각 user 당 해당하는 cluster number


In [None]:
## network clustering
from networkx.algorithms.community import asyn_fluidc
from networkx.algorithms.community import coverage, performance

# Partition-quality scores; graph_clustering() appends one value to each list
# every time it is called.
coverage_list = []
performance_list = []

def graph_clustering(cluster_num, seed=None):
    """Partition the user+item graph ``B`` and return each user's cluster label.

    Runs the asynchronous Fluid Communities algorithm on the module-level graph
    ``B`` and, as a side effect, appends the partition's coverage and
    performance to the module-level ``coverage_list`` / ``performance_list``.

    Parameters
    ----------
    cluster_num : int
        Number of communities to find (passed to ``asyn_fluidc`` as ``k``).
    seed : int, random.Random or None, optional
        Forwarded to ``asyn_fluidc``. The algorithm is randomised, so pass a
        fixed value for a repeatable partition. ``None`` (the default) keeps
        the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``userID`` (read from ``ml-100k/u.user``) with one column,
        ``cluster``. Users that appear in no community keep the sentinel 999.
    """
    total_graph = B

    community = list(asyn_fluidc(total_graph, k=cluster_num, seed=seed))

    coverage_list.append(coverage(total_graph, community))
    performance_list.append(performance(total_graph, community))

    # One row per user listed in u.user, initialised to the "unassigned" sentinel.
    user_c = pd.read_csv(
        'ml-100k/u.user',
        sep='|',
        names=["userID", "age", "gender", "occupation", "zip code"],
        header=None,
        na_filter=False,
    )
    user_c = user_c[['userID']].set_index('userID')
    user_c['cluster'] = 999

    # Communities mix user ('u<id>') and item ('i<id>') nodes; keep users only.
    # Rows are addressed by userID label rather than by position, so the result
    # does not depend on u.user being sorted and gap-free.
    for label, members in enumerate(community):
        for node in members:
            if node[0] == 'u':
                user_c.loc[int(node[1:]), 'cluster'] = label

    return user_c
        

###  Group Recommender System

In [None]:
import numpy as np
from sklearn.metrics import ndcg_score

def grs(cluster_num, default_matrix,  save_rating):
    """Evaluate average-based group recommendation for one cluster count.

    Users are clustered with ``graph_clustering(cluster_num)``. Each cluster's
    predicted item scores are the column-wise mean of its members' rows in
    ``save_rating``. Every user in the module-level ``user_item_test`` table is
    then scored with NDCG against the prediction of their own cluster. Scores
    are averaged per cluster (a cluster with no test users contributes 0) and
    then averaged over all ``cluster_num`` clusters.

    Unlike the earlier version, the input tables are never modified: it used to
    add "mean"/"cluster" columns to ``save_rating`` and ``user_item_test`` in
    place on every call, and wrote the labels through chained indexing, which
    does not reliably update the frame. The per-user "mean" column is no longer
    built because it was dropped again before scoring.

    Parameters
    ----------
    cluster_num : int
        Number of clusters to evaluate.
    default_matrix : pandas.DataFrame
        Not used in the computation; kept so existing calls stay valid.
    save_rating : pandas.DataFrame
        Train user x item rating table (index: userID, columns: itemID).

    Returns
    -------
    float
        The mean per-cluster NDCG. The same value is printed with 5 decimals.
    """
    derived = ["mean", "cluster"]

    # Private copies without helper columns that an earlier in-place version of
    # this function may have left behind on the shared tables.
    train = save_rating.drop(columns=derived, errors="ignore")
    test = user_item_test.drop(columns=derived, errors="ignore")

    kcluster = graph_clustering(cluster_num)  # userID -> cluster label

    # Cluster label per row, aligned by userID; users without a label get NaN
    # and therefore match no cluster below.
    train_label = kcluster["cluster"].reindex(train.index).to_numpy(dtype=float)
    test_label = kcluster["cluster"].reindex(test.index).to_numpy(dtype=float)

    # Only items present in both tables can be compared; using one shared list
    # keeps the prediction and ground-truth columns in the same order.
    test_items = set(test.columns)
    shared_items = [item for item in train.columns if item in test_items]
    train_values = train[shared_items].to_numpy(dtype=float)
    test_values = test[shared_items].to_numpy(dtype=float)

    # y_pred[c]: mean train rating of cluster c's members (all zeros if empty).
    y_pred = np.zeros((cluster_num, len(shared_items)))
    for c in range(cluster_num):
        members = train_values[train_label == c]
        if len(members) > 0:
            y_pred[c] = members.mean(axis=0)
    y_pred = np.nan_to_num(y_pred, nan=0.0)

    # Per-cluster mean NDCG over that cluster's test users.
    result = []
    for c in range(cluster_num):
        rows = test_values[test_label == c]
        if len(rows) > 0:
            total = 0
            for y_true in rows:
                total += ndcg_score([y_true], [y_pred[c]])
            result.append(total / len(rows))
        else:
            result.append(0)

    score = sum(result) / cluster_num
    print('%.5f' % score)
    return score

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Evaluate every cluster count from 2 through 50; each grs() call prints the
# NDCG obtained for that count.
cluster_num = range(2, 51)

for n_clusters in cluster_num:
    grs(n_clusters, user_item_total, user_item_train)

In [None]:
# Coverage and performance of the partitions, in the order graph_clustering()
# recorded them (x axis = position in the list).
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(coverage_list, 'ob--', label='coverage')
ax.plot(performance_list, 'or--', label='performance')
ax.legend(fontsize=15)
plt.show()