In [134]:
import networkx as nx
import scipy as sp
import numpy as np
import sys
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import jaccard_score

In [2]:
# Directory holding the per-user data files; each user id below is used as a
# file-name prefix (e.g. PATH + "<id>.circles", PATH + "<id>.edges").
PATH = 'data/facebook/'
users = [
    "0", "107", "348", "414", "686",
    "698", "1684", "1912", "3437", "3980",
]

In [3]:
def user_process(user):
    """Eigendecompose the Laplacian of one user's (unweighted) graph.

    The graph is built from two files under ``PATH``:

    * ``<user>.circles`` -- tab-separated lines; every field after the first
      is added as an integer node.
    * ``<user>.edges`` -- lines of the form ``"a b"``; each becomes an
      undirected edge between integer nodes ``a`` and ``b``.

    Circle members are inserted before any edge is read, so they come first
    in the graph's node order, in order of first appearance in the circles
    file.

    Parameters
    ----------
    user : str
        User id used as the file-name prefix.

    Returns
    -------
    (eigenvalues, eigenvectors)
        The pair returned by ``np.linalg.eigh`` for the dense Laplacian.
    """
    graph = nx.Graph()

    with open(PATH + user + ".circles") as circles_file:
        for row in circles_file:
            members = row.rstrip().split("\t")[1:]  # field 0 is not a node
            graph.add_nodes_from(int(member) for member in members)

    with open(PATH + user + ".edges") as edges_file:
        for row in edges_file:
            src, dst = row.rstrip().split(" ")
            graph.add_edge(int(src), int(dst))

    laplacian = nx.laplacian_matrix(graph)
    eigenvalues, eigenvectors = np.linalg.eigh(laplacian.todense())
    return eigenvalues, eigenvectors

In [4]:
def get_eigengap(user_ev):
    """Return the differences between consecutive entries of ``user_ev``.

    Element ``i`` of the result is ``user_ev[i + 1] - user_ev[i]``, so the
    result has one element fewer than the input (and is empty for inputs of
    length 0 or 1).
    """
    return [
        user_ev[idx] - user_ev[idx - 1]
        for idx in range(1, len(user_ev))
    ]

In [5]:
# Per-user results of the unweighted analysis, keyed by user id:
# eigenvectors, eigenvalues and consecutive eigenvalue gaps.
users_eigenvectors, users_eigenvalues, users_eigengap = {}, {}, {}

In [6]:
# Laplacian eigendecomposition and eigengaps for each user (unweighted graph).
for user in users:
    eigvals, eigvecs = user_process(user)
    users_eigenvectors[user] = eigvecs
    # Only the eigenvalues are sorted; the eigenvector columns keep the order
    # in which np.linalg.eigh returned them.
    users_eigenvalues[user] = sorted(eigvals)
    users_eigengap[user] = get_eigengap(users_eigenvalues[user])



In [42]:
# Manual inspection cell: walk over the first 40 eigengaps of one user to
# locate the first peak. Any id from `users` can be substituted below.
# The print is disabled, so running this cell produces no output.
user = "3437"
for i, gap in enumerate(users_eigengap[user][:40]):
    pass  # print(i + 1, gap)

##### User "0": Optimal number of clusters: 14  | User "107": Optimal number of clusters: 2  | User "348": Optimal number of clusters: 4
##### User "414": Optimal number of clusters: 8  | User "686": Optimal number of clusters: 3  | User "698": Optimal number of clusters: 7
##### User "1684": Optimal number of clusters: 11  | User "1912": Optimal number of clusters: 6  | User "3437": Optimal number of clusters: 2
##### User "3980": Optimal number of clusters: 10

In [53]:
def cluster_nodes(user, n_vectors, users_on_cluster, users_eigenvectors, random_state=0):
    """Run k-means on the leading Laplacian eigenvectors of one user.

    Parameters
    ----------
    user : str
        Key into ``users_eigenvectors``.
    n_vectors : int
        Number of leading eigenvector columns used as the feature space.
    users_on_cluster : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.
    users_eigenvectors : dict
        Maps user id to a 2-D eigenvector matrix whose columns are
        eigenvectors (the layout returned by ``np.linalg.eigh``).
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The fixed default makes the resulting
        labels (and every score derived from them) reproducible between
        runs; pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator; cluster assignments are in ``labels_``, one per
        row of the eigenvector matrix.
    """
    # Each row is one node; keeping the first `n_vectors` columns is the same
    # as transposing, taking the first `n_vectors` rows and transposing back.
    embedding = np.asarray(users_eigenvectors[user])[:, :n_vectors]
    kmeans = KMeans(n_clusters=users_on_cluster, random_state=random_state)
    return kmeans.fit(embedding)

In [117]:
def evaluate_similarity(user, resulting_clusters):
    """Score cluster labels against the circles in ``<user>.circles``.

    Each member listed in the circles file is assigned the name (first
    tab-separated field) of the first circle it appears in; later occurrences
    of the same member are ignored. The resulting label list is ordered by
    first appearance in the file.

    That list is compared with the first ``len(circles)`` entries of
    ``resulting_clusters`` using the adjusted Rand index. This only makes
    sense when those entries are in the same order, which holds when the
    labels come from a graph built by ``user_process`` /
    ``user_process_weight`` (circle members are inserted first, in file
    order).

    Parameters
    ----------
    user : str
        User id used as the file-name prefix under ``PATH``.
    resulting_clusters : sequence
        Predicted cluster label per node.

    Returns
    -------
    float
        The value returned by ``adjusted_rand_score``.
    """
    first_circle = {}  # member -> name of the first circle containing it
    with open(PATH + user + ".circles") as circles_file:
        for row in circles_file:
            circle_name, *members = row.rstrip().split("\t")
            for member in members:
                # Insertion order is kept, so values() follows first appearance.
                first_circle.setdefault(member, circle_name)

    circles = list(first_circle.values())
    return adjusted_rand_score(circles, resulting_clusters[:len(circles)])

### Evaluate the similarity of the resulting clusters

In [119]:
# Number of k-means clusters per user for the unweighted graph.
users_on_cluster = {
    "0": 14,
    "107": 2,
    "348": 4,
    "414": 8,
    "686": 3,
    "698": 7,
    "1684": 11,
    "1912": 6,
    "3437": 2,
    "3980": 10,
}

In [142]:
# Cluster every user's unweighted graph and score the result against the
# circles file of that user.
n_vectors = 40  # number of leading eigenvectors fed to k-means
users_score = {}
for user in users:
    k_means_model = cluster_nodes(
        user,
        n_vectors,
        users_on_cluster[user],
        users_eigenvectors,
    )
    resulting_clusters = k_means_model.labels_
    users_score[user] = evaluate_similarity(user, resulting_clusters)
print(users_score)

{'0': 0.19707871781363231, '107': 0.004910420844326661, '348': -0.015293721980308452, '414': 0.0065681919906997056, '686': 0.0006188686047141599, '698': 0.023402092552852874, '1684': 0.28146483969680974, '1912': 7.353104021431937e-05, '3437': 0.0, '3980': 0.024651877822554413}


##### Save score results

In [143]:
# Persist the unweighted scores to a timestamped text file.
# The "results/" directory must already exist; open() does not create it.
now = datetime.now()
# Timestamp format: dd-mm-YYYY_HH:MM:SS
# NOTE(review): ':' is not a valid file-name character on Windows -- confirm
# the target platforms before relying on this name.
dt_string = now.strftime("%d-%m-%Y_%H:%M:%S")
# The context manager closes the file even if the write raises.
with open("results/scores_no_weight_" + dt_string + ".txt", "w") as f:
    f.write(str(users_score))

# Weighted graph (edges weighted by the Jaccard score of the two endpoints' feature vectors)

In [122]:
def get_user_features(users):
    """Load the ``.feat`` file of every user into nested dictionaries.

    Each line of ``PATH + <user> + ".feat"`` is split on single spaces: the
    first token is used (as a string) as the key, the remaining tokens are
    converted to a ``np.intc`` array.

    Parameters
    ----------
    users : iterable of str
        User ids used as file-name prefixes.

    Returns
    -------
    dict
        ``{user: {first_token: np.ndarray of np.intc}}``.
    """
    features = {}
    for ego in users:
        per_node = features[ego] = {}
        with open(PATH + ego + ".feat", "r") as feat_file:
            for row in feat_file:
                key, *values = row.rstrip().split(" ")
                per_node[key] = np.asarray(values, dtype=np.intc)
    return features

In [123]:
def user_process_weight(user, users_features):
    """Eigendecompose the Laplacian of one user's feature-weighted graph.

    Nodes come from ``<user>.circles`` (every tab-separated field after the
    first), edges from ``<user>.edges`` (``"a b"`` per line). Each edge
    carries a ``weight`` attribute equal to ``jaccard_score`` of the two
    endpoints' feature vectors.

    Parameters
    ----------
    user : str
        User id used as the file-name prefix under ``PATH``.
    users_features : dict
        ``users_features[user][node]`` must hold the feature vector of
        ``node``, where ``node`` is the string token read from the edges file.

    Returns
    -------
    (eigenvalues, eigenvectors)
        The pair returned by ``np.linalg.eigh`` for the dense Laplacian.
    """
    graph = nx.Graph()

    with open(PATH + user + ".circles") as circles_file:
        for row in circles_file:
            for member in row.rstrip().split("\t")[1:]:
                graph.add_node(int(member))

    with open(PATH + user + ".edges") as edges_file:
        for row in edges_file:
            src, dst = row.rstrip().split(" ")
            u, v = int(src), int(dst)
            # Features are keyed by the raw string tokens, not the int ids.
            similarity = jaccard_score(
                users_features[user][src],
                users_features[user][dst],
            )
            graph.add_edge(u, v, weight=similarity)

    laplacian = nx.laplacian_matrix(graph)
    eigenvalues, eigenvectors = np.linalg.eigh(laplacian.todense())
    return eigenvalues, eigenvectors

In [124]:
# Feature vectors read from each user's .feat file, keyed by user id and then
# by the first token of each line; used below to weight the graph edges.
users_features = get_user_features(users)

In [125]:
# Per-user results of the weighted analysis, keyed by user id:
# eigenvectors, eigenvalues and consecutive eigenvalue gaps.
users_eigenvectors_wei, users_eigenvalues_wei, users_eigengap_wei = {}, {}, {}

In [126]:
# Laplacian eigendecomposition and eigengaps for each user (weighted graph).
for user in users:
    eigvals_wei, eigvecs_wei = user_process_weight(user, users_features)
    users_eigenvectors_wei[user] = eigvecs_wei
    # Only the eigenvalues are sorted; the eigenvector columns keep the order
    # in which np.linalg.eigh returned them.
    users_eigenvalues_wei[user] = sorted(eigvals_wei)
    users_eigengap_wei[user] = get_eigengap(users_eigenvalues_wei[user])



In [127]:
# Manual inspection cell: walk over the first 40 eigengaps of one user's
# weighted graph to locate the first peak. Any id from `users` can be
# substituted below. The print is disabled, so this cell produces no output.
user = "3980"
for i, gap in enumerate(users_eigengap_wei[user][:40]):
    pass  # print(i + 1, gap)

##### User "0": Optimal number of clusters: 15  | User "107": Optimal number of clusters: 5  | User "348": Optimal number of clusters: 4
##### User "414": Optimal number of clusters: 9  | User "686": Optimal number of clusters: 6  | User "698": Optimal number of clusters: 6
##### User "1684": Optimal number of clusters: 11  | User "1912": Optimal number of clusters: 7  | User "3437": Optimal number of clusters: 5
##### User "3980": Optimal number of clusters: 10

In [128]:
# Number of k-means clusters per user for the weighted graph.
users_on_cluster_wei = {
    "0": 15,
    "107": 5,
    "348": 4,
    "414": 9,
    "686": 6,
    "698": 6,
    "1684": 11,
    "1912": 7,
    "3437": 5,
    "3980": 10,
}

In [138]:
# Cluster every user's weighted graph and score the result against the
# circles file of that user.
n_vectors = 40  # number of leading eigenvectors fed to k-means
users_score_wei = {}
for user in users:
    k_means_model = cluster_nodes(
        user,
        n_vectors,
        users_on_cluster_wei[user],
        users_eigenvectors_wei,
    )
    resulting_clusters = k_means_model.labels_
    users_score_wei[user] = evaluate_similarity(user, resulting_clusters)

print(users_score_wei)

{'0': 0.20745902797058488, '107': 0.0020369601543860543, '348': 0.020774068254074646, '414': 0.5207682901402869, '686': 0.015341890789607465, '698': -0.013327621703610128, '1684': 0.23248408654174363, '1912': 0.016091167008400326, '3437': 0.003675875320655584, '3980': -0.08448764102686365}


##### Save score results

In [139]:
# Persist the weighted scores to a timestamped text file.
# The "results/" directory must already exist; open() does not create it.
now = datetime.now()
# Timestamp format: dd-mm-YYYY_HH:MM:SS
# NOTE(review): ':' is not a valid file-name character on Windows -- confirm
# the target platforms before relying on this name.
dt_string = now.strftime("%d-%m-%Y_%H:%M:%S")
# The context manager closes the file even if the write raises.
with open("results/scores_with_weight_" + dt_string + ".txt", "w") as f:
    f.write(str(users_score_wei))