In [1]:
import json
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy as sp
import vk
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score

In [2]:
# JSON file in which fetched friend lists are cached between runs.
cache_filename = 'graph_vertex_cache.json'

In [3]:
# Credentials are kept outside the notebook: the first line of author.txt is
# used as the VK login and the second line as the password.
with open("author.txt") as f:
    login, passwd = (f.readline().rstrip() for _ in range(2))

session = vk.AuthSession(
    app_id='6070120',
    user_login=login,
    user_password=passwd,
)

api = vk.API(session)
api_version = "5.73"  # passed as `v=` on every API request

In [4]:
# Load the friend-list cache written by an earlier run; start with an empty
# cache when the file does not exist yet. The file is opened in a `with`
# block so the handle is closed as soon as parsing finishes.
try:
    with open(cache_filename, 'r') as cache_file:
        cache_dict = json.load(cache_file)
except FileNotFoundError:
    cache_dict = {}

In [12]:
# JSON object keys are always strings; turn them back into integer user ids so
# lookups by id succeed.
cache_dict = {
    int(raw_id): friend_ids
    for raw_id, friend_ids in cache_dict.items()
}

In [8]:
def get_friends_with_sleep(user_id):
    """Fetch the friend list of ``user_id`` through ``api.friends.get``.

    Throttling: the module-level ``requests_count`` is incremented after each
    successful call; once it reaches 3 it is reset and the function sleeps for
    one second before the next request. If the API answers with error code 6
    (too many requests per second) the call is retried after a two-second
    pause. Any other ``VkAPIError`` is re-raised to the caller.

    Returns the list of friends: the response itself when it already is a
    list, otherwise its ``'items'`` entry.
    """
    global requests_count
    while True:
        if requests_count == 3:  # avoid making too many requests
            requests_count = 0
            time.sleep(1)

        try:
            response = api.friends.get(user_id=user_id, v=api_version)
        except vk.exceptions.VkAPIError as err:
            if err.code != 6:
                raise err
            # code 6: too many requests per second -- back off, then retry
            time.sleep(2)
            continue

        requests_count += 1
        break

    return response if isinstance(response, list) else response['items']

In [9]:
def construct_graph(user_id, output_graph, api, waves=3):
    """Grow ``output_graph`` from ``user_id`` by following friend lists.

    ``user_id`` is added as a node. While ``waves`` is non-zero its friends are
    looked up (from ``cache_dict`` when present, otherwise fetched with
    ``get_friends_with_sleep`` and stored in ``cache_dict``), every friend not
    yet in the graph is expanded recursively with ``waves - 1``, and an edge is
    added to each friend that is in the graph afterwards.

    A user for whom the API reports error code 18 (deleted or banned) is
    removed from the graph again and treated as having no friends; any other
    ``VkAPIError`` propagates.

    ``api`` is only handed on to the recursive calls.
    """
    output_graph.add_node(user_id)
    if waves == 0:
        return

    # cache_dict is only mutated, never rebound, so no `global` statement is needed.
    if user_id not in cache_dict:
        try:
            cache_dict[user_id] = get_friends_with_sleep(user_id)
        except vk.exceptions.VkAPIError as err:
            if err.code != 18:
                raise err
            # code 18: deleted or banned user -- drop the node, nothing to expand
            output_graph.remove_node(user_id)

    for friend in cache_dict.get(user_id, []):
        if friend not in output_graph.nodes():
            construct_graph(friend, output_graph, api, waves - 1)
        # the friend may have been removed again (banned), so re-check
        if friend in output_graph.nodes():
            output_graph.add_edge(user_id, friend)

In [10]:
def graph_from_cache(user_id, output_graph):
    """Rebuild the friendship graph reachable from ``user_id`` using only ``cache_dict``.

    Starting at ``user_id``, every user that has an entry in ``cache_dict`` is
    expanded depth-first: each friend becomes a node and is connected to the
    user by an edge. Users without a cache entry are added as nodes but not
    expanded.

    The traversal keeps its own stack instead of recursing, so arbitrarily
    long friend chains cannot exceed Python's recursion limit. Nodes and edges
    are inserted in the same order as a recursive depth-first walk would
    produce: a friend's whole subtree is finished before the edge to that
    friend is added.

    Parameters
    ----------
    user_id : hashable
        Id of the user to start from.
    output_graph : graph object
        Mutated in place; must support ``add_node``, ``add_edge`` and ``in``
        (as ``networkx.Graph`` does).
    """
    output_graph.add_node(user_id)
    # Each stack entry: (user, iterator over that user's not-yet-processed friends).
    stack = [(user_id, iter(cache_dict.get(user_id, ())))]

    while stack:
        current, pending_friends = stack[-1]
        for friend in pending_friends:
            # Test membership on the graph itself rather than on nodes().
            if friend not in output_graph:
                # Descend into the unseen friend; the edge current--friend is
                # added once that friend's subtree is complete (see `else`).
                output_graph.add_node(friend)
                stack.append((friend, iter(cache_dict.get(friend, ()))))
                break
            output_graph.add_edge(current, friend)
        else:
            # All friends of `current` handled: return to the parent and add
            # the edge that was deferred when we descended.
            stack.pop()
            if stack:
                parent = stack[-1][0]
                output_graph.add_edge(parent, current)

In [15]:
# Assemble the friendship graph around the starting user from the local cache.
start_id = 10938483  # user id the graph is grown from
requests_count = 0  # API-call counter used by get_friends_with_sleep
main_graph = nx.Graph()
graph_from_cache(start_id, main_graph)
# To crawl the API instead of reading the cache, run:
# construct_graph(start_id, main_graph, api, 4)

In [16]:
# Number of users (nodes) in the assembled graph.
main_graph.number_of_nodes()

1863819

In [33]:
# Write the friend-list cache back to disk so later runs can reuse it.
with open(cache_filename, mode="w") as f:
    f.write(json.dumps(cache_dict))

In [18]:
# Adjacency matrix of main_graph; passed to SpectralClustering below as the
# precomputed affinity matrix.
adjacency = nx.adjacency_matrix(main_graph)

In [19]:
def silhouette(k_clusters, data, labels):
    """Return the mean silhouette coefficient of a clustering.

    For sample ``i`` the coefficient is ``(b_i - a_i) / max(a_i, b_i)`` where
    ``a_i`` is the mean Euclidean distance from ``i`` to the *other* members of
    its own cluster and ``b_i`` is the smallest mean distance from ``i`` to the
    members of any other non-empty cluster. Following the usual convention, a
    sample that is alone in its cluster scores 0, as does a sample for which
    both ``a_i`` and ``b_i`` are 0.

    Samples are matched to labels by position, so rows do not need to be
    hashable and duplicate rows are handled correctly.

    Parameters
    ----------
    k_clusters : int
        Number of clusters; every label must lie in ``range(k_clusters)``.
    data : array-like or scipy sparse matrix
        One row per sample. Sparse input is densified with ``toarray()``.
        A 1-D sequence is treated as samples with a single feature.
    labels : sequence of int
        Cluster label of each row of ``data``.

    Returns
    -------
    float
        Average silhouette coefficient over all samples, in ``[-1, 1]``.

    Raises
    ------
    ValueError
        If ``labels`` does not have one entry per row, if a label lies outside
        ``range(k_clusters)``, or if fewer than two clusters are non-empty.

    Notes
    -----
    Runs in O(n_samples**2 * n_features) time; one row of distances is held in
    memory at a time.
    """
    if hasattr(data, "toarray"):
        # scipy sparse matrices cannot be turned into a numeric 2-D array by np.asarray
        data = data.toarray()
    points = np.asarray(data, dtype=float)
    if points.ndim == 1:
        points = points.reshape(-1, 1)

    labels = np.asarray(labels)
    if labels.shape != (points.shape[0],):
        raise ValueError("labels must contain exactly one entry per row of data")

    masks = [labels == cluster for cluster in range(k_clusters)]
    sizes = [int(mask.sum()) for mask in masks]
    if sum(sizes) != labels.shape[0]:
        raise ValueError("every label must lie in range(k_clusters)")
    populated = [cluster for cluster in range(k_clusters) if sizes[cluster] > 0]
    if len(populated) < 2:
        raise ValueError("silhouette requires at least two non-empty clusters")

    scores = np.zeros(labels.shape[0])
    for i, own in enumerate(labels):
        own = int(own)
        if sizes[own] == 1:
            continue  # singleton cluster: coefficient defined as 0

        dist = np.linalg.norm(points - points[i], axis=1)
        # dist[i] == 0, so summing over the whole cluster and dividing by
        # (size - 1) averages over the other members only.
        a_i = dist[masks[own]].sum() / (sizes[own] - 1)
        b_i = min(dist[masks[other]].mean() for other in populated if other != own)

        widest = max(a_i, b_i)
        if widest > 0:  # all-coincident points would otherwise divide by zero
            scores[i] = (b_i - a_i) / widest

    return float(scores.mean())

In [None]:
# Fit spectral clustering for k = 2..9 clusters, using the adjacency matrix as
# the precomputed affinity, and report the k with the highest silhouette score.
#
# `silhouettes` maps k -> fitted SpectralClustering model (name kept so later
# cells that use it keep working); `silhouette_scores` maps k -> score so the
# values can be inspected afterwards instead of being discarded inside max().
silhouettes = {}
silhouette_scores = {}
for k in range(2, 10):
    # Fixed random_state makes the clustering, and hence the chosen k, reproducible.
    silhouettes[k] = SpectralClustering(
        n_clusters=k, affinity='precomputed', random_state=0
    ).fit(adjacency)
    silhouette_scores[k] = silhouette(k, adjacency, silhouettes[k].labels_)

best_k = max(silhouette_scores, key=silhouette_scores.get)
print(best_k)