In [38]:
import pandas as pd
import numpy as np
import random
import networkx as nx

In [2]:
df = pd.read_csv("vk.csv", delimiter=',')

In [34]:
df.head()

Unnamed: 0,u,v,t,h
0,3248374,11431799,27,6.0
1,3429130,4795235,13,8.0
2,105512,8910840,74,2.0
3,7900466,9739979,36,9.0
4,7296485,9838285,52,2.0


## Число вершин

In [4]:
vertices = len(set([*df["u"].unique(), *df["v"].unique()]))
print(vertices)

3103653


## Число всех ребер

In [84]:
print(f'Число ребер в графе: {len(df["u"])}')

Число ребер в графе: 14847753


In [85]:
print(f'Теоретически возможное число ребер в данном графе: {vertices*(vertices-1)}')

Теоретически возможное число ребер в данном графе: 9632658840756


## Число уникальных ребер

In [91]:
# unique_edges = len(np.unique(df[['u', 'v', 't', 'h']].values, axis=0))
unique_edges = np.unique(df[['u', 'v']].values, axis=0)
print(f'Число уникальных ребер: {len(unique_edges)}')

Число уникальных ребер: 14847753


## Плотность графа
Для неориентированного простого графа плотность графа с числом вершин V  определяется как отношение числа его рёбер E к числу рёбер полного графа

In [8]:
density = 2*unique_edges/((vertices*(vertices-1)))
print(f'Плотность: {density}')

Плотность: 6.444181303023101e-07


## Компоненты слабой связности

In [9]:
class Node:
    def __init__(self, key):
        self.key = key
        self.parent = self
        self.size = 1

class UnionFind(dict):
    def find(self, key):
        node = self.get(key, None)
        if node is None:
            node = self[key] = Node(key)
        else:
            while node.parent != node: 
                # walk up & perform path compression
                node.parent, node = node.parent.parent, node.parent
        return node

    def union(self, key_a, key_b):
        node_a = self.find(key_a)
        node_b = self.find(key_b)
        if node_a != node_b:  # disjoint? -> join!
            if node_a.size < node_b.size:
                node_a.parent = node_b
                node_b.size += node_a.size
            else:
                node_b.parent = node_a
                node_a.size += node_b.size

In [10]:
from collections import defaultdict

def find_components(line_iterator):
    forest = UnionFind()

    for line in line_iterator:
        forest.union(line[0], line[1])

    result = defaultdict(list)
    for key in forest.keys():
        root = forest.find(key)
        result[root.key].append(key)

    return list(result.values())

In [12]:
graph_edges = [edge for edge in df[["u","v"]].values]

14847753


In [13]:
graph_components = find_components(graph_edges)

In [14]:
print(f'Число компонент слабой связности: {len(graph_components)}')

max_component = max(graph_components, key=lambda i: len(i))
print(f'Число вершин в максимальной компоненте связности: {len(max_component)}')
print(f'Доля вершин в максимальной по мощности компоненте слабой связности: {len(max_component)/vertices}')

Число компонент слабой связности: 28175
Число вершин в максимальной компоненте связности: 3041820
Доля вершин в максимальной по мощности компоненте слабой связности: 0.9800773475643056


## Радиус и диаметр наибольшей компоненты слабой связности

Эксцентриситет вершины - расстояние от нее до самой удаленной.  
Диаметр графа - максимальное расстояние между любыми двумя вершинами, то есть наибольший эксцентриситет.  
Радиус графа - наименьший эксцентриситет.

In [15]:
def binary_search(lys, val):
    first = 0
    last = len(lys)-1
    index = -1
    while (first <= last) and (index == -1):
        mid = (first+last)//2
        if lys[mid] == val:
            index = mid
        else:
            if val<lys[mid]:
                last = mid -1
            else:
                first = mid +1
    if index == -1:
        return False
    return index

test = [1,2,3,4]
if binary_search(test,7):
    print(1)

In [104]:
# test = [edge for edge in df.values]
# print(test[:10])
diameter = 0
radius = float("inf")
# max_component_edges = [edge[2] for edge in df.values[:1000] if edge[0] and edge[1] in max_component]

random_edges_max_component = []

# while len(random_edges_max_component) != 500:
#     random_egde = random.choice(df.values)
#     if random_egde[0] in max_component and random_egde[1] in max_component:
#         random_edges_max_component.append(random_egde)
#     print(len(random_edges_max_component))
i=0
for value in df.values[30000:]:
#     if binary_search(max_component, value[0]) and binary_search(max_component, value[1]):
    if value[0] in max_component and value[1] in max_component:
        print(i)
        i+=1
        diameter = max(diameter, value[3])
        print(f"Diameter: {diameter}")
        print(f"Radius: {radius}")
        radius = min(radius, value[3])
        if i==500:
            break

print(random_edges_max_component)

print(f"Diameter: {diameter}")
print(f"Radius: {radius}")
# max_component_edges = df[(df['u'] in max_component) & (df['v'] in max_component)]
# print(max_component_edges)

0
Diameter: 8.0
Radius: inf
1
Diameter: 8.0
Radius: 8.0
2
Diameter: 8.0
Radius: 4.0
3
Diameter: 8.0
Radius: 4.0
4
Diameter: 8.0
Radius: 2.0
5
Diameter: 8.0
Radius: 2.0
6
Diameter: 8.0
Radius: 0.0
7
Diameter: 9.0
Radius: 0.0
8
Diameter: 9.0
Radius: 0.0
9
Diameter: 9.0
Radius: 0.0
10
Diameter: 9.0
Radius: 0.0
11
Diameter: 9.0
Radius: 0.0
12
Diameter: 9.0
Radius: 0.0
13
Diameter: 9.0
Radius: 0.0
14
Diameter: 9.0
Radius: 0.0
15
Diameter: 9.0
Radius: 0.0
16
Diameter: 9.0
Radius: 0.0
17
Diameter: 9.0
Radius: 0.0
18
Diameter: 9.0
Radius: 0.0
19
Diameter: 9.0
Radius: 0.0
20
Diameter: 9.0
Radius: 0.0
21
Diameter: 9.0
Radius: 0.0
22
Diameter: 9.0
Radius: 0.0
23
Diameter: 9.0
Radius: 0.0
24
Diameter: 9.0
Radius: 0.0
25
Diameter: 9.0
Radius: 0.0
26
Diameter: 9.0
Radius: 0.0
27
Diameter: 9.0
Radius: 0.0
28
Diameter: 9.0
Radius: 0.0
29
Diameter: 9.0
Radius: 0.0
30
Diameter: 9.0
Radius: 0.0
31
Diameter: 9.0
Radius: 0.0
32
Diameter: 9.0
Radius: 0.0
33
Diameter: 9.0
Radius: 0.0
34
Diameter: 9.0
Radius:

KeyboardInterrupt: 

## Число треугольников

In [60]:
graph = nx.Graph()
print(df[["u",'v']].values[:10])
graph.add_edges_from(df[["u",'v']].values)

[[ 3248374 11431799]
 [ 3429130  4795235]
 [  105512  8910840]
 [ 7900466  9739979]
 [ 7296485  9838285]
 [ 9829376 11174212]
 [10013376 14878720]
 [ 6915886 13613865]
 [ 3512046 14855981]
 [ 9697550 12115546]]


In [66]:
graph_test = nx.Graph()
graph_test.add_edges_from(df[["u",'v']].values[:10000])
print(graph_test.degree)

[(3248374, 1), (11431799, 1), (3429130, 1), (4795235, 1), (105512, 1), (8910840, 1), (7900466, 1), (9739979, 1), (7296485, 1), (9838285, 1), (9829376, 1), (11174212, 1), (10013376, 1), (14878720, 1), (6915886, 1), (13613865, 1), (3512046, 1), (14855981, 1), (9697550, 1), (12115546, 1), (9829505, 1), (15571790, 1), (987890, 1), (2402895, 1), (9679853, 1), (13451558, 1), (12719783, 1), (13169396, 1), (10326271, 1), (12398064, 1), (7461148, 1), (13535528, 1), (6337390, 1), (13886495, 1), (2605924, 1), (5199129, 1), (11877890, 1), (15383218, 1), (11047083, 4), (15915295, 1), (5252845, 1), (9348611, 1), (5223469, 1), (9781174, 1), (1846231, 1), (5878280, 1), (63653, 1), (15392514, 1), (6638379, 1), (10519603, 1), (8419757, 1), (14346778, 1), (3632481, 1), (6192749, 1), (1934338, 1), (12066615, 1), (12258303, 2), (13585086, 1), (3863992, 1), (6036165, 1), (3283379, 1), (9299736, 1), (1237243, 2), (13805095, 1), (2595048, 1), (9278718, 1), (4075896, 1), (5522942, 1), (6722433, 1), (13878548, 

In [61]:
from collections import deque, defaultdict
from itertools import chain
from itertools import islice


def enumerate_all_cliques(G):
    """Returns all cliques in an undirected graph.

    This function returns an iterator over cliques, each of which is a
    list of nodes. The iteration is ordered by cardinality of the
    cliques: first all cliques of size one, then all cliques of size
    two, etc.

    Returns
    -------
    iterator
        An iterator over cliques, each of which is a list of nodes in
        `G`. The cliques are ordered according to size.

    Notes
    -----
    To obtain a list of all cliques, use
    `list(enumerate_all_cliques(G))`. However, be aware that in the
    worst-case, the length of this list can be exponential in the number
    of nodes in the graph (for example, when the graph is the complete
    graph). This function avoids storing all cliques in memory by only
    keeping current candidate node lists in memory during its search.
    """
    index = {}
    nbrs = {}
    for u in G:
        index[u] = len(index)
        # Neighbors of u that appear after u in the iteration order of G.
        nbrs[u] = {v for v in G[u] if v not in index}

    queue = deque(([u], sorted(nbrs[u], key=index.__getitem__)) for u in G)
    # Loop invariants:
    # 1. len(base) is nondecreasing.
    # 2. (base + cnbrs) is sorted with respect to the iteration order of G.
    # 3. cnbrs is a set of common neighbors of nodes in base.
    while queue:
        base, cnbrs = map(list, queue.popleft())
        yield base
        for i, u in enumerate(cnbrs):
            # Use generators to reduce memory consumption.
            queue.append(
                (
                    chain(base, [u]),
                    filter(nbrs[u].__contains__, islice(cnbrs, i + 1, None)),
                )
            )
            
def count_k_cliques(G, k):
    k_cliques_count = 0
    for clique in enumerate_all_cliques(G): 
        if len(clique) > k: 
            break
        elif len(clique) == k: 
            k_cliques_count += 1
    return k_cliques_count
        
print(count_k_cliques(graph, 3))

KeyboardInterrupt: 

## Локальный кластерный коэффициент вершины

In [75]:
# vertices_list = set([*df["u"].unique(), *df["v"].unique()])
print()

In [94]:
# vertex_degrees = defaultdict(int)

# for edge in unique_edges:
#     vertex_degrees[edge[0]] = vertex_degrees[edge[0]]+1
#     vertex_degrees[edge[1]] = vertex_degrees[edge[1]]+1


vertex_degrees = defaultdict(list)

for edge in unique_edges:
    vertex_degrees[edge[0]].append(edge[[1]])
    vertex_degrees[edge[1]].append(edge[[0]])


print(f'Степени вершин: {vertex_degrees[11431799]}')


# print(f'Степени вершин: {vertex_degrees}')

Степени вершин: [array([3248374]), array([6532783]), array([11051562]), array([12967294]), array([13894236]), array([14217030]), array([15736225])]


In [103]:
local_coef = defaultdict(int)
print(unique_edges)
for key, value in vertex_degrees.items():
#     print([value[i], value[i+1]])
    for i in range(len(value)-1):
        if [int(value[i]), int(value[i+1])] in unique_edges:
#             print("1111111")
            local_coef[key]+=1

print(local_coef[11431799])

[[       5  2134182]
 [       5  4766806]
 [       5  6862539]
 ...
 [15970352 15975718]
 [15972807 15981116]
 [15980834 15981716]]
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/antonkondrahin/Library/Caches/pypoetry/virtualenvs/finite-graphs-7eI0dcYP-py3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3397, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/sp/t3g53tkd4nnc2w9tm24fpp_r0000gn/T/ipykernel_2165/1480677661.py", line -1, in <cell line: 3>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonkondrahin/Library/Caches/pypoetry/virtualenvs/finite-graphs-7eI0dcYP-py3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 1992, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/Users/antonkondrahin/Library/Caches/pypoetry/virtualenvs/finite-graphs-7eI0dcYP-py3.10/lib/python3.10/site-packages/IPython/core/ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/Users/anton

In [97]:
print(local_coef[11431795])

0
