## Pokemon 데이터 분석

In [None]:
# Optional environment fix, left disabled.
# NOTE(review): presumably pins `decorator` to work around an incompatibility with the
# installed networkx version -- confirm before enabling.
# !pip install decorator==4.3

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the Pokemon table from the CSV file in the working directory.
pokemon_csv = 'Pokemon.csv'
data = pd.read_csv(pokemon_csv)

# Preview the first five rows as the cell output.
data.head(n = 5)

In [None]:
# Network analysis
# Build an undirected graph with one edge per row of `data`, linking the row's
# 'Name' value to its 'Type 1' value.
g = nx.from_pandas_edgelist(data, source = 'Name', target = 'Type 1')

# nx.info() was removed in NetworkX 3.0, so report the graph size directly;
# these accessors exist in every NetworkX release.
print('Number of nodes:', g.number_of_nodes())
print('Number of edges:', g.number_of_edges())

In [None]:
# Draw the Name -- Type 1 graph using a force-directed (spring) layout.
# spring_layout starts from random positions, so a fixed seed is passed to make
# the figure reproducible from one run to the next.
fig, ax = plt.subplots(figsize = (30, 30))
pos = nx.spring_layout(g, k = 0.15, seed = 1723)
nx.draw_networkx(g, pos, ax = ax, node_size = 30, node_color = 'blue')
plt.show()

[참고] spring_layout 함수 (reference -> https://networkx.org/documentation/stable/reference/generated/networkx.drawing.layout.spring_layout.html
(https://networkx.org/documentation/stable/reference/generated/networkx.drawing.layout.spring_layout.html))
- 노드의 위치를 계산하기 위해 노드와 에지를 고려
- 단점은 계산 비용이 높음 (속도가 느림)

## 왕좌의 게임 등장인물 관계망 분석

In [None]:
# Read the edge table used to build the character network below.
edges_csv = 'asoiaf-all-edges.csv'
df = pd.read_csv(edges_csv)

# Show the frame as the cell output.
df

In [None]:
# Undirected graph with one edge per row of `df`: 'Source' -- 'Target'.
G = nx.from_pandas_edgelist(df, 'Source', 'Target')

# Graph size; the trailing numbers are the values recorded by the original author.
print(G.number_of_nodes()) # 797
print(G.number_of_edges()) # 2824

In [None]:
# Degree centrality of every node in G, as a mapping keyed by node.
deg_cen = nx.degree_centrality(G)

In [None]:
# Display the complete node -> degree-centrality mapping as the cell output.
deg_cen

In [None]:
# (node, score) pairs ordered from highest to lowest degree centrality; show the first five.
deg_cen_ranked = sorted(deg_cen.items(), key = lambda kv : kv[1], reverse = True)
deg_cen_ranked[:5]

In [None]:
# PageRank score for every node of G.
page_rank = nx.pagerank(G)

# (node, score) pairs ordered from highest to lowest score; show the first five.
page_rank_ranked = sorted(page_rank.items(), key = lambda kv : kv[1], reverse = True)
page_rank_ranked[:5]

In [None]:
# Closeness centrality for every node of G.
clos_cen = nx.closeness_centrality(G)

# (node, score) pairs ordered from highest to lowest score; show the first five.
clos_cen_ranked = sorted(clos_cen.items(), key = lambda kv : kv[1], reverse = True)
clos_cen_ranked[:5]

In [None]:
# Betweenness centrality for every node of G.
bet_cen = nx.betweenness_centrality(G)

# (node, score) pairs ordered from highest to lowest score; show the first five.
bet_cen_ranked = sorted(bet_cen.items(), key = lambda kv : kv[1], reverse = True)
bet_cen_ranked[:5]

## facebook network analysis

- data download --> Stanford Large Network Dataset Collection (http://snap.stanford.edu/data/index.html  (http://snap.stanford.edu/data/index.html))

In [None]:
# Load the combined Facebook edge list (gzip-compressed, no header row).
# The SNAP file separates the two node ids with a single space, so the delimiter
# must be given explicitly; with the default ',' every line would be read as one
# field and 'end_node' would be filled with NaN.
facebook = pd.read_csv(
    'facebook_combined.txt.gz',
    compression = 'gzip',
    sep = ' ',
    names = ['start_node', 'end_node'],
)
facebook

## 그래프 정의

In [None]:
# Build an undirected graph from the Facebook edge list ('start_node' -- 'end_node').
# Note: this rebinds G, which until here held the graph built from 'asoiaf-all-edges.csv'.
G = nx.from_pandas_edgelist(facebook, 'start_node', 'end_node')

## 그래프 시각화

In [None]:
# Prepare a frameless canvas for the network drawing.
fig, ax = plt.subplots(figsize = (15, 9))
ax.axis('off')

# Seeded spring layout limited to 15 iterations.
pos = nx.spring_layout(G, seed = 1723, iterations = 15)

# Small unlabeled nodes and thin edges.
plot_options = dict(node_size = 10, with_labels = False, width = 0.15)
nx.draw_networkx(G, pos = pos, ax = ax, **plot_options)

## 그래프 형상(topological attributes) 분석

In [None]:
# len() of a networkx graph is its node count.
print('Nodes : ', len(G))

In [None]:
# Graph.size() with no weight argument is the number of edges.
print('Edges : ', G.size())

In [None]:
# Diameter = the largest eccentricity, i.e. the longest shortest path in G.

# Shortest-path length tables: source node -> {target node: length}.
shortest_path_lengths = {
    source: lengths for source, lengths in nx.all_pairs_shortest_path_length(G)
}

# Reuse the precomputed tables so eccentricity does not recompute the paths.
eccentricities = nx.eccentricity(G, sp = shortest_path_lengths)
diameter = max(eccentricities.values())
diameter

In [None]:
# Compute the average shortest path length for each node
# (mean over that node's table of shortest-path lengths).
average_path_lengths = [
    np.mean(list(spl.values())) for spl in shortest_path_lengths.values()
]

# The average over all nodes.
# The original referenced the misspelled name `avearge_path_lengths`, which
# raised a NameError.
np.mean(average_path_lengths)

In [None]:
# One integer counter per possible path length, covering indices 0 through diameter.
path_lengths = np.zeros(shape = diameter + 1, dtype = int)
path_lengths

In [None]:
# Extract the frequency of shortest path lengths between two nodes.
# The counter array is rebuilt here so that re-running this cell does not add
# the counts on top of those from a previous run.
path_lengths = np.zeros(diameter + 1, dtype = int)
for pls in shortest_path_lengths.values():
    # Distinct lengths in this node's table and how often each one occurs.
    pl, cnts = np.unique(list(pls.values()), return_counts = True)
    path_lengths[pl] += cnts

# Express frequency distribution as a percentage (ignoring path lengths of 0)
freq_percent = 100 * path_lengths[1:] / path_lengths[1:].sum()

# Plot the frequency distribution (ignoring path lengths of 0) as a percentage
fig, ax = plt.subplots(figsize = (15, 8))
ax.bar(np.arange(1, diameter + 1), height = freq_percent)
ax.set_title('Distribution of shortest path length in G', fontdict = {'size':35}, loc = 'center')
ax.set_xlabel('Shortest Path Length', fontdict = {'size':22})
ax.set_ylabel('Frequency (%)', fontdict = {'size':22})
plt.show()

## Centrality measures

In [None]:
# Degree centrality for every node of G.
deg_cen = nx.degree_centrality(G)

# (node, score) pairs ordered from highest to lowest score; show the first ten.
deg_cen_ranked = sorted(deg_cen.items(), key = lambda kv : kv[1], reverse = True)
deg_cen_ranked[:10]

In [None]:
# Betweenness centrality for every node of G.
# The original line ended with a stray backslash that joined it to the next
# statement and made the cell a SyntaxError.
bet_cen = nx.betweenness_centrality(G)

# Ten nodes with the highest betweenness centrality, as (node, score) pairs.
sorted(bet_cen.items(), key = lambda x : x[1], reverse = True)[0:10]

- 0, 107, 1684, 1912, 3437 노드는 높은 연결중심성과 매개중심성을 가지는 'influence' 노드로 볼 수 있음
- 567, 1085, 698 노드는 매개중심성은 높지만 연결중심성은 높지 않음. 네트워크에서 이들 노드는 'popular users'는 아니지만 정보의 전달에 있어서 중요 노드로 볼 수 있음

In [None]:
# Closeness centrality for every node of G.
clos_cen = nx.closeness_centrality(G)

# (node, score) pairs ordered from highest to lowest score; show the first ten.
clos_cen_ranked = sorted(clos_cen.items(), key = lambda kv : kv[1], reverse = True)
clos_cen_ranked[:10]

In [None]:
# Eigenvector centrality for every node of G.
eigen_cen = nx.eigenvector_centrality(G)

# (node, score) pairs ordered from highest to lowest score; show the first ten.
eigen_cen_ranked = sorted(eigen_cen.items(), key = lambda kv : kv[1], reverse = True)
eigen_cen_ranked[:10]

- 1912 노드는 높은 고유벡터중심성을 가짐. 네트워크에서 이 노드는 전반적인 영향력 면에서 매우 중요한 노드. 연결중심성, 매개중심성이 모두 높은 노드로, 'popular & influential'
- 나머지 노드들은 다른 중심성 지표는 높지 않으나 고유벡터중심성은 높음. 그 이유로 이들 노드들이 모두 1912 노드와 연결되어있을 가능성이 있음

참고
- https://anweh.tistory.com/33 (https://anweh.tistory.com/33)
- https://networkx.org/nx-guides/content/exploratory_notebooks/facebook_notebook.html (https://networkx.org/nx-guides/content/exploratory_notebooks/facebook_notebook.html)