In [42]:
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np
import random
from tqdm import tqdm

def perform_random_walks(graph, num_walks, walk_length, seed=20):
    random.seed(seed)
    walks = []
    
    for _ in tqdm(range(num_walks), desc="Performing walks", unit="walk"):
        for starting_node in graph.nodes():
            walk = [starting_node]
            while len(walk) < walk_length:
                current_node = walk[-1]
                neighbors = list(graph.neighbors(current_node))
                if not neighbors:
                    break
                
                weights = [graph[current_node][neighbor]['weight'] for neighbor in neighbors]
                total_weight = sum(weights)
                probabilities = [weight / total_weight for weight in weights]
                
                next_node = np.random.choice(neighbors, p=probabilities)
                walk.append(next_node)
            walks.append(walk)
    
    return walks

def analyze_communities(seed):
    walks = perform_random_walks(G, num_walks=8192, walk_length=12, seed=seed)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(sentences=walks, vector_size=128, window=8, min_count=1, sg=1, workers=12, epochs=16)
    node_embeddings = np.array([model.wv[str(node)] for node in G.nodes()])

    kmeans = KMeans(n_clusters=10, random_state=seed)
    communities = kmeans.fit_predict(node_embeddings)

    community_dict = {}
    for node, community in zip(G.nodes(), communities):
        node_name = code_to_name.get(int(node), "Unknown")
        if community not in community_dict:
            community_dict[community] = []
        community_dict[community].append(node_name)

    with open(f"community_summary_seed_{seed}.txt", "w", encoding='EUC-KR') as file:
        for community, names in community_dict.items():
            file.write(f"Community {community}: {names}\n")

    with open(f"community_mapping_seed_{seed}.txt", "w", encoding='EUC-KR') as file:
        for node, community in zip(G.nodes(), communities):
            file.write(f"{node} : {community}\n")

data_df = pd.read_csv('all_collected_data.csv', encoding="EUC-KR")
edges = data_df[["출발 행정동 코드", "도착 행정동 코드", "이동인구(합)"]]

G = nx.Graph()
for index, row in edges.iterrows():
    if pd.notna(row["이동인구(합)"]):
        G.add_edge(
            int(row["출발 행정동 코드"]), int(row["도착 행정동 코드"]), weight=float(row["이동인구(합)"])
        )

df = pd.read_excel('data/서울생활이동데이터_자치구코드_20210907.xlsx')
code_to_name = df.set_index('시군구')['name'].to_dict()

for seed in [400, 500, 600]:
    analyze_communities(seed)


Performing walks: 100%|██████████| 8192/8192 [05:49<00:00, 23.43walk/s]
  super()._check_params_vs_input(X, default_n_init=10)
Performing walks: 100%|██████████| 8192/8192 [05:10<00:00, 26.39walk/s]
  super()._check_params_vs_input(X, default_n_init=10)
Performing walks: 100%|██████████| 8192/8192 [05:10<00:00, 26.35walk/s]
  super()._check_params_vs_input(X, default_n_init=10)
