### 将不同数据集的数据统一转化为gml格式，用于输入到vasn、fm3等模型当中

In [9]:
import os
import csv
import networkx as nx
from networkx.utils import is_string_like

In [5]:
def generate_gml(G):
    """Yield the lines of a GML document describing graph ``G``.

    This is an adaptation of the NetworkX GML generator.  Unlike a writer
    that renumbers nodes, each node's own identifier is written verbatim as
    its GML ``id`` and used for edge ``source`` / ``target``.

    Output rules:

    * ``directed 1`` is emitted when ``G.is_directed()`` is true; a
      ``'directed'`` key in ``G.graph`` is skipped.
    * The node ``label`` is the node attribute ``'L'`` when present,
      otherwise ``str(node)``.
    * The node attributes ``'id'``, ``'label'``, ``'L'`` and ``'JSON'`` are
      not written as extra attributes.
    * Edges are written with ``source`` and ``target`` only; edge data is
      ignored.
    * Strings are double-quoted, booleans are written as ``0`` / ``1`` and
      dicts become (arbitrarily deep) bracketed GML lists.

    Parameters
    ----------
    G : graph
        Any object exposing ``is_directed()``, a ``graph`` mapping,
        ``nodes(data=True)`` yielding ``(node, attr_dict)`` pairs and
        ``edges()`` yielding ``(u, v)`` pairs.

    Yields
    ------
    str
        One chunk of output without a trailing newline.  A dict-valued
        attribute is yielded as a single string containing embedded
        newlines.
    """
    indent = 2 * ' '

    def format_value(value, level):
        # ``level`` is the nesting depth of the key owning ``value``; the
        # entries of a dict are indented one level deeper and the closing
        # bracket is aligned with the key.
        if isinstance(value, dict):
            body = ''.join(
                (level + 1) * indent + format_item(k, v, level + 1) + '\n'
                for k, v in value.items()
            )
            return '[ \n' + body + level * indent + ']'
        if isinstance(value, str):
            return '"%s"' % value
        if isinstance(value, bool):
            return int(value)
        return value

    def format_item(key, value, level):
        # Nested dicts are rendered recursively here, so they are never
        # mistaken for plain strings and wrapped in quotes.
        return "%s %s" % (key, format_value(value, level))

    skipped_node_keys = ('id', 'label', 'L', 'JSON')

    yield "graph ["
    if G.is_directed():
        yield indent + "directed 1"

    # Graph attributes.
    for key, value in G.graph.items():
        if key == 'directed':
            continue
        yield indent + format_item(key, value, 1)

    # Nodes.  ``nodes(data=True)`` is used instead of the ``G.node`` mapping
    # because the latter no longer exists in recent NetworkX releases.
    for node, attrs in G.nodes(data=True):
        yield indent + "node ["
        # Keep the original node identifier as the id in the output file.
        yield 2 * indent + "id %s" % node
        label = attrs.get('L', str(node))
        if isinstance(label, str):
            label = '"%s"' % label
        yield 2 * indent + 'label %s' % label
        for key, value in attrs.items():
            if key in skipped_node_keys:
                continue
            yield 2 * indent + format_item(key, value, 2)
        yield indent + "]"

    # Edges.
    for u, v in G.edges():
        yield indent + "edge ["
        yield 2 * indent + "source %s" % u
        yield 2 * indent + "target %s" % v
        yield indent + "]"
    yield "]"

In [6]:
def preprocessing_Slashdot0811():
    """Convert the Slashdot0811 edge list (txt) into a GML file via networkx.

    Reads ``../datasets/Slashdot0811.txt``, where each data line holds a
    ``from_node`` / ``to_node`` pair, builds an undirected simple graph and
    writes it to ``../temp/Slashdot0811/preprocessed_gml/graph.gml``.

    * Self-loops are dropped.
    * ``a -> b`` and ``b -> a`` collapse into a single undirected edge.
    * Blank lines and lines starting with ``#`` are skipped.

    Raises
    ------
    ValueError
        If a data line does not consist of exactly two integer fields.
    """
    G = nx.Graph()
    with open('../datasets/Slashdot0811.txt', 'r') as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            from_node, to_node = line.split()
            u, v = int(from_node), int(to_node)
            if u == v:
                # Drop self-loops.
                continue
            # nx.Graph is undirected and keeps at most one edge per node
            # pair, so duplicate and reversed pairs are merged here.  Adding
            # edges in file order also keeps the output order stable
            # between runs.
            G.add_edge(u, v)

    out_dir = '../temp/Slashdot0811/preprocessed_gml'
    os.makedirs(out_dir, exist_ok=True)
    with open('../temp/Slashdot0811/preprocessed_gml/graph.gml', 'w') as fp:
        fp.writelines(line + '\n' for line in generate_gml(G))

In [7]:
def preprocessing_email_Eu_core():
    """Convert the email-Eu-core edge list (txt) into a GML file via networkx.

    Reads ``../datasets/email-Eu-core.txt``, where each data line holds a
    whitespace-separated ``from_node to_node`` pair, builds an undirected
    simple graph and writes it to
    ``../temp/email_Eu_core/preprocessed_gml/graph.gml``.

    * Self-loops are dropped.
    * ``a -> b`` and ``b -> a`` collapse into a single undirected edge.
    * Blank lines and lines starting with ``#`` are skipped.

    Raises
    ------
    ValueError
        If a data line does not consist of exactly two integer fields.
    """
    G = nx.Graph()
    with open('../datasets/email-Eu-core.txt', 'r') as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            from_node, to_node = line.split()
            u, v = int(from_node), int(to_node)
            if u == v:
                # Drop self-loops.
                continue
            # nx.Graph is undirected and keeps at most one edge per node
            # pair, so duplicate and reversed pairs are merged here.  Adding
            # edges in file order also keeps the output order stable
            # between runs.
            G.add_edge(u, v)

    out_dir = '../temp/email_Eu_core/preprocessed_gml'
    os.makedirs(out_dir, exist_ok=True)
    with open('../temp/email_Eu_core/preprocessed_gml/graph.gml', 'w') as fp:
        fp.writelines(line + '\n' for line in generate_gml(G))

In [12]:
def preprocessing_Nature():
    """Placeholder, not implemented yet.

    Only creates an empty undirected graph and discards it: no dataset is
    read and no GML file is written.
    TODO: presumably meant to convert a "Nature" dataset -- confirm and
    implement.
    """
    G = nx.Graph()

In [10]:
def preprocessing_lasftm_asia():
    """Convert the lastfm_asia edge list (csv) into a GML file via networkx.

    Reads ``../datasets/lasftm_asia/lastfm_asia_edges.csv``, skips its first
    row as a header and treats the first two columns of every following row
    as an edge.  The resulting undirected simple graph is written to
    ``../temp/lasftm_asia/preprocessed_gml/graph.gml``.

    * Self-loops are dropped.
    * ``a -> b`` and ``b -> a`` collapse into a single undirected edge.
    * Completely empty rows are skipped; an empty input file yields an
      empty graph.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    ValueError
        If a node identifier is not an integer.
    """
    G = nx.Graph()
    # newline='' lets the csv module handle line endings itself.
    with open('../datasets/lasftm_asia/lastfm_asia_edges.csv', 'r', newline='') as fp:
        csv_reader = csv.reader(fp)
        next(csv_reader, None)  # skip the header row (if any)
        for row in csv_reader:
            if not row:
                continue
            u, v = int(row[0]), int(row[1])
            if u == v:
                # Drop self-loops.
                continue
            # nx.Graph is undirected and keeps at most one edge per node
            # pair, so duplicate and reversed pairs are merged here.  Adding
            # edges in file order also keeps the output order stable
            # between runs.
            G.add_edge(u, v)

    out_dir = '../temp/lasftm_asia/preprocessed_gml'
    os.makedirs(out_dir, exist_ok=True)
    with open('../temp/lasftm_asia/preprocessed_gml/graph.gml', 'w') as fp:
        fp.writelines(line + '\n' for line in generate_gml(G))

In [None]:
def preprocessing_git_web_ml():
    """Convert the git_web_ml edge list (csv) into a GML file via networkx.

    Reads ``../datasets/git_web_ml/musae_git_edges.csv``, skips its first
    row as a header and treats the first two columns of every following row
    as an edge.  The resulting undirected simple graph is written to
    ``../temp/git_web_ml/preprocessed_gml/graph.gml``.

    * Self-loops are dropped.
    * ``a -> b`` and ``b -> a`` collapse into a single undirected edge.
    * Completely empty rows are skipped; an empty input file yields an
      empty graph.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    ValueError
        If a node identifier is not an integer.
    """
    G = nx.Graph()
    # newline='' lets the csv module handle line endings itself.
    with open('../datasets/git_web_ml/musae_git_edges.csv', 'r', newline='') as fp:
        csv_reader = csv.reader(fp)
        next(csv_reader, None)  # skip the header row (if any)
        for row in csv_reader:
            if not row:
                continue
            u, v = int(row[0]), int(row[1])
            if u == v:
                # Drop self-loops.
                continue
            # nx.Graph is undirected and keeps at most one edge per node
            # pair, so duplicate and reversed pairs are merged here.  Adding
            # edges in file order also keeps the output order stable
            # between runs.
            G.add_edge(u, v)

    out_dir = '../temp/git_web_ml/preprocessed_gml'
    os.makedirs(out_dir, exist_ok=True)
    with open('../temp/git_web_ml/preprocessed_gml/graph.gml', 'w') as fp:
        fp.writelines(line + '\n' for line in generate_gml(G))

In [11]:
# Script entry point: when run directly, only the lastfm_asia conversion is
# executed; the other preprocessing_* functions are not called here.
if __name__ == "__main__":
    preprocessing_lasftm_asia()