In [1]:
import os
import numpy as np

## 以下のような処理を行う関数をfunction.pyに作成した。

|関数名|処理|
|---|:---|
|make_data|{"n_graph.txt": "n_label.txt"}のように、グラフファイル名をキー、ラベルファイル名を値とする辞書を引数にとり、それに対応した(次元数、隣接行列、ラベル)のタプルを内包したリストを返す。file_name=Trueで(graph.txt, label.txt)のファイル名のタプルを各タプルに加えて返す|
|train_data_loader|./datasets/train内のデータからvalidation用のデータとtrain用のデータをとり、そのデータをmake_dataを使って使用できるデータ型にする|
|test_data_loader|./datasets/test内のデータから(次元数、隣接行列、ファイル名)のタプルを内包したリストを返す|

### make_dataについて

In [2]:
def make_data(data_set, file_name=False):
    """Load graph/label file pairs into a list of sample tuples.

    Parameters
    ----------
    data_set : dict
        Maps a graph file path to the path of its label file. Both paths are
        opened exactly as given, so relative paths are resolved against the
        current working directory.
    file_name : bool, optional
        When True, every sample tuple gets a fourth element
        ``(graph_path, label_path)``.

    Returns
    -------
    list of tuple
        One ``(D, arr, label)`` tuple per entry of ``data_set`` (or
        ``(D, arr, label, (graph_path, label_path))`` when ``file_name`` is
        True), in the iteration order of ``data_set``. ``D`` is the integer on
        the first line of the graph file, ``arr`` is a NumPy array built from
        the whitespace-separated integers on the remaining lines, and
        ``label`` is the integer stored in the label file.

    Raises
    ------
    OSError
        If a graph or label file cannot be opened.
    ValueError
        If a header, matrix entry, or label cannot be parsed as an integer.
    IndexError
        If a graph file is empty.
    """
    data = []
    for graph_path, label_path in data_set.items():
        # Read both files up front so they are closed before any parsing.
        with open(graph_path) as graph, open(label_path) as label:
            graph_lines = graph.readlines()
            label_text = label.read()

        # int() ignores surrounding whitespace, including the newline.
        D = int(graph_lines[0])
        # Skip blank lines (e.g. a trailing empty line) so they do not add an
        # empty row and make the matrix ragged.
        rows = [
            [int(token) for token in line.split()]
            for line in graph_lines[1:]
            if line.strip()
        ]
        arr = np.array(rows)
        label_value = int(label_text)

        if file_name:
            sample = (D, arr, label_value, (graph_path, label_path))
        else:
            sample = (D, arr, label_value)
        data.append(sample)
    return data

#### make_dataのテスト

In [3]:
# Try loading the files whose numeric IDs are listed in nums.
nums=np.array([0, 1, 10, 100, 1000])
# Map each "<id>_graph.txt" to the matching "<id>_label.txt".
data_set={
    str(num)+"_graph.txt": str(num)+"_label.txt"
    for num in nums
}

print(data_set)

{'1_graph.txt': '1_label.txt', '0_graph.txt': '0_label.txt', '100_graph.txt': '100_label.txt', '1000_graph.txt': '1000_label.txt', '10_graph.txt': '10_label.txt'}


In [4]:
# data_set holds bare file names, so make_data has to run from inside
# ./datasets/train for open() to find them.
chdir=os.getcwd()
os.chdir(os.path.join(chdir,"datasets","train"))
try:
    data=make_data(data_set, file_name=True)
finally:
    # Always return to the starting directory, even if loading fails;
    # otherwise later cells would resolve relative paths from the wrong place.
    os.chdir(chdir)
print(data)

[(13, array([[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 0, ('1_graph.txt', '1_label.txt')), (11, array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0

### train_data_loaderについて

In [5]:
def train_data_loader(path=os.getcwd(), file_name=False, id_range=2000, valid_size=400):
    """Load ``<path>/datasets/train`` and split it into train and validation data.

    Files ending in ``graph.txt`` are paired with the file ending in
    ``label.txt`` that shares the same leading part of the name. A sample goes
    to the validation split when the integer before the first ``_`` of its
    graph file name is among ``valid_size`` IDs drawn without replacement from
    ``range(id_range)`` using NumPy's global random state.

    Parameters
    ----------
    path : str, optional
        Directory that contains ``datasets/train``. The default is evaluated
        once, when this function is defined, so it is the working directory at
        definition time rather than at call time.
    file_name : bool, optional
        Forwarded to ``make_data``; when True each sample also carries its
        ``(graph_file, label_file)`` names.
    id_range : int, optional
        Validation IDs are drawn from ``0 .. id_range - 1``.
    valid_size : int, optional
        Number of IDs drawn for the validation split.

    Returns
    -------
    tuple
        ``(train_data, valid_data)``, each a list as returned by ``make_data``.

    Raises
    ------
    ValueError
        If a graph file has no matching label file, or a graph file name does
        not start with an integer followed by ``_``.
    """
    graph_suffix, label_suffix = 'graph.txt', 'label.txt'
    original_cwd = os.getcwd()
    os.chdir(os.path.join(path, "datasets", "train"))
    try:
        entries = os.listdir('.')
        label_by_stem = {
            name[:-len(label_suffix)]: name
            for name in entries if name.endswith(label_suffix)
        }

        # Pair by shared stem instead of zipping two sorted lists: a single
        # missing file would otherwise shift every following pair.
        dataset = {}
        for graph_name in sorted(n for n in entries if n.endswith(graph_suffix)):
            stem = graph_name[:-len(graph_suffix)]
            if stem not in label_by_stem:
                raise ValueError("no label file found for " + graph_name)
            dataset[graph_name] = label_by_stem[stem]

        # .tolist() yields plain Python ints; the set gives O(1) membership.
        valid_ids = set(
            np.random.choice(np.arange(id_range), valid_size, replace=False).tolist()
        )

        train_set, valid_set = {}, {}
        for graph_name, label_name in dataset.items():
            sample_id = int(graph_name.split("_")[0])
            target = valid_set if sample_id in valid_ids else train_set
            target[graph_name] = label_name

        # make_data opens the bare file names, so it must run while the
        # working directory is still the train directory.
        train_data = make_data(train_set, file_name=file_name)
        valid_data = make_data(valid_set, file_name=file_name)
    finally:
        # Restore the caller's working directory even when loading fails.
        os.chdir(original_cwd)

    return train_data, valid_data

#### train_data_loaderのテスト

In [8]:
# Load both splits; file_name=True appends the (graph, label) file names to each sample tuple.
train_data, valid_data=train_data_loader(file_name=True)

In [15]:
# Sanity check: the train and validation splits must not share any files.
# Element 3 of each sample is its (graph_file, label_file) name pair.
t_file=[sample[3] for sample in train_data]
v_file=[sample[3] for sample in valid_data]

overlap=set(t_file)&set(v_file)
print(overlap)
# Fail loudly if the splits overlap instead of relying on reading the output.
assert not overlap, "train and validation splits share files"

set()


### test_data_loaderについて

In [16]:
def test_data_loader(path=os.getcwd()):
    chdir=os.getcwd()
    test_data=[]
    path=os.path.join(path, "datasets", "test")
    os.chdir(path)
    graphs=[file for file in os.listdir('.') if file.endswith('graph.txt')]
    for file in graphs:
        with open(file) as graph:
            graph_lines=graph.readlines()
            D=int(graph_lines[0].rstrip("\n"))
            arr=[]
            for graph_line in graph_lines[1:]:
                graph_line=graph_line.rstrip("\n")
                arr.append(list(map(int, graph_line.split())))
            arr=np.array(arr)
            tmp_set=(D, arr, file)
            test_data.append(tmp_set)
    os.chdir(chdir)
    return test_data

In [17]:
# Load the test graphs using the loader's default base directory.
test_data=test_data_loader()

In [18]:
# Display the loaded (D, adjacency array, file name) tuples; this renders the whole list.
test_data

[(10, array([[0, 0, 1, 1, 0, 0, 0, 0, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 1, 0, 1, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 1, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 1, 1, 0, 0, 0, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]]), '121_graph.txt'),
 (14, array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1],
         [0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1