说明：本项目提交时忘记保留checkpoint

In [1]:
!pip install pgl  

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Collecting pgl
[?25l  Downloading https://mirror.baidu.com/pypi/packages/4f/77/f7da1735b936a9ce1b199d7d0cf00379d8c53f3f6ae7ca93ec585fe2342f/pgl-2.1.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 15.6MB/s eta 0:00:01
Installing collected packages: pgl
Successfully installed pgl-2.1.5


In [2]:
from collections import namedtuple
import numpy as np
import paddle
import paddle.nn as nn
import pandas as pd
import pgl
import tqdm
from paddle.optimizer import Adam
from paddle.optimizer import Momentum
from easydict import EasyDict as edict
import paddle.fluid as F
import paddle.fluid.layers as L

In [3]:
# Immutable record bundling the graph, the number of classes, and the
# node-index / label pairs for the train, validation and test splits.
Dataset = namedtuple(
    "Dataset",
    [
        "graph",
        "num_classes",
        "train_index",
        "train_label",
        "valid_index",
        "valid_label",
        "test_index",
        "test_label",
    ],
)

In [4]:
def load_edges(num_nodes, self_loop=True, add_inverse_edge=True,
               edges_path=r"/home/aistudio/data/data61620/edges.csv"):
    """Read the edge list from a header-less two-column CSV.

    Args:
        num_nodes: Number of nodes; only used to generate the self-loop edges
            ``(i, i)`` for ``i`` in ``range(num_nodes)``.
        self_loop: When true, append one ``(i, i)`` edge per node.
        add_inverse_edge: When true, append the reversed copy ``(dst, src)``
            of every edge read from the file.
        edges_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A 2-D numpy array with one ``[src, dst]`` row per edge, ordered as:
        original edges, then reversed edges, then self-loops.
    """
    edges = pd.read_csv(edges_path, header=None, names=["src", "dst"]).values

    if add_inverse_edge:
        # Swapping the two columns yields the reverse direction of every edge.
        edges = np.vstack([edges, edges[:, ::-1]])

    if self_loop:
        # Use dedicated local names instead of rebinding the `self_loop` flag
        # to an array, which made the parameter's meaning change mid-function.
        node_ids = np.arange(0, num_nodes)
        loop_edges = np.vstack([node_ids, node_ids]).T
        edges = np.vstack([edges, loop_edges])

    return edges

In [5]:
def load():
    """Assemble the Dataset from the feature, edge and split files on disk.

    Node features come from ``feat.npy``; edges come from ``load_edges`` with
    reversed edges and self-loops added. ``train.csv`` supplies labelled node
    ids: the first 80% of its rows (in file order) become the training split
    and the remainder the validation split. ``test.csv`` supplies the test
    node ids; their labels are filled with zeros as placeholders.

    Returns:
        A ``Dataset`` whose graph has been converted with ``graph.tensor()``
        and whose ``num_classes`` is fixed at 35.
    """
    features = np.load(r"/home/aistudio/data/data61620/feat.npy")
    n_nodes = features.shape[0]
    edge_array = load_edges(num_nodes=n_nodes, self_loop=True, add_inverse_edge=True)
    graph = pgl.graph.Graph(num_nodes=n_nodes, edges=edge_array, node_feat={"feat": features})

    # Store in-degree ** -0.5 per node (degree clamped to at least 1) as an
    # extra node feature named "norm", with a trailing axis added.
    clamped_degree = np.maximum(graph.indegree().astype("float32"), 1)
    inv_sqrt_degree = np.power(clamped_degree, -0.5)
    graph.node_feat["norm"] = paddle.to_tensor(np.expand_dims(inv_sqrt_degree, -1))

    labelled = pd.read_csv(r"/home/aistudio/data/data61620/train.csv")
    labelled_nids = labelled["nid"].values
    labelled_cls = labelled["label"].values

    # Split point: first 80% of rows for training, the rest for validation.
    split_at = int(len(labelled_nids) * 0.8)
    train_idx = paddle.to_tensor(labelled_nids[:split_at])
    train_lbl = paddle.to_tensor(np.expand_dims(labelled_cls[:split_at], -1))
    valid_index = paddle.to_tensor(labelled_nids[split_at:])
    valid_label = paddle.to_tensor(np.expand_dims(labelled_cls[split_at:], -1))

    test_nids = pd.read_csv(r"/home/aistudio/data/data61620/test.csv")["nid"].values
    test_idx = paddle.to_tensor(test_nids)
    # Placeholder labels so the test split can go through the same eval path.
    test_label = paddle.to_tensor(np.zeros((len(test_nids), 1), dtype="int64"))

    return Dataset(
        graph=graph.tensor(),
        num_classes=35,
        train_index=train_idx,
        train_label=train_lbl,
        valid_index=valid_index,
        valid_label=valid_label,
        test_index=test_idx,
        test_label=test_label,
    )

In [6]:
class GCN(nn.Layer):
    """Plain multi-layer GCN classifier.

    The network is ``num_layers`` repetitions of (GCNConv with ReLU and
    ``norm=True``, Dropout), followed by one GCNConv that maps
    ``hidden_size`` to ``num_class``.

    Args:
        input_size: Width of the input node features.
        num_class: Width of the output produced by the last GCNConv.
        num_layers: Number of hidden GCNConv + Dropout pairs.
        hidden_size: Width of every hidden layer.
        dropout: Probability passed to each ``nn.Dropout``.
        **kwargs: Accepted and ignored, so a shared config dict can be
            splatted into the constructor.
    """

    def __init__(self,
                 input_size,
                 num_class,
                 num_layers=1,
                 hidden_size=64,
                 dropout=0.5,
                 **kwargs):
        super(GCN, self).__init__()
        self.num_class = num_class
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.gcn_s = nn.LayerList()

        # Only the first hidden layer consumes the raw feature width; every
        # later one goes hidden_size -> hidden_size.
        in_width = input_size
        for _ in range(self.num_layers):
            conv = pgl.nn.GCNConv(
                in_width,
                self.hidden_size,
                activation="relu",
                norm=True)
            self.gcn_s.append(conv)
            self.gcn_s.append(nn.Dropout(self.dropout))
            in_width = self.hidden_size

        self.gcn_s.append(pgl.nn.GCNConv(self.hidden_size, self.num_class))

    def forward(self, graph, feature):
        """Apply every stored layer in order and return the final output."""
        hidden = feature
        for layer in self.gcn_s:
            # Dropout layers take only the features; graph convolutions also
            # need the graph.
            if isinstance(layer, nn.Dropout):
                hidden = layer(hidden)
            else:
                hidden = layer(graph, hidden)
        return hidden

In [7]:
class GCNRes(nn.Layer):
    """GCN with residual (skip) connections.

    Input features are projected to ``hidden_size`` by a linear layer,
    layer-normalised and passed through dropout. ``num_layers`` GCNConv
    layers follow, each applied as ``layer_norm(relu(x + conv(graph, x)))``.
    A final GCNConv maps to ``num_class`` outputs, which are then passed
    through ReLU and layer normalisation as well. All GCNConv layers are
    created with ``activation=None`` and ``norm=False``.

    Args:
        input_size: Width of the input node features.
        num_class: Width of the output.
        num_layers: Number of residual GCNConv layers.
        feat_drop: Accepted for interface compatibility; not used by this
            class.
        hidden_size: Width of the hidden representation.
        dropout: Dropout probability applied after the input projection.
        **kwargs: Accepted and ignored, so a shared config dict can be
            splatted into the constructor.
    """

    def __init__(self,
                 input_size,
                 num_class,
                 num_layers=1,
                 feat_drop=0.6,
                 hidden_size=64,
                 dropout = 0.5,
                 **kwargs):
        super(GCNRes, self).__init__()
        self.input_size = input_size
        self.num_class = num_class
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.gcns = nn.LayerList()

        self.linear = nn.Linear(input_size, self.hidden_size, name='initialize_feature')

        # Residual body: hidden_size -> hidden_size so the skip connection can
        # be a plain addition.
        for i in range(self.num_layers):
            self.gcns.append(
                pgl.nn.GCNConv(self.hidden_size,
                               self.hidden_size, activation=None, norm=False))

        # Output layer, stored last so forward() can address it as gcns[-1].
        self.gcns.append(
            pgl.nn.GCNConv(self.hidden_size,
                           self.num_class, activation=None, norm=False))

    def forward(self, graph, feature):
        """Compute per-node class scores for every node of ``graph``."""
        # Input projection followed by layer norm and dropout.
        feature = self.linear(feature)
        feature = paddle.nn.functional.layer_norm(feature, self.hidden_size)
        # The functional dropout defaults to training=True and does not see
        # the layer's train()/eval() state, so the flag is passed explicitly;
        # otherwise dropout would stay active during evaluation.
        feature = paddle.nn.functional.dropout(
            feature, p=self.dropout, training=self.training)

        for x in range(self.num_layers):
            res_feature = feature
            feature = self.gcns[x](graph, feature)
            feature = res_feature + feature
            feature = paddle.nn.functional.relu(feature)
            feature = paddle.nn.functional.layer_norm(feature, normalized_shape=self.hidden_size)

        # NOTE(review): the returned scores are post-ReLU and layer-normalised
        # rather than raw logits; kept unchanged to preserve the architecture.
        feature = self.gcns[-1](graph, feature)
        feature = paddle.nn.functional.relu(feature)
        feature = paddle.nn.functional.layer_norm(feature, normalized_shape=self.num_class)

        return feature

In [11]:
# Module-level metric object. The train/eval functions in this notebook
# compute accuracy through paddle.metric.accuracy rather than through it.
accuracy = paddle.metric.Accuracy()

In [12]:
def train(node_index, node_label, gnn_model, graph, criterion, opt):
    """Perform one full-graph training step.

    The model is run on every node of ``graph``; loss and top-1 accuracy are
    computed only on the rows selected by ``node_index``.

    Returns:
        ``(loss, acc)`` as returned by ``criterion`` and
        ``paddle.metric.accuracy``.
    """
    gnn_model.train()

    all_scores = gnn_model(graph, graph.node_feat["feat"])
    scores = paddle.gather(all_scores, node_index)

    loss = criterion(scores, node_label)
    loss.backward()
    acc = paddle.metric.accuracy(input=scores, label=node_label, k=1)

    # Apply the update, then clear gradients before the next step.
    opt.minimize(loss)
    opt.clear_grad()
    return loss, acc

In [13]:
@paddle.no_grad()
def eval(node_index, node_label, gnn_model, graph, criterion):
    """Evaluate the model on the nodes selected by ``node_index``.

    Runs with gradient tracking disabled and the model in eval mode.

    Returns:
        ``(loss, acc, pred)`` where ``pred`` is the arg-max class index along
        axis 1 of the gathered model output.
    """
    gnn_model.eval()

    all_scores = gnn_model(graph, graph.node_feat["feat"])
    scores = paddle.gather(all_scores, node_index)

    loss = criterion(scores, node_label)
    acc = paddle.metric.accuracy(input=scores, label=node_label, k=1)
    predicted_cls = np.argmax(scores, axis=1)
    return loss, acc, predicted_cls

In [14]:
def entrance(dataset, cfg):
    """Train the model 20 times and write one submission CSV per run.

    Each run builds a fresh model and Adam optimiser, trains for 1001
    full-graph epochs, and every 10 epochs evaluates on the validation split
    and records the test-split predictions. After a run finishes, the test
    predictions recorded at the checkpoint with the highest validation
    accuracy are written to ``2021-8-31/submission-<val_acc>.csv``.

    Note that the file name depends only on the best validation accuracy, so
    two runs reaching the same value write to the same file and the later one
    overwrites the earlier one.

    Args:
        dataset: A ``Dataset`` tuple (graph, splits and ``num_classes``).
        cfg: Attribute-accessible mapping (e.g. EasyDict). ``learning_rate``
            and ``weight_decay`` configure the optimiser; the whole mapping is
            also forwarded to the model constructor as keyword arguments.
            ``model_name`` selects ``"GCN"`` or ``"GCNRes"``; any other or
            missing value falls back to ``GCNRes``.
    """
    # Local import: `os` is only imported by a later cell of this notebook.
    import os

    graph = dataset.graph
    train_index_ = dataset.train_index
    train_label_ = dataset.train_label

    val_index_ = dataset.valid_index
    val_label_ = dataset.valid_label

    test_index_ = dataset.test_index
    test_label = dataset.test_label

    # Honour cfg["model_name"] instead of silently ignoring it; GCNRes stays
    # the default so existing configurations behave as before.
    available_models = {"GCN": GCN, "GCNRes": GCNRes}
    graph_model = available_models.get(cfg.get("model_name"), GCNRes)
    criterion = paddle.nn.loss.CrossEntropyLoss()

    # to_csv does not create missing directories, so make sure the output
    # directory exists before any training time is spent.
    out_dir = "2021-8-31"
    os.makedirs(out_dir, exist_ok=True)

    for run in range(20):
        cal_val_acc = []
        test_labels = []

        gnn_model = graph_model(
            input_size=graph.node_feat["feat"].shape[1],
            num_class=dataset.num_classes,
            **cfg)

        optimizer = Adam(
            learning_rate=cfg.learning_rate,
            parameters=gnn_model.parameters(),
            weight_decay=cfg.weight_decay)

        for epoch in range(1001):
            train_loss, train_acc = train(train_index_, train_label_, gnn_model,
                                          graph, criterion, optimizer)

            if epoch % 10 == 0:
                print('%-9s%-10s%-20s%-20s' %('Runs {0}-'.format(run), 'Epochs {0}'.format(epoch), '-train loss: {0}'.format(train_loss.numpy()), '-train acc: {0}'.format(train_acc.numpy())))
                val_loss, val_acc, val_pred = eval(val_index_, val_label_, gnn_model, graph, criterion)
                print('%-9s%-10s%-20s%-20s' %('Runs {0}-'.format(run), 'Epochs {0}'.format(epoch), '-val   loss: {0}'.format(val_loss.numpy()), '-val   acc: {0}'.format(val_acc.numpy())))
                cal_val_acc.append(val_acc.numpy())

                # Only the predictions are kept for the test split; the loss
                # and accuracy returned for it are not used.
                _, _, test_pred = eval(test_index_, test_label, gnn_model,
                                       graph, criterion)
                test_labels.append(test_pred)

        # Pick the checkpoint with the best validation accuracy and export the
        # test predictions recorded at that same checkpoint.
        best = int(np.argmax(cal_val_acc))
        test_prediction = test_labels[best]
        submission = pd.DataFrame(data={"nid": test_index_,"label": test_prediction})
        submission.to_csv("{0}/submission-{1}.csv".format(out_dir, cal_val_acc[best]), index=False)

In [15]:
# Hyper-parameters for the training run. entrance() reads learning_rate and
# weight_decay for the optimiser and forwards the whole mapping to the model
# constructor as keyword arguments.
gcn_config = dict(
    model_name="GCNRes",
    num_layers=3,
    dropout=0.15,
    hidden_size=128,
    learning_rate=0.00125,
    weight_decay=0.0005,
    edge_dropout=0.00,
)

In [17]:
# Build the Dataset (graph, splits, class count) once from the files on disk.
graph_dataset = load()

In [18]:
# Wrap the config in EasyDict because entrance() reads it with attribute
# access (cfg.learning_rate, cfg.weight_decay).
entrance(graph_dataset, edict(gcn_config))

Runs 19- Epochs 1000-val   loss: [1.0094459]-val   acc: [0.7304051]

In [21]:
import csv
from collections import Counter

def vote_merge(pth, filelst, out_path='submission.csv'):
    """Merge several submission CSVs by per-node majority vote.

    Every input file is expected to have a header row followed by
    ``nid,label`` rows. For each node id the most frequent label across all
    files is written to the output. On a tie, the label that was seen first
    wins, i.e. the one from the file listed earliest in ``filelst``.

    Args:
        pth: Prefix that is concatenated directly with each file name, so a
            directory must include its trailing separator.
        filelst: File names to merge, in priority order for tie-breaking.
        out_path: Destination CSV. Defaults to ``'submission.csv'`` in the
            current working directory, the previously hard-coded location.
    """
    votes = {}
    for filepath in filelst:
        # `with` guarantees the handle is closed even if a row is malformed.
        with open(pth + filepath, encoding='utf-8', mode='r', newline='') as cr:
            csv_reader = csv.reader(cr)
            next(csv_reader, None)  # skip the header row
            for row in csv_reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to count.
                    continue
                idx, cls = row
                votes.setdefault(idx, []).append(cls)

    # The output is opened only after all inputs were read successfully, and
    # closed (hence flushed) before the function returns.
    with open(out_path, encoding='utf-8', mode='w', newline='') as fw:
        csv_writer = csv.writer(fw)
        csv_writer.writerow(['nid', 'label'])
        for nid, clss in votes.items():
            winner = Counter(clss).most_common(1)[0][0]
            csv_writer.writerow([nid, winner])
    print('write csv done')

In [22]:
import os
import numpy as np
from scipy import stats
import pandas as pd
# `path` is the directory that holds all per-run submission files.
path = '/home/aistudio/2021-8-31'

# Submission files are named "submission-[<val_acc>].csv"; the bracketed
# number is the best validation accuracy of the run that produced the file.
_SUBMISSION_PREFIX = 'submission-['
_SUBMISSION_SUFFIX = '].csv'


def _val_acc_from_name(fname):
    """Return the validation accuracy embedded in a submission file name."""
    return float(fname[len(_SUBMISSION_PREFIX):-len(_SUBMISSION_SUFFIX)])


# List only the top level of `path`: vote_merge opens files as
# <directory> + <name>, so files found in sub-directories (for example
# notebook checkpoint folders) could not be opened anyway. Names that do not
# follow the submission pattern are skipped instead of crashing the sort.
filelist = [
    f for f in os.listdir(path)
    if f.startswith(_SUBMISSION_PREFIX) and f.endswith(_SUBMISSION_SUFFIX)
]

print(filelist)

# Sort by the validation accuracy in the file name, best first.
filelist = sorted(filelist, key=_val_acc_from_name, reverse=True)
print(filelist)
# Keep the ten best runs for the vote.
filelist = filelist[:10]
print(filelist)

# os.path.join(path, '') appends the trailing separator vote_merge relies on.
vote_merge(os.path.join(path, ''), filelist)



['submission-[0.7366698].csv', 'submission-[0.7330391].csv', 'submission-[0.7385207].csv', 'submission-[0.7380223].csv', 'submission-[0.736029].csv', 'submission-[0.7365274].csv', 'submission-[0.7376664].csv', 'submission-[0.7363138].csv', 'submission-[0.73659855].csv', 'submission-[0.73553073].csv', 'submission-[0.733751].csv', 'submission-[0.7343205].csv', 'submission-[0.7333238].csv', 'submission-[0.7351748].csv', 'submission-[0.7349612].csv', 'submission-[0.736385].csv', 'submission-[0.73645616].csv', 'submission-[0.7371681].csv', 'submission-[0.73859185].csv']
['submission-[0.73859185].csv', 'submission-[0.7385207].csv', 'submission-[0.7380223].csv', 'submission-[0.7376664].csv', 'submission-[0.7371681].csv', 'submission-[0.7366698].csv', 'submission-[0.73659855].csv', 'submission-[0.7365274].csv', 'submission-[0.73645616].csv', 'submission-[0.736385].csv', 'submission-[0.7363138].csv', 'submission-[0.736029].csv', 'submission-[0.73553073].csv', 'submission-[0.7351748].csv', 'subm