In [2]:
import sys
sys.path.append(r"/home/kayzhou/zhangyue/text/text_UNION/text_GCN")

from model import GraphConvolution, GCNModel
from utils import *

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pickle as pkl
import os
import sys
import time
import argparse



class Trainer:
    """Full-batch trainer for a two-layer graph-convolution document classifier.

    The constructor loads the pre-processed graph through ``load_data``, builds a
    ``GCNModel`` and an Adam optimizer; ``_train`` then runs the optimisation loop.

    ``args`` is a dict with the keys ``dataset``, ``epochs``, ``learnrate``,
    ``l2norm``, ``hidden``, ``bases`` and ``pred_type``.  Two optional keys are
    honoured as well: ``data_dir`` (directory handed to ``load_data``) and
    ``save_dir`` (directory that receives the best checkpoint).  When they are
    absent the historical hard-coded locations are used.

    ``pred_type`` selects which scores drive the loss and the reported metrics:

    * ``'net'``       -- the GCN output of the document nodes;
    * ``'shareu'``    -- for each node listed in ``node2adj``, the mean of the GCN
      scores of its adjacent nodes;
    * ``'netshareu'`` -- the sum of the two score sets above.
    """

    # Historical locations, kept as defaults so existing callers behave the same.
    DEFAULT_DATA_DIR = "/home/kayzhou/zhangyue/text/data_GCN/data_processed/"
    DEFAULT_SAVE_DIR = "/home/kayzhou/zhangyue/text/text_UNION/text_GCN/saved_models"
    # Training stops once more than this many epochs pass without a new best
    # test accuracy.
    PATIENCE = 10

    def __init__(self, args):
        """Load the data set and build model, optimizer and loss.

        Args:
            args: dict of hyper-parameters (see the class docstring).
        """
        print(args)

        # Hyper-parameters
        self.DATASET = args['dataset']
        self.NB_EPOCH = args['epochs']
        self.LR = args['learnrate']
        self.L2 = args['l2norm']
        self.HIDDEN = args['hidden']
        self.BASES = args['bases']
        self.use_cuda = False
        self.PRED_TYPE = args['pred_type']
        self.USE_BIAS = False
        self.MODEL_NAME = '%s_%s' % (self.DATASET, self.PRED_TYPE)
        self.num_features = 16
        self.log_step = 1
        self.DATA_DIR = args.get('data_dir', self.DEFAULT_DATA_DIR)
        self.SAVE_DIR = args.get('save_dir', self.DEFAULT_SAVE_DIR)

        # Load data (``load_data`` comes from the project's ``utils`` module).
        raw_data, data = load_data(args, self.DATA_DIR, self.use_cuda)
        self.labels = np.array(raw_data['y'])

        self.idx_train = data['idx_train']
        self.idx_valid = data['idx_valid']
        self.idx_test = data['idx_test']
        self.idx_train_set = data['idx_train_set']
        self.idx_test_set = data['idx_test_set']
        self.inputs = data['inputs']
        self.labels_train = data['labels_train']
        self.labels_valid = data['labels_valid']
        self.labels_test = data['labels_test']
        self.num_nodes = data['num_nodes']
        self.num_docs = data['num_docs']
        self.num_non_docs = data['num_non_docs']
        self.node2adj = data['node2adj']
        self.support = data['support']
        # Split indices are shifted by ``num_non_docs`` before they are used to
        # index the score matrix -- presumably because document nodes are stored
        # after all non-document nodes; confirm against ``load_data``.
        self.idx_train_pre = (np.array(self.idx_train) + self.num_non_docs).tolist()
        self.idx_valid_pre = (np.array(self.idx_valid) + self.num_non_docs).tolist()
        self.idx_test_pre = (np.array(self.idx_test) + self.num_non_docs).tolist()

        print('Data loaded successfully!')

        # Build the model.  NOTE(review): the literal 2 is presumably the number
        # of output classes -- confirm against ``GCNModel``'s signature.
        self.model = GCNModel(data, self.num_features, self.num_nodes, self.HIDDEN,
                              self.support, self.BASES, 2,
                              bias_feature=self.USE_BIAS)
        if self.use_cuda:
            self.model = self.model.cuda()

        parameters = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = optim.Adam(parameters, lr=self.LR, weight_decay=self.L2)

        # Summed (not averaged) loss; the original note says the LSTM counterpart
        # uses a sum as well.
        self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='sum')

    def _evaluate(self, scores):
        """Return ``(train_acc, valid_acc, test_acc, test_predictions)`` for ``scores``."""
        preds = torch.argmax(scores, dim=1)

        def accuracy(idx, labels):
            return torch.sum(preds[idx] == labels).item() / labels.size(0)

        return (accuracy(self.idx_train_pre, self.labels_train),
                accuracy(self.idx_valid_pre, self.labels_valid),
                accuracy(self.idx_test_pre, self.labels_test),
                preds[self.idx_test_pre])

    @staticmethod
    def _log_epoch(epoch, pred_label, loss, train_acc, valid_acc, test_acc, started):
        """Print one metrics line in the notebook's established format."""
        print("Epoch: {:04d}".format(epoch),
              "pred_type:%s" % pred_label,
              "train_loss= {:.4f}".format(loss.item()),
              "train_acc= {:.4f}".format(train_acc),
              "val_acc= {:.4f}".format(valid_acc),
              "test_acc= {:.4f}".format(test_acc),
              "time= {:.4f}".format(time.time() - started))

    def _aggregate_shareu_scores(self, scores):
        """Average, for every node in ``node2adj``, the scores of its adjacent nodes.

        ``node2adj`` maps a document node to the user nodes that shared it
        (per the original comments).  Nodes without an entry, or with an empty
        adjacency, keep an all-zero score row.

        The result is built through numpy and is therefore detached from the
        autograd graph of ``scores``.
        """
        scores_value = scores.detach().cpu().numpy()
        width = scores_value.shape[1]
        scores_shareu = []
        for node in range(self.num_nodes):
            scores_t = np.zeros(width)
            if node in self.node2adj:
                adj = list(self.node2adj[node])
                if adj:  # guard against a division by zero on an empty adjacency
                    coef = 1 / len(adj)
                    for user in adj:
                        scores_t += coef * scores_value[user]
            scores_shareu.append(scores_t)
        return torch.FloatTensor(np.array(scores_shareu)).to(scores.device)

    def _train(self):
        """Run the training loop and return the collected results.

        Returns:
            tuple ``(labels_test, best_yuce_tag, loss_list, train_acc_list,
            dev_acc_list, test_acc_list)``.  ``best_yuce_tag`` holds the test
            predictions of the best epoch (``None`` if no epoch ever improved on
            the initial accuracy of 0); ``loss_list`` holds plain floats; the
            accuracy lists hold one value per executed epoch.
        """
        best_acc = 0
        best_epoch = 0
        best_acc_epoch = None
        best_yuce_tag = None
        loss_list, train_acc_list, dev_acc_list, test_acc_list = [], [], [], []
        use_shareu = self.PRED_TYPE in ("shareu", "netshareu")

        for epoch in range(0, self.NB_EPOCH):
            t = time.time()  # wall-clock start of the epoch

            self.model.zero_grad()

            # Single full-batch forward pass through both graph layers.
            embeds_0 = self.inputs[0]
            embeds_1 = self.model.gc1([embeds_0] + self.inputs[1:])
            embeds_2 = self.model.gc2([embeds_1] + self.inputs[1:])
            scores = self.model.clf_bias(embeds_2)

            # Loss on the training document nodes only.
            loss_train = self.cross_entropy_loss(scores[self.idx_train_pre], self.labels_train)
            train_acc, valid_acc, test_acc, yuce_tag = self._evaluate(scores)
            self._log_epoch(epoch, "Graph", loss_train, train_acc, valid_acc, test_acc, t)

            if use_shareu:
                scores_shareu = self._aggregate_shareu_scores(scores)
                # NOTE(review): in pure 'shareu' mode this loss is detached from
                # the model, so backward()/step() below cannot update any
                # parameter.  Left as in the original; confirm whether intended.
                loss_train = self.cross_entropy_loss(scores_shareu[self.idx_train_pre], self.labels_train)
                train_acc, valid_acc, test_acc, yuce_tag = self._evaluate(scores_shareu)
                self._log_epoch(epoch, "User", loss_train, train_acc, valid_acc, test_acc, t)

                if self.PRED_TYPE == "netshareu":
                    scores_combined = scores_shareu + scores
                    loss_train = self.cross_entropy_loss(scores_combined[self.idx_train_pre], self.labels_train)
                    # All three accuracies (and the test predictions) now come
                    # from the combined scores; previously the test accuracy was
                    # taken from the user-only predictions by mistake.
                    train_acc, valid_acc, test_acc, yuce_tag = self._evaluate(scores_combined)
                    self._log_epoch(epoch, "Graph+User", loss_train, train_acc, valid_acc, test_acc, t)

            # A detached loss (pure 'shareu') must be flagged before backward()
            # can be called on it.
            if not loss_train.requires_grad:
                loss_train.requires_grad_(True)
            loss_train.backward()
            self.optimizer.step()

            # Store a float so the autograd graph of each epoch can be freed.
            loss_list.append(loss_train.item())
            train_acc_list.append(train_acc)
            dev_acc_list.append(valid_acc)
            test_acc_list.append(test_acc)

            # The best epoch is the one with the highest accuracy.
            # NOTE(review): selection uses the *test* split, as in the original;
            # the validation split would avoid leaking test information.
            if test_acc > best_acc:
                best_acc = test_acc
                best_acc_epoch = epoch
                best_epoch = epoch
                best_yuce_tag = yuce_tag
                os.makedirs(self.SAVE_DIR, exist_ok=True)
                torch.save(self.model.state_dict(), os.path.join(self.SAVE_DIR, self.MODEL_NAME))

            # Early stopping.
            if epoch - best_epoch > self.PATIENCE:
                print("Too long time not up")
                break

        print("best_acc_result:")
        print(best_acc_epoch)

        return self.labels_test, best_yuce_tag, loss_list, train_acc_list, dev_acc_list, test_acc_list
 

        


if __name__ == '__main__':
    # Hyper-parameters, declared as (short flag, long flag, type, default, help).
    _option_table = (
        ("-d", "--dataset", str, "one_month_network_202010", "Dataset string"),
        ("-e", "--epochs", int, 1000, "Number training epochs"),
        ("-hd", "--hidden", int, 16, "Number hidden units"),
        ("-b", "--bases", int, -1, "Number of bases used (-1: all)"),
        ("-lr", "--learnrate", float, 0.001, "Learning rate"),
        ("-l2", "--l2norm", float, 5e-4, "L2 normalization of input weights"),
        ("-pt", "--pred_type", str, "net",
         "prediction method: select from 'net'/'shareu'/'netshareu'"),
    )

    ap = argparse.ArgumentParser()
    for _short, _long, _kind, _default, _help in _option_table:
        ap.add_argument(_short, _long, type=_kind, default=_default, help=_help)
    del _option_table, _short, _long, _kind, _default, _help

    # An explicit empty argument list makes the parser ignore the notebook
    # kernel's own command line, so every option takes its default.
    args = vars(ap.parse_args(args=[]))
    dataset = args['dataset']

    # Seed numpy, torch (CPU) and torch (CUDA) with the same fixed value.
    for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        _seed_fn(1)
    del _seed_fn

    # Build the trainer, run it, and unpack everything the later cells consume.
    trainer = Trainer(args)
    (true_labels_test,
     best_yuce_tag,
     loss_list,
     train_acc_list,
     dev_acc_list,
     test_acc_list) = trainer._train()



{'dataset': 'one_month_network_202010', 'epochs': 1000, 'hidden': 16, 'bases': -1, 'learnrate': 0.001, 'l2norm': 0.0005, 'pred_type': 'net'}
898999 112376 112374
y:1123749
num_nodes:1435367
support:1
2
Data loaded successfully!
Epoch: 0000 pred_type:Graph train_loss= 623812.7500 train_acc= 0.3556 val_acc= 0.3567 test_acc= 0.3553 time= 6.5729
Epoch: 0001 pred_type:Graph train_loss= 622496.3125 train_acc= 0.3878 val_acc= 0.3678 test_acc= 0.3663 time= 1.0836
Epoch: 0002 pred_type:Graph train_loss= 621136.2500 train_acc= 0.9688 val_acc= 0.8641 test_acc= 0.8647 time= 0.8710
Epoch: 0003 pred_type:Graph train_loss= 619723.0000 train_acc= 0.9688 val_acc= 0.8800 test_acc= 0.8821 time= 0.8442
Epoch: 0004 pred_type:Graph train_loss= 618253.5000 train_acc= 0.9688 val_acc= 0.8804 test_acc= 0.8827 time= 0.8277
Epoch: 0005 pred_type:Graph train_loss= 616724.8750 train_acc= 0.9688 val_acc= 0.8804 test_acc= 0.8827 time= 0.9160
Epoch: 0006 pred_type:Graph train_loss= 615134.8750 train_acc= 0.9688 val_ac

In [3]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the best epoch's test predictions,
# printed with four decimal places.
print(
    classification_report(
        y_true=true_labels_test,
        y_pred=best_yuce_tag,
        digits=4,
    )
)

              precision    recall  f1-score   support

           0     0.8591    0.9785    0.9149     72448
           1     0.9478    0.7089    0.8111     39926

    accuracy                         0.8827    112374
   macro avg     0.9035    0.8437    0.8630    112374
weighted avg     0.8906    0.8827    0.8780    112374

