In [3]:
import pandas as pd
import torch
torch.__version__
import torch.nn as nn
import torch_geometric
from torch_geometric.loader import NeighborLoader
from torch_geometric.loader import NeighborSampler
from torch_geometric.utils import degree
from tqdm import tqdm
import copy
from torch_geometric.nn import GCNConv,SAGEConv
from torch_geometric.nn import GATConv
import torch.nn.functional as F

# Notebook-wide compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class KGDataset(torch.utils.data.Dataset):
    """Map-style dataset over (head, relation, tail) triples.

    Each item is translated from names to integer ids on access, using the
    two lookup tables supplied at construction time.

    Args:
        triples: indexable collection of ``(head, relation, tail)`` entries.
        entity2id: mapping from an entity key to its integer id.
        relation2id: mapping from a relation key to its integer id.
    """

    def __init__(self, triples, entity2id, relation2id):
        self.triples, self.entity2id, self.relation2id = triples, entity2id, relation2id

    def __len__(self):
        """Number of stored triples."""
        return len(self.triples)

    def __getitem__(self, idx):
        """Return ``(head_id, relation_id, tail_id)`` for the triple at ``idx``."""
        h, r, t = self.triples[idx]
        entities, relations = self.entity2id, self.relation2id
        # Lookups happen in head, relation, tail order.
        return entities[h], relations[r], entities[t]

class predDataset(torch.utils.data.Dataset):
    """Map-style dataset over (head, relation) queries, i.e. triples without a tail.

    Args:
        triples: indexable collection of ``(head, relation)`` entries.
        entity2id: mapping from an entity key to its integer id.
        relation2id: mapping from a relation key to its integer id.
    """

    def __init__(self, triples, entity2id, relation2id):
        self.triples, self.entity2id, self.relation2id = triples, entity2id, relation2id

    def __len__(self):
        """Number of stored queries."""
        return len(self.triples)

    def __getitem__(self, idx):
        """Return ``(head_id, relation_id)`` for the query at ``idx``."""
        h, r = self.triples[idx]
        return self.entity2id[h], self.relation2id[r]

In [5]:
def Dataset2Graph(dataset):
    """Build a ``torch_geometric.data.Data`` graph from a dataset of id triples.

    Every distinct ``(head_id, relation_id, tail_id)`` item becomes one directed
    edge ``head_id -> tail_id``; exact duplicates are dropped, keeping the first
    occurrence so edge order follows dataset order.

    Args:
        dataset: object supporting ``len()`` and integer indexing that yields
            ``(head_id, relation_id, tail_id)``.

    Returns:
        A ``Data`` object with
        * ``edge_index``: ``torch.long`` tensor of shape ``[2, E]``
        * ``e_id``: ``torch.long`` tensor of shape ``[E]`` holding the relation
          id of each edge.
        The shapes hold for ``E == 0`` and ``E == 1`` as well.
    """
    seen = set()
    endpoints = []
    relation_ids = []
    for idx in range(len(dataset)):
        head_id, relation_id, tail_id = dataset[idx]
        triple = (head_id, relation_id, tail_id)
        if triple in seen:
            continue
        seen.add(triple)
        endpoints.append([head_id, tail_id])  # head and tail form the edge
        relation_ids.append(relation_id)

    # reshape(-1, 2) keeps the [E, 2] layout even when there are no edges, so
    # the transpose always yields [2, E].
    edge_index = torch.tensor(endpoints, dtype=torch.long).reshape(-1, 2).t().contiguous()
    # Relation ids are kept as a flat per-edge attribute. Building them flat
    # (rather than [E, 1] followed by squeeze()) avoids collapsing a
    # single-edge graph to a 0-d tensor.
    e_id = torch.tensor(relation_ids, dtype=torch.long)

    graph = torch_geometric.data.Data(edge_index=edge_index, e_id=e_id)
    return graph


In [6]:
# --- Load the raw triples -------------------------------------------------
# Train/validation files provide (Head, Relation, Tail); the test file only
# provides (Head, Relation) because the tail is what gets predicted.
train_set = pd.read_csv("MyData/train_R.csv")[['Head', 'Relation', 'Tail']]
va_set = pd.read_csv("MyData/vali_R.csv")[['Head', 'Relation', 'Tail']]
# Convert each DataFrame row into a plain tuple.
train = [tuple(row) for row in train_set.itertuples(index=False)]
va = [tuple(row) for row in va_set.itertuples(index=False)]
test_set=pd.read_csv("MyData/test.csv")[['Head','Relation']]
test=[tuple(row) for row in test_set.itertuples(index=False)]
# Lookup tables: the 'Name' column is the key, the 'ID' column the integer id.
relation2text = pd.read_csv("MyData/relation.csv", encoding='UTF-8')[['Name', 'Text', 'ID']]
entity2text = pd.read_csv("MyData/entity.csv", encoding='UTF-8')[['Name', 'Text', 'ID']]
eDic = entity2text.set_index('Name')['ID'].to_dict()
rDic = relation2text.set_index('Name')['ID'].to_dict()
# The string below is the author's inline note. In English: at this point
# va, test and train are lists of tuples, and eDic / rDic map name -> ID.
"""
此时：
va,test,train=[tuple]
xDic=['name':ID]

"""
# --- Wrap the triples in datasets ----------------------------------------
# KG is built from the training triples and is what the graph is made from.
KG=KGDataset(train,eDic,rDic)
# NOTE(review): `train` holds 3-tuples here, but predDataset.__getitem__
# unpacks exactly two values, so indexing `predict` would raise ValueError.
# `predict` is not used anywhere in the visible cells - confirm and remove/fix.
predict=predDataset(train,eDic,rDic)
# From here on `train` and `test` are rebound from lists of tuples to
# dataset objects; the statements above must run before these lines.
train=KGDataset(train,eDic,rDic)
vali=KGDataset(va,eDic,rDic)
test=predDataset(test,eDic,rDic)
# --- Build the graph ------------------------------------------------------
graph=Dataset2Graph(KG)
# One node per entity in the entity table (not only those that appear in an edge).
graph.num_nodes=len(eDic)
graph.num_edges=len(graph.edge_index[0])
# Per-node entity id, in eDic insertion order.
# NOTE(review): downstream code uses these ids both as node positions and as
# embedding indices, which presumably requires ids 0..N-1 in file order - TODO confirm.
graph.n_id =torch.tensor(list(eDic.values()))
graph=graph.to(device)
#graph.x=torch.tensor([[i] for i in range(len(eDic))])
graph

Data(edge_index=[2, 1252550], e_id=[1252550], num_nodes=249746, num_edges=1252550, n_id=[249746])

In [17]:
class Frame(nn.Module):
    """Training / evaluation harness for tail prediction on a knowledge graph.

    For a batch of ``(head, relation)`` queries the harness samples a subgraph
    around the heads, runs ``network`` over it, concatenates each head's node
    representation with the relation embedding and feeds the result to a
    linear layer that scores every entity as a candidate tail.

    Args:
        graph: graph object carrying ``edge_index``, ``n_id`` (per-node entity
            id) and ``num_nodes``.
        num_entities: number of entities (size of the output layer).
        num_relations: number of relations.
        lr: initial Adam learning rate.
        embed_dim: entity / relation embedding width.
        batch_size: batch size for training, validation and prediction.
        num_epochs: number of training epochs.
        num_layers: forwarded to ``network``.
        heads: forwarded to ``network``.
    """

    def __init__(self, graph,num_entities,num_relations,
                 lr=0.0001, embed_dim=16,batch_size=512,
                 num_epochs=25, num_layers=4,heads=2):
        super(Frame, self).__init__()
        self.graph = graph
        # Copy of the graph with every edge direction flipped (rows of
        # edge_index swapped). Not used by get_sub at present; kept as a
        # public attribute for a possible bidirectional sampling variant.
        self.reversed_graph = copy.deepcopy(self.graph)
        self.reversed_graph.edge_index = self.reversed_graph.edge_index[[1, 0], :]
        self.lr = lr
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.num_layers = num_layers
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.num_entities = num_entities
        self.num_relations = num_relations
        self.embed_dim = embed_dim

        # Not used by the training loop (which uses self.criterion); retained
        # so the attribute stays available.
        self.loss = torch.nn.BCEWithLogitsLoss()
        self.network = network(num_entities=num_entities, num_relations=num_relations,
                               embed_dim=embed_dim, num_layers=num_layers,
                               heads=heads).to(self.device)
        # Input is [node representation | relation embedding], hence 2 * embed_dim.
        self.fc = nn.Linear(embed_dim * 2, self.num_entities).to(self.device)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()

    def _score(self, anchor, relation, inverse=False):
        """Return per-entity logits for a batch of (anchor, relation) queries.

        Args:
            anchor: 1-D tensor of entity ids to start from (on ``self.device``).
            relation: 1-D tensor of relation ids, same length as ``anchor``.
            inverse: when True the relation embedding is negated (used for the
                tail -> head auxiliary objective).

        Returns:
            Tensor of shape ``[len(anchor), num_entities]``.
        """
        sub = self.get_sub(anchor)
        new_graph = self.network(sub)
        # Locate each anchor inside the subgraph. searchsorted works on the
        # *sorted* ids, so its result must be mapped back through the sort
        # permutation before indexing new_graph.x, whose rows follow the
        # original (unsorted) n_id order.
        sorted_ids, order = new_graph.n_id.sort()
        positions = torch.searchsorted(sorted_ids, anchor)
        node_repr = new_graph.x[order[positions]]
        rel_repr = self.network.relation_embedding(relation)
        if inverse:
            rel_repr = -rel_repr
        return self.fc(torch.cat((node_repr, rel_repr), dim=1))

    def vali_model(self,vali):
        """Return top-1 tail accuracy on ``vali`` (0.0 when ``vali`` is empty).

        Args:
            vali: dataset yielding ``(head_id, relation_id, tail_id)``.
        """
        self.eval()
        total_correct = 0
        total_samples = 0
        vali_loader = torch.utils.data.DataLoader(vali, batch_size=self.batch_size, shuffle=False)
        with torch.no_grad():
            for head, relation, tail in vali_loader:
                head, relation, tail = head.to(self.device), relation.to(self.device), tail.to(self.device)
                output = self._score(head, relation)
                predicted_tail = torch.argmax(output, dim=1)
                total_correct += (predicted_tail == tail).sum().item()
                total_samples += len(tail)
        if total_samples == 0:
            # Nothing to evaluate; avoid dividing by zero.
            return 0.0
        return total_correct / total_samples

    def predict_model(self,test,entity2id,top_k=10,name="",query_frame=None):
        """Write the ``top_k`` predicted tails for every query to a TSV file.

        The output is ``./r/result<name>.tsv`` (tab separated, no header, no
        index): the query columns followed by ``top_k`` predicted entity names.

        Args:
            test: dataset yielding ``(head_id, relation_id)``.
            entity2id: mapping from entity key to id; inverted to name the
                predictions.
            top_k: number of candidates written per query.
            name: suffix inserted into the output file name.
            query_frame: DataFrame whose rows line up with ``test`` and are
                written next to the predictions. Defaults to the notebook-level
                ``test_set`` DataFrame.
        """
        import os

        self.eval()
        id2entity = {v: k for k, v in entity2id.items()}
        results = []
        test_loader = torch.utils.data.DataLoader(test, batch_size=self.batch_size, shuffle=False)
        with torch.no_grad():
            for head, relation in test_loader:
                head, relation = head.to(self.device), relation.to(self.device)
                output = self._score(head, relation)
                _, top_indices = torch.topk(output, top_k, dim=1)
                # One host transfer per batch instead of one .item() per index.
                for candidate_ids in top_indices.tolist():
                    results.append([id2entity[i] for i in candidate_ids])
        results = pd.DataFrame(results)
        if query_frame is None:
            query_frame = test_set
        r = pd.concat([query_frame, results], axis=1)
        # to_csv does not create missing directories.
        os.makedirs("./r", exist_ok=True)
        r.to_csv("./r/result"+name+".tsv",header=None,sep='\t',index=False)

    def train_model(self,train,vali,test,entity2id,flag=False):
        """Train for ``self.num_epochs`` epochs, validating after each one.

        Whenever validation accuracy improves on the best seen so far,
        predictions for ``test`` are written via ``predict_model`` with the
        accuracy as the file-name suffix.

        Args:
            train: dataset yielding ``(head_id, relation_id, tail_id)``.
            vali: validation dataset of the same form.
            test: dataset yielding ``(head_id, relation_id)``.
            entity2id: mapping from entity key to id (passed to ``predict_model``).
            flag: when True, add an auxiliary tail -> head objective that uses
                the negated relation embedding.
        """
        best_acc = 0
        for epoch in range(self.num_epochs):
            self.train()
            train_loader = torch.utils.data.DataLoader(train, batch_size=self.batch_size, shuffle=True)
            progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{self.num_epochs}', leave=True)
            last_loss = None
            for head, relation, tail in progress_bar:
                # Release cached GPU blocks before each step; a single call is
                # equivalent to repeating it.
                torch.cuda.empty_cache()
                self.optimizer.zero_grad()
                head, relation, tail = head.to(self.device), relation.to(self.device), tail.to(self.device)

                output = self._score(head, relation)
                loss1 = self.criterion(output, tail)
                # Same objective on logits divided by 1.1 (softened distribution).
                loss2 = self.criterion(output / 1.1, tail)
                if flag:
                    inverse_output = self._score(tail, relation, inverse=True)
                    loss3 = self.criterion(inverse_output, head)
                    loss = loss1 * 0.8 + loss2 * 0.1 + loss3 * 0.1
                else:
                    loss = loss1 * 0.95 + loss2 * 0.05
                loss.backward()
                self.optimizer.step()
                last_loss = loss.detach()

            acc = self.vali_model(vali)
            if best_acc < acc:
                best_acc = acc
                self.predict_model(test, entity2id=entity2id, name=str(acc))

            # Step-wise schedule keyed on validation accuracy; below 0.3 the
            # learning rate is left unchanged.
            previous_lr = self.lr
            if acc > 0.5:
                self.lr = 0.001
            elif acc > 0.40:
                self.lr = 0.005
            elif acc > 0.35:
                self.lr = 0.03
            elif acc > 0.3:
                self.lr = 0.05
            if previous_lr != self.lr:
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

            # The reported loss is that of the last batch of the epoch.
            loss_value = last_loss.item() if last_loss is not None else float('nan')
            tqdm.write(f'Epoch {epoch + 1}, Loss: {loss_value:.3f}, Acc: {acc * 100 :.2f}%')
            progress_bar.refresh()

    def get_sub(self,head):
        """Sample the one-hop neighbourhood subgraph around the given entities.

        Args:
            head: 1-D tensor of entity ids.

        Returns:
            The single batch produced by a ``NeighborLoader`` seeded with the
            nodes whose ``n_id`` occurs in ``head``.

        Raises:
            ValueError: if the loader yields no batch (e.g. no seed node matched).
        """
        # Boolean mask over the graph's nodes: True where the node's id is in `head`.
        mask = torch.isin(self.graph.n_id, head.to(self.device))
        # Indices of the True entries, i.e. the seed nodes.
        nodes = torch.nonzero(mask, as_tuple=True)[0]
        # batch_size equals the node count, so all seeds land in one batch;
        # num_neighbors=[-1] requests every neighbour for a single hop.
        loader = NeighborLoader(self.graph,
                                input_nodes=nodes,
                                num_neighbors=[-1],
                                batch_size=self.graph.num_nodes,
                                shuffle=False)
        for batch in loader:
            return batch
        raise ValueError("get_sub: NeighborLoader produced no batch for the given ids")
    
        
class network(nn.Module):
    """Entity / relation embeddings followed by a stack of GAT layers.

    Args:
        num_entities: number of rows in the entity embedding table.
        num_relations: number of rows in the relation embedding table.
        embed_dim: embedding width; also the width of the final node output.
        num_layers: controls the depth of the GAT stack, which is one input
            layer, ``num_layers - 2`` middle layers and one output layer.
        heads: number of attention heads per GAT layer.
    """

    def __init__(self,num_entities,num_relations,embed_dim,num_layers,heads):
        super(network, self).__init__()
        self.embed_dim = embed_dim
        self.num_relations = num_relations
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.entity_embedding = nn.Embedding(num_entities, embed_dim).to(self.device)
        self.relation_embedding = nn.Embedding(num_relations, embed_dim).to(self.device)

        # The first layer concatenates its heads, so later layers take
        # embed_dim * heads input features.
        wide = embed_dim * heads
        stack = [GATConv(embed_dim, embed_dim, heads=heads)]
        stack.extend(
            GATConv(wide, wide, heads=heads, concat=False)
            for _ in range(num_layers - 2)
        )
        stack.append(GATConv(wide, embed_dim, heads=heads, concat=False))
        self.gat_layers = torch.nn.ModuleList(stack)

        # Alternating GCN / SAGE layers. They are constructed and registered
        # but forward() does not apply them.
        self.layers = nn.ModuleList()
        for _ in range(int(num_layers / 2)):
            self.layers.extend([
                GCNConv(embed_dim, embed_dim),
                SAGEConv(embed_dim, embed_dim),
            ])

    def forward(self,graph):
        """Compute node representations and attach them to ``graph`` as ``graph.x``.

        Node inputs are entity embeddings looked up by ``graph.n_id``; each GAT
        layer is followed by a ReLU. The same ``graph`` object is returned.
        """
        hidden = self.entity_embedding(graph.n_id)
        for conv in self.gat_layers:
            hidden = F.relu(conv(hidden, graph.edge_index))
        graph.x = hidden
        return graph


# Build the harness on the full training graph; entity / relation counts come
# from the lookup tables loaded above.
frame=Frame(graph=graph,num_entities=len(eDic),num_relations=len(rDic),
           lr=0.1, embed_dim=8,batch_size=1000,
                 num_epochs=50, num_layers=3,heads=1
           )

# Train with per-epoch validation. Test-set predictions are written to disk
# each time validation accuracy improves. flag=False disables the auxiliary
# tail -> head objective.
frame.train_model(train,vali,entity2id=eDic,test=test,flag=False)
#result=frame.predict_model(test,entity2id=eDic,name=str(2.1))
#result = pd.DataFrame(result)
#r=pd.concat([test_set,result],axis=1)
#r.to_csv("result.tsv",header=None,sep='\t',index=False)