In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("citation-cooked.csv")
df.head()

Unnamed: 0,Title,Authors,Year,Venue,Index,References,Abstract
0,OQL[C++]: Extending C++ with an Object Query C...,José A. Blakeley,1995,Modern Database Systems,0,,
1,Transaction Management in Multidatabase Systems.,"Yuri Breitbart,Hector Garcia-Molina,Abraham Si...",1995,Modern Database Systems,1,,
2,Overview of the ADDS System.,"Yuri Breitbart,Tom C. Reyes",1995,Modern Database Systems,2,,
3,Multimedia Information Systems: Issues and App...,"Stavros Christodoulakis,Leonidas Koveos",1995,Modern Database Systems,3,,
4,Active Database Systems.,"Umeshwar Dayal,Eric N. Hanson,Jennifer Widom",1995,Modern Database Systems,4,,


In [3]:
sorted_df = df.sort_values(by=['Year'], ascending=False).reset_index(drop=True)
sorted_df.head()   

Unnamed: 0,Title,Authors,Year,Venue,Index,References,Abstract
0,An engine fault diagnosis system using intake ...,"Jian-Da Wu,Cheng-Kai Huang",2011,Expert Syst. Appl.,1492668,,
1,The acceptance and use of customer relationshi...,"Jung-Chi Pai,Fu-Ming Tu",2011,Expert Syst. Appl.,1492669,,
2,Two-stage structural damage detection using fu...,"Shao-Fei Jiang,Chun-Ming Zhang,Shuai Zhang",2011,Expert Syst. Appl.,1492654,,
3,A modular Decision Support System for optimum ...,"Kaveh Khalili Damghani,Soheil Sadi-Nezhad,M. B...",2011,Expert Syst. Appl.,1492655,,
4,Introduction of evidential contribution measur...,"Malcolm J. Beynon,Rhys Andrews",2011,Expert Syst. Appl.,1492656,,


In [4]:
# Select a subset of the data
n_rows = 1000
from_ids, to_ids = [], []
cnt = 0
for i, row in sorted_df.iterrows():
    refs = row["References"]
    if pd.isna(refs):
        continue
    refs = refs.split(", ")
    for ref in refs:
        from_ids.append(row["Index"])
        to_ids.append(int(ref))
    cnt += 1
    if cnt > n_rows:
        break
all_ids = list(set(from_ids + to_ids))
print(len(from_ids))
print(f"{len(all_ids)} / {df.shape[0]}")

6849
6897 / 1511035


In [5]:
all_ids = sorted(all_ids)

In [6]:
new_id = {old: new for new, old in enumerate(all_ids)}
new_df = df[df["Index"].isin(all_ids)]
new_df["Index"] = new_df["Index"].map(new_id)
len(new_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Index"] = new_df["Index"].map(new_id)


6897

In [7]:
new_from_ids = [new_id[x] for x in from_ids]
new_to_ids = [new_id[x] for x in to_ids]

In [8]:
import dgl
import torch

g = dgl.graph((torch.tensor(new_from_ids), torch.tensor(new_to_ids)))
g = dgl.to_bidirected(g)

In [1]:
from tqdm import tqdm
from transformers import AutoTokenizer, BertModel
import torch

bert_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
bert_model = BertModel.from_pretrained("google-bert/bert-base-uncased")

def get_embedding_bert(texts, model, tokenizer, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        inputs = tokenizer(texts[i:i+batch_size], return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        embeddings.append(outputs.pooler_output)
    return torch.cat(embeddings)

In [1]:
import torch.nn.functional as F
from torch import Tensor
import torch
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
model = AutoModel.from_pretrained("thenlper/gte-small")



from tqdm import tqdm
def get_embedding(texts, model, tokenizer, batch_size=64):
    outputs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        batch_inputs = tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors="pt")
        batch_outputs = model(**batch_inputs)
        batch_embeddings = average_pool(batch_outputs.last_hidden_state, batch_inputs['attention_mask'])
        outputs.append(batch_embeddings)
    return torch.cat(outputs)

cpu


In [12]:
corpus = new_df["Title"].fillna("") + "\n" + new_df["Abstract"].fillna("")
corpus_ids = new_df["Index"].tolist()
corpus = corpus.tolist()
prompted_corpus = []
for sen in corpus:
    title, abstract = sen.split("\n")
    title = title.strip()
    abstract = abstract.strip()
    if len(title) > 0 and len(abstract) > 0:
        prompt = f"Title: {title}\nAbstract: {abstract}\n"
    elif len(title) > 0:
        prompt = f"Title: {title}\n"
    elif len(abstract) > 0:
        prompt = f"Abstract: {abstract}\n"
    else:
        raise ValueError("Both title and abstract are empty")
    prompted_corpus.append(prompt)

In [15]:
prompted_corpus_dict = {
    i: prompted_corpus[i] for i in range(len(prompted_corpus))
}
import json
with open("prompted_corpus.json", "w") as f:
    json.dump(prompted_corpus_dict, f, indent=2)

In [None]:
import json
import numpy as np

prompts = json.load(open("prompted_corpus.json"))
# with open('extracted_prompts.npy', 'rb') as f:
#     all_extracted = np.load(f)
# print(all_extracted.shape)
all_extracted = []
texts = []
for i in range(len(all_extracted), min(len(all_extracted) + 640, len(prompts))):
    texts.append(prompts[str(i)])
temp_extracted = get_embedding(texts, model, tokenizer)
temp_extracted = temp_extracted.cpu().detach().numpy()
all_extracted = np.concatenate([all_extracted, temp_extracted], axis=0)

with open('extracted_prompts.npy', 'wb') as f:
    np.save(f, all_extracted)

In [14]:
# node_features = get_embedding(prompted_corpus, model, tokenizer)
node_features = get_embedding_bert(prompted_corpus, bert_model, bert_tokenizer)

In [9]:
import numpy as np
import torch

node_features = torch.tensor(np.load("extracted_prompts.npy"))
# print(node_features.shape)
g.ndata['feat'] = node_features
from dgl.data.utils import save_graphs
name =  "citation_6897_384.dgl"
save_graphs(name, g)