In [1]:
import os
import pandas as pd 

# Load the four CSV inputs from the relative data/ directory.
# NOTE(review): judging by how the frames are used further down, train holds
# (paper, referenced_paper, is_referenced) pairs, paper_meta holds per-paper
# metadata and data_clean holds the cleaned text — confirm against the files.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
paper_meta = pd.read_csv("data/papers_metadata.csv")
data_clean = pd.read_csv("data/clean.csv")


In [2]:
# Give the cleaned-text table a join key by copying paper_id over by position.
# NOTE(review): this assumes data_clean and paper_meta list the papers in the
# same row order; nothing here verifies that — confirm against the source files.
data_clean['paper_id'] = paper_meta['paper_id'].values

# One row per paper: cleaned text columns followed by the metadata columns.
new_df = data_clean.merge(paper_meta, how='inner', on='paper_id')


In [3]:
# Attach text and metadata for the `paper` side of every training pair ("paper A").
# The left join keeps every row of `train`, matched or not; the joined columns
# are tagged with an _A suffix in the same expression.
train_merged = train.merge(
    new_df, how="left", left_on="paper", right_on="paper_id"
).rename(
    columns={
        "text": "text_A",
        "lemmetized": "lemmetized_A",
        "title": "title_A",
        "publication_year": "year_A",
        "publication_date": "date_A",
        "cited_by_count": "cited_count_A",
        "type": "type_A",
        "authors": "authors_A",
        "concepts": "concepts_A",
    }
)


In [4]:
# Attach the same text and metadata for the `referenced_paper` side ("paper B").
# Columns already present from the first merge (paper_id, doi) collide and get
# the default _x/_y suffixes; doi is not in the rename map, so it stays as
# doi_x / doi_y.
train_merged = (
    train_merged
    .merge(new_df, how="left", left_on="referenced_paper", right_on="paper_id")
    # Drop the paper_id column brought in by the second merge (the first merge
    # already supplies one).
    .drop(columns=["paper_id_y"])
    .rename(
        columns={
            "text": "text_B",
            "lemmetized": "lemmetized_B",
            "title": "title_B",
            "publication_year": "year_B",
            "publication_date": "date_B",
            "cited_by_count": "cited_count_B",
            "type": "type_B",
            "authors": "authors_B",
            "concepts": "concepts_B",
            # Restore the plain name for the paper_id kept from the first merge.
            "paper_id_x": "paper_id",
        }
    )
)


In [5]:
def extract_intro(text, num_words=100):
    """Return the leading words of ``text`` as a single-space-joined string.

    Args:
        text: The document text. Anything that is not a ``str`` (for example
            a NaN from a missing value) is treated as having no content.
        num_words: Upper bound of the slice taken from the whitespace-split
            tokens.

    Returns:
        The first ``num_words`` whitespace-separated tokens joined by single
        spaces, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        tokens = text.split()
        return " ".join(tokens[:num_words])
    return ""

# First 100 tokens of each paper's lemmatised text; non-string cells (e.g. NaN
# from an unmatched merge) become "". The keyword is forwarded to extract_intro
# by Series.apply.
train_merged["intro_A"] = train_merged["lemmetized_A"].apply(extract_intro, num_words=100)
train_merged["intro_B"] = train_merged["lemmetized_B"].apply(extract_intro, num_words=100)


In [7]:
# Inspect the column names produced by the two merges and the intro extraction.
train_merged.columns

Index(['paper', 'referenced_paper', 'is_referenced', 'text_A', 'lemmetized_A',
       'paper_id', 'doi_x', 'title_A', 'year_A', 'date_A', 'cited_count_A',
       'type_A', 'authors_A', 'concepts_A', 'text_B', 'lemmetized_B', 'doi_y',
       'title_B', 'year_B', 'date_B', 'cited_count_B', 'type_B', 'authors_B',
       'concepts_B', 'intro_A', 'intro_B'],
      dtype='object')

In [8]:
# Keep only the pairs whose is_referenced flag equals 1 and preview them.
is_positive_pair = train_merged["is_referenced"].eq(1)
referenced_only = train_merged.loc[is_positive_pair]
referenced_only.head()


Unnamed: 0,paper,referenced_paper,is_referenced,text_A,lemmetized_A,paper_id,doi_x,title_A,year_A,date_A,...,doi_y,title_B,year_B,date_B,cited_count_B,type_B,authors_B,concepts_B,intro_A,intro_B
153,p0067,p4101,1,"Journal of Ethnic and Cultural Studies\n2020, ...",journal ethnic cultural study vol copyright is...,p0067,https://doi.org/10.29333/ejecs/388,The Perceptions of Primary School Teachers of ...,2020,7/4/2020,...,https://doi.org/10.3991/ijac.v3i1.987,An Educational Model for Asynchronous E-Learni...,2010,2/18/2010,34,article,Dimitris Papachristos; Nikolaos Alafodimos; Ko...,Asynchronous communication; The Internet; Pres...,journal ethnic cultural study vol copyright is...,educational model asynchronous e learning case...
226,p2769,p0127,1,Are Transformers Effective for Time Series For...,transformer effective time series forecasting ...,p2769,https://doi.org/10.1609/aaai.v37i9.26317,Are Transformers Effective for Time Series For...,2023,6/26/2023,...,https://doi.org/10.1109/iccv48922.2021.00986,Swin Transformer: Hierarchical Vision Transfor...,2021,10/1/2021,17318,article,Ze Liu; Yutong Lin; Yue Cao; Han Hu; Yixuan Wei,Transformer; Computer science; Segmentation; A...,transformer effective time series forecasting ...,swin transformer hierarchical vision transform...
310,p2361,p3848,1,"raditional recommender systems, such as those ...",raditional recommender system based content ba...,p2361,https://doi.org/10.1609/aimag.v32i3.2364,Context‐Aware Recommender Systems,2011,9/1/2011,...,https://doi.org/10.1145/371920.372071,Item-based collaborative filtering recommendat...,2001,4/1/2001,8547,article,Badrul Sarwar; George Karypis; Joseph A. Konst...,Research center; Library science; Computer Sci...,raditional recommender system based content ba...,item based collaborative filtering recommendat...
373,p2236,p3312,1,Graph neural networks: A review of methods and...,graph neural network review method application...,p2236,https://doi.org/10.48550/arxiv.1812.08434,Graph Neural Networks: A Review of Methods and...,2018,1/1/2018,...,https://doi.org/10.1007/s10822-016-9938-8,Molecular graph convolutions: moving beyond fi...,2016,8/1/2016,997,article,Steven Kearnes; Kevin McCloskey; Marc Berndl; ...,Graph; Computer science; Artificial intelligen...,graph neural network review method application...,molecular graph convolution moving beyond fing...
483,p2419,p4325,1,Evaluation of Output Embeddings for Fine-Grain...,evaluation output embeddings fine grained imag...,p2419,https://doi.org/10.1109/cvpr.2015.7298911,Evaluation of output embeddings for fine-grain...,2015,6/1/2015,...,https://doi.org/10.1109/iccv.2011.6126386,Human action recognition by learning bases of ...,2011,11/1/2011,617,article,Bangpeng Yao; Xiaoye Jiang; Aditya Khosla; And...,Action (physics); Action recognition; Computer...,evaluation output embeddings fine grained imag...,human action recognition learning base action ...


In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load the pretrained 'allenai-specter' checkpoint via sentence-transformers;
# it is used below to embed the combined title + intro text of each paper.
# NOTE(review): the original note describes it as a SciBERT-based model for
# scientific paper similarity — confirm against the model card.
model = SentenceTransformer('allenai-specter')





In [10]:
# Combine title and intro into one text per document: "<title>. <intro>".
# Missing values are replaced by empty strings first so the concatenation
# never produces NaN.
title_A = train_merged["title_A"].fillna('')
intro_A = train_merged["intro_A"].fillna('')
train_merged["combined_text_A"] = title_A + ". " + intro_A

title_B = train_merged["title_B"].fillna('')
intro_B = train_merged["intro_B"].fillna('')
train_merged["combined_text_B"] = title_B + ". " + intro_B


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between paper A and paper B for each training row.
#
# Each distinct text is encoded once and its vector is looked up per row,
# because a paper may appear in many pairs and encoding is the costly step.
# The cosine is computed row-wise instead of building a batch x batch matrix
# and reading only its diagonal.
batch_size = 64

# Only take the first 1000 rows.
train_subset = train_merged.iloc[:1000].copy()

texts_A = train_subset["combined_text_A"].tolist()
texts_B = train_subset["combined_text_B"].tolist()

# dict.fromkeys de-duplicates while keeping first-seen order, so the run is
# deterministic.
unique_texts = list(dict.fromkeys(texts_A + texts_B))

if unique_texts:
    # encode() already splits its input into batches of `batch_size`, so no
    # manual slicing loop is needed.
    embeddings = model.encode(
        unique_texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,  # this step is slow; make progress visible
    )
    embeddings = np.asarray(embeddings, dtype=np.float64)

    # L2-normalise each vector; a zero vector is left as zeros (similarity 0)
    # instead of dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_embeddings = embeddings / np.where(norms == 0, 1.0, norms)

    row_of_text = {text: row for row, text in enumerate(unique_texts)}
    rows_A = [row_of_text[text] for text in texts_A]
    rows_B = [row_of_text[text] for text in texts_B]

    # Row-wise dot product of unit vectors == cosine similarity of pair i.
    all_similarities = np.einsum(
        "ij,ij->i", unit_embeddings[rows_A], unit_embeddings[rows_B]
    ).tolist()
else:
    # Empty subset: nothing to encode.
    all_similarities = []

# Attach the scores to the DataFrame.
# NOTE(review): the column is called "specter2_sim" although the checkpoint
# loaded in this notebook is 'allenai-specter'; the name is kept unchanged so
# any later cells that read it keep working.
train_subset["specter2_sim"] = all_similarities


KeyboardInterrupt: 