In [1]:
import string

import nltk
import pandas as pd
from nltk import ngrams
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Read the dataset
file_path = '../dataset/sample_dataset.csv'  # Replace with your file path
columns_to_keep = ['abstract']
# Parse only the needed column: this avoids loading unused columns and fails
# with a clear error right here if 'abstract' is missing (DataFrame.filter
# silently ignores missing labels and would have produced an empty frame).
data = pd.read_csv(file_path, usecols=columns_to_keep).dropna()

# Sample data and split into train and test sets
# Cap the sample size at the number of available rows so that a dataset with
# fewer than 10000 non-null abstracts does not make DataFrame.sample raise.
sample_size = min(10000, len(data))
random_sample = data.sample(n=sample_size, random_state=42)
train_data, test_data = train_test_split(random_sample, test_size=0.2, random_state=42)

In [2]:
# Print the training split returned by train_test_split
print(train_data)

                                                 abstract
634711  In recent years, the number of applications ut...
745444  Quality-of-Service attributes such as performa...
832989  In ambulatory electroencephalogram (EEG) healt...
598136  Recent studies have shown that the IEEE 802.15...
231785  A linear extension of a poset $P$ is a permuta...
...                                                   ...
253515  On-line signature verification still remains a...
51030   The problem of providing throughput fairness i...
56468   The Direct Access File System (DAFS) is a dist...
623650  Fibrous dysplasia (FD) is a developmental anom...
17567   Despite the prevalence of long noncoding RNA (...

[8000 rows x 1 columns]


In [3]:
# Fetch the NLTK stopword corpus if it is not already available locally.
# quiet=True suppresses the downloader's log lines, which otherwise write the
# absolute nltk_data path (including the local user name) into the cell output.
nltk.download('stopwords', quiet=True)
# Stored as a set so that membership checks in remove_stopwords are hash lookups
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with stopwords removed.

    ``text`` is split on whitespace and each token is compared
    case-insensitively against the module-level ``stop_words`` set. A token
    is dropped when either its lower-cased form or its lower-cased form with
    leading/trailing ASCII punctuation stripped is a stopword, so tokens such
    as ``"However,"`` or ``"(the)"`` are removed as well. Kept tokens are
    returned unchanged (original case and punctuation), joined by single
    spaces.

    Args:
        text: The string to filter.

    Returns:
        str: The remaining tokens joined by a single space.
    """
    kept_tokens = []
    for token in text.split():
        lowered = token.lower()
        # Punctuation is still attached at this stage of the notebook, so the
        # stripped form must be checked too or "the," would slip through.
        bare = lowered.strip(string.punctuation)
        if lowered in stop_words or bare in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Drop stopwords from every training abstract, overwriting the column in place
train_data['abstract'] = train_data['abstract'].map(remove_stopwords)

print(train_data)

[nltk_data] Downloading package stopwords to C:\Users\Komal
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                 abstract
634711  recent years, number applications utilizing mo...
745444  Quality-of-Service attributes performance reli...
832989  ambulatory electroencephalogram (EEG) health c...
598136  Recent studies shown IEEE 802.15.4 MAC protoco...
231785  linear extension poset $P$ permutation element...
...                                                   ...
253515  On-line signature verification still remains c...
51030   problem providing throughput fairness wired-cu...
56468   Direct Access File System (DAFS) distributed f...
623650  Fibrous dysplasia (FD) developmental anomaly n...
17567   Despite prevalence long noncoding RNA (lncRNA)...

[8000 rows x 1 columns]


In [5]:
import re
def clean_text(text):
    """Strip punctuation from ``text`` and lower-case what remains.

    Every character that is neither a regex word character nor whitespace is
    deleted outright (not replaced by a space), so the pieces on either side
    of a hyphen or dot end up joined into a single token.
    """
    punctuation_pattern = r'[^\w\s]'
    without_punctuation = re.sub(punctuation_pattern, '', text)
    lowered = without_punctuation.lower()
    return lowered

# Run clean_text over every training abstract, overwriting the column in place
train_data['abstract'] = train_data['abstract'].map(clean_text)

In [6]:
# Print train_data again; the 'abstract' column now holds the clean_text output
print(train_data)

                                                 abstract
634711  recent years number applications utilizing mob...
745444  qualityofservice attributes performance reliab...
832989  ambulatory electroencephalogram eeg health car...
598136  recent studies shown ieee 802154 mac protocol ...
231785  linear extension poset p permutation elements ...
...                                                   ...
253515  online signature verification still remains ch...
51030   problem providing throughput fairness wiredcum...
56468   direct access file system dafs distributed fil...
623650  fibrous dysplasia fd developmental anomaly nor...
17567   despite prevalence long noncoding rna lncrna g...

[8000 rows x 1 columns]


In [7]:
# Load ../dataset/node2vec_embeddings.csv; find_matching_ngrams reads its 'node_id' column
node2vec_df = pd.read_csv('../dataset/node2vec_embeddings.csv')
def generate_ngrams(text):
    """Return the unigrams, bigrams and trigrams of ``text`` as one flat list.

    ``text`` is split on whitespace. The result lists the single tokens
    first, then every bigram, then every trigram; each multi-word entry is
    its tokens joined by a single space.
    """
    tokens = text.split()
    grams = list(tokens)
    for order in (2, 3):
        grams.extend(' '.join(window) for window in ngrams(tokens, order))
    return grams

def find_matching_ngrams(words, node_ids=None):
    """Return the entries of ``words`` that occur as a node id.

    Args:
        words: Iterable of candidate strings (the notebook passes the list
            produced by ``generate_ngrams``).
        node_ids: Optional container of ids to match against. When omitted,
            the ids are taken from ``node2vec_df['node_id']``.

    Returns:
        list: The matching entries, in their original order with duplicates
        preserved.
    """
    if node_ids is None:
        # Testing `word in <numpy array>` scans the whole column for every
        # single n-gram. Build a hash set once and reuse it across calls; the
        # cache is keyed on the dataframe object so that re-assigning
        # node2vec_df in another cell triggers a rebuild.
        cached = getattr(find_matching_ngrams, '_node_id_cache', None)
        if cached is None or cached[0] is not node2vec_df:
            cached = (node2vec_df, frozenset(node2vec_df['node_id']))
            find_matching_ngrams._node_id_cache = cached
        node_ids = cached[1]
    return [word for word in words if word in node_ids]

# Build the candidate n-grams for each abstract, then keep only those that match a node id
train_data['ngrams'] = train_data['abstract'].map(generate_ngrams)
train_data['matched_ngrams'] = train_data['ngrams'].map(find_matching_ngrams)

In [8]:
# Assemble the export table: abstract text and its matched n-grams under capitalised headers
result_df = train_data[['abstract', 'matched_ngrams']].rename(
    columns={'abstract': 'Abstract', 'matched_ngrams': 'Matched_Ngrams'}
)

# Write the table to CSV without the row index
result_df.to_csv('abstracts_with_matched_ngrams.csv', index=False)