In [2]:
# ===============================================================
# 🚀 NASA Space Biology Keyword Extraction Pipeline
# ===============================================================

# 1️⃣ Install dependencies
!pip install keybert pandas tqdm sentence-transformers

# 2️⃣ Mount Google Drive
from google.colab import drive
import os

# Working folder for this notebook's input CSV and output JSON.
base_dir = "/content/drive/MyDrive/nasa"

# Mount Drive before creating the folder, since base_dir lives under the
# mount point.
mount_point = '/content/drive'
drive.mount(mount_point)
os.makedirs(name=base_dir, exist_ok=True)

# 3️⃣ Import modules
import pandas as pd
import json
from keybert import KeyBERT
from tqdm import tqdm

# 4️⃣ Load your CSV
# The publications CSV must already be present in the Drive working folder.
csv_path = base_dir + "/SB_publication_PMC.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Report the number of rows that were read.
print(f"✅ Loaded CSV with {df.shape[0]} entries")

# 5️⃣ Initialize KeyBERT model
# Name of the embedding model handed to KeyBERT, kept in one place so it is
# easy to swap.
embedding_model_name = 'all-MiniLM-L6-v2'
kw_model = KeyBERT(model=embedding_model_name)

# 6️⃣ Extract keywords and build JSON structure
# One record per CSV row: {"title": str, "link": str, "keywords": [str, ...]}.
json_data = []
# (title, error) pairs for rows whose keyword extraction raised; reported below.
failed_titles = []

# Iterate only the two columns that are needed; DataFrame.iterrows() would
# build a full Series object for every row.
for raw_title, raw_link in tqdm(zip(df['Title'], df['Link']), total=len(df)):
    # str(NaN) is the literal text "nan". Map missing cells to "" so that no
    # fake title/link (or keywords extracted from the word "nan") is exported.
    title = "" if pd.isna(raw_title) else str(raw_title)
    link = "" if pd.isna(raw_link) else str(raw_link)

    keyword_list = []
    if title.strip():  # nothing to extract from an empty/blank title
        try:
            keywords = kw_model.extract_keywords(
                title,
                keyphrase_ngram_range=(1, 3),
                stop_words='english',
                top_n=8
            )
            # extract_keywords yields (phrase, score) pairs; keep the phrases.
            keyword_list = [kw for kw, _ in keywords]
        except Exception as e:
            # Best effort: keep the row with an empty keyword list, but record
            # the failure so it is reported instead of vanishing silently.
            failed_titles.append((title, repr(e)))

    json_data.append({
        "title": title,
        "link": link,
        "keywords": keyword_list
    })

# Summarise failures after the loop so the messages do not break up the
# progress bar.
if failed_titles:
    print(f"⚠️ Keyword extraction failed for {len(failed_titles)} of {len(json_data)} titles:")
    for failed_title, error in failed_titles:
        print(f"   - {failed_title!r}: {error}")

# 7️⃣ Save to JSON
json_path = f"{base_dir}/nasa_space_bio_keywords.json"
# Write UTF-8 explicitly (do not rely on the platform default encoding) and
# keep non-ASCII characters in titles readable instead of \uXXXX escapes.
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)

print(f"✅ JSON file created successfully: {json_path}")
# Derive the displayed file name from json_path so the message cannot drift
# from the path that was actually written.
print(f"📁 Saved {len(json_data)} entries to {os.path.basename(json_path)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loaded CSV with 607 entries


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 607/607 [01:30<00:00,  6.71it/s]

✅ JSON file created successfully: /content/drive/MyDrive/nasa/nasa_space_bio_keywords.json
📁 Saved 607 entries to nasa_space_bio_keywords.json



