# In [None]:
## Generate keywords from title/abstract using TF-IDF and KeyBERT

# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT

# Shared KeyBERT instance, built once with default settings at module level
# and reused by extract_keywords() for every call.
# NOTE(review): constructing KeyBERT presumably loads an embedding model,
# which may be slow on first use — confirm against the KeyBERT docs.
kw_model = KeyBERT()

def extract_keywords(text, num_keywords=5):
    """Return the top key phrases of *text* as one comma-separated string.

    Args:
        text: The text to analyse. Missing values (as judged by ``pd.isna``)
            and strings that are empty after stripping whitespace are
            treated as "nothing to extract".
        num_keywords: Passed to KeyBERT as ``top_n``; the number of phrases
            requested.

    Returns:
        ``None`` when *text* is missing or blank; otherwise the extracted
        phrases joined with ``", "`` (an empty string if the model returns
        no phrases).
    """
    # Guard clauses: nothing to do for missing or whitespace-only input.
    if pd.isna(text):
        return None
    if text.strip() == "":
        return None

    # Ask the shared module-level model for 1- and 2-word phrases,
    # ignoring English stop words.
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=num_keywords,
    )

    # Only the first element of each result item is kept
    # (presumably (phrase, score) pairs — see the KeyBERT docs).
    phrases = []
    for item in scored_phrases:
        phrases.append(item[0])
    return ", ".join(phrases)

# Reads the dataset and, for every row whose 'Keywords' cell is missing or blank,
# extracts keywords from that row's abstract and title and writes them back into the dataset.

def process_csv(file_path, output_path="keyword_updated_projects.csv"):
    """Fill in missing 'Keywords' cells of a CSV file and save the result.

    For every row whose 'Keywords' value is missing or a blank string,
    keywords are extracted from the row's 'Abstract' and 'Title' with
    ``extract_keywords`` and combined (abstract phrases first, duplicates
    removed, order preserved). Rows that already have keywords are left
    untouched. A file with the column but no rows is written out unchanged.

    Args:
        file_path: Path of the CSV file to read (passed to ``pd.read_csv``).
        output_path: Path the updated CSV is written to, without the index.

    Raises:
        ValueError: If the CSV has no 'Keywords' column.
    """
    df = pd.read_csv(file_path)

    # Test for the column by name. Indexing df['Keywords'] on a file without
    # that column raises KeyError before any explicit check could run.
    if "Keywords" not in df.columns:
        raise ValueError("The CSV file does not contain a 'Keywords' column.")

    # A column with no values at all is read as float64; cast to object so
    # the extracted keyword strings can be stored in it.
    df["Keywords"] = df["Keywords"].astype(object)

    for index, row in df.iterrows():
        existing = row["Keywords"]
        # Only strings can be "blank"; any other non-missing value counts as
        # existing keywords and is left alone instead of crashing on .strip().
        is_blank = isinstance(existing, str) and existing.strip() == ""
        if not (pd.isna(existing) or is_blank):
            continue

        # Keywords from the abstract and the title are combined
        # (abstract first); neither source is a mere fallback.
        collected = []
        for source_text in (row["Abstract"], row["Title"]):
            extracted = extract_keywords(source_text) if pd.notna(source_text) else None
            if extracted:
                collected.extend(extracted.split(", "))

        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the output is identical from run to run (a set is not ordered).
        unique_keywords = list(dict.fromkeys(filter(None, collected)))
        df.at[index, "Keywords"] = ", ".join(unique_keywords) if unique_keywords else None

    df.to_csv(output_path, index=False)
    print(f"Updated file saved as {output_path}")

# Run the keyword fill on 'final_dataset.csv'; since no output_path is given,
# the result is written to process_csv's default output file.
process_csv('final_dataset.csv')