<a href="https://colab.research.google.com/github/lewiskhu/colabtools/blob/main/paper_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Step 1: Get all papers from Google Scholar (handles pagination)
def get_papers_from_scholar(user_id, delay=2):
    """Scrape the publication list of a Google Scholar profile.

    Pages through the profile's "list_works" view 100 entries at a time
    (sorted by publication date) and collects one record per listed paper.

    Args:
        user_id: The Scholar profile ID (the ``user=`` value in the profile URL).
        delay: Seconds to sleep between consecutive page requests.

    Returns:
        A list of dicts with the keys ``'title'`` and ``'journal'``. ``'journal'``
        is the text of the second ``.gs_gray`` element of the row, or ``''`` when
        the row has fewer than two such elements.

    Raises:
        requests.RequestException: If a request fails at the network level or
            exceeds the timeout.
    """
    page_size = 100
    base_url = "https://scholar.google.com/citations"
    headers = {"User-Agent": "Mozilla/5.0"}
    papers = []
    start = 0
    while True:
        # Passing the query as params lets requests URL-encode user_id.
        params = {
            "hl": "en",
            "user": user_id,
            "view_op": "list_works",
            "sortby": "pubdate",
            "cstart": start,
            "pagesize": page_size,
        }
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(base_url, params=params, headers=headers, timeout=30)
        if not response.ok:
            # An error/blocked page contains no result rows; without this check
            # it would be indistinguishable from the normal end of the list.
            print(
                f"Warning: Google Scholar returned HTTP {response.status_code} "
                f"at offset {start}. Results may be incomplete."
            )
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        rows = soup.select('.gsc_a_tr')

        page_papers = []
        for row in rows:
            title_element = row.select_one('.gsc_a_at')
            if title_element is None:
                # Not a publication row. NOTE(review): Scholar appears to render
                # a placeholder row without a title link once cstart is past the
                # last paper -- confirm against live markup.
                continue

            # The journal/conference is usually the second .gs_gray element.
            # NOTE(review): this text may also carry volume/pages/year, which
            # would defeat exact-name matching downstream -- verify.
            gs_grays = row.select('.gs_gray')
            journal = gs_grays[1].text if len(gs_grays) > 1 else ''

            page_papers.append({'title': title_element.text, 'journal': journal})

        if not page_papers:
            break
        papers.extend(page_papers)

        if len(rows) < page_size:
            # A short page is the last page; skip the extra request and sleep.
            break
        start += page_size
        time.sleep(delay)  # Be polite to Google Scholar
    return papers

# Step 2: Classify a journal against pre-loaded SCI/SSCI/AHCI name lists (e.g. local CSVs exported from Clarivate MJL)
def is_indexed(journal_name, sci_journals, ssci_journals, ahci_journals):
    """Return the citation-index label for a journal name.

    The name is stripped of surrounding whitespace and lower-cased, then looked
    up in the SCI, SSCI and AHCI collections in that order; the first
    collection containing it decides the label.

    Returns:
        ``"SCI"``, ``"SSCI"`` or ``"AHCI"`` for a match, otherwise
        ``"Not Indexed"``.
    """
    key = journal_name.strip().lower()
    # Order matters: a journal present in several lists gets the first label.
    lookup_order = (
        ("SCI", sci_journals),
        ("SSCI", ssci_journals),
        ("AHCI", ahci_journals),
    )
    return next(
        (label for label, names in lookup_order if key in names),
        "Not Indexed",
    )

# Utility to load journal lists (CSV with journal names, one per line)
def load_journal_list(path):
    """Load journal names from the first column of a CSV file.

    Each name is stripped of surrounding whitespace and lower-cased. Empty
    lines and rows whose first cell is blank are skipped, so the result never
    contains an empty name.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A set of normalized journal names. If the file does not exist, a
        warning is printed and an empty set is returned so the rest of the
        code can still run.
    """
    journals = set()
    try:
        # utf-8-sig drops a leading byte-order mark (common in spreadsheet
        # exports) that would otherwise stick to the first journal name;
        # newline='' is what the csv module expects for files it reads.
        with open(path, encoding='utf-8-sig', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    continue
                name = row[0].strip().lower()
                if name:
                    journals.add(name)
    except FileNotFoundError:
        print(f"Warning: Journal list file not found at {path}. Using empty list.")
        return set()
    return journals

def main():
    """Scrape a Scholar profile and write each paper's index status to CSV.

    Loads the SCI, SSCI and AHCI journal lists from their CSV files, fetches
    the profile's papers, then writes ``papers_with_index.csv`` with the
    columns Title, Journal and Index, echoing every row to stdout.
    """
    user_id = "N4K7_bkAAAAJ"  # Replace with your Scholar user ID

    # Loaded in SCI, SSCI, AHCI order.
    sci_journals, ssci_journals, ahci_journals = (
        load_journal_list(list_path)
        for list_path in ("sci_journals.csv", "ssci_journals.csv", "ahci_journals.csv")
    )

    scraped = get_papers_from_scholar(user_id)
    with open("papers_with_index.csv", "w", newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(["Title", "Journal", "Index"])
        for entry in scraped:
            title = entry['title']
            venue = entry['journal']
            status = is_indexed(venue, sci_journals, ssci_journals, ahci_journals)
            out.writerow([title, venue, status])
            print(f"{title} | {venue} | {status}")


if __name__ == "__main__":
    main()

In [None]:
# Create placeholder CSV files for the journal lists if they do not exist yet.
# Replace these with your actual journal lists.
# Append mode creates a missing file but leaves existing content untouched,
# so re-running this cell never wipes journal lists that are already in place.
for placeholder_path in ('sci_journals.csv', 'ssci_journals.csv', 'ahci_journals.csv'):
    with open(placeholder_path, 'a'):
        pass

print("Placeholder journal list files created.")