In [5]:
import pandas as pd
import requests
from google.colab import files

# Search configuration
PUBLICATION_NAME = "Machine Learning AND choice major careers"  # title query text
MAX_RESULTS = 9_000  # total number of desired results
RESULTS_PER_PAGE = 100  # number of results per page

# Functions to search and extract information from CrossRef with pagination
def _first_title(publication):
    """Return the first title of a CrossRef work, or None when it has none."""
    # "title" can be absent or an empty list, so indexing [0] blindly is unsafe.
    titles = publication.get("title") or []
    return titles[0] if titles else None


def _format_authors(publication):
    """Return the work's authors as a comma-separated string ("" when none)."""
    names = []
    for author in publication.get("author") or []:
        # Drop missing/empty parts so a lone family name carries no stray space.
        full_name = " ".join(
            part for part in (author.get("given"), author.get("family")) if part
        )
        # Entries without given/family parts (e.g. organizations) may expose a
        # single "name" field instead; use it when available.
        full_name = full_name or author.get("name") or ""
        if full_name:
            names.append(full_name)
    return ", ".join(names)


def _publication_year(publication):
    """Return the print publication year, else the online one, else None."""
    for key in ("published-print", "published-online"):
        date_parts = (publication.get(key) or {}).get("date-parts") or [[None]]
        first_date = date_parts[0]
        if first_date and first_date[0]:
            return first_date[0]
    return None


def search_publication_crossref(publication_name, max_results=100, results_per_page=100):
    """Search CrossRef works by title and collect the fully described ones.

    Pages through ``https://api.crossref.org/works`` using ``rows``/``offset``
    until ``max_results`` records have been requested or a page comes back
    empty. A record is kept only if it has a title, a URL, at least one
    author name and an abstract.

    Args:
        publication_name: Text sent as the ``query.title`` parameter.
        max_results: Maximum number of records to request in total (before
            filtering). Values <= 0 perform no request.
        results_per_page: Number of records requested per call.

    Returns:
        A list of dicts with the keys "Title", "Authors", "Link", "Abstract"
        and "Publication Year" (the year may be None). If a request fails,
        an error is printed and the records gathered so far are returned.
    """
    url = "https://api.crossref.org/works"
    request_timeout = 30  # seconds; prevents a stalled connection from hanging
    info = []
    start = 0

    while start < max_results:
        # Never ask for more than what is left to reach max_results.
        rows = min(results_per_page, max_results - start)
        # Passing params lets requests URL-encode the query text safely.
        params = {"query.title": publication_name, "rows": rows, "offset": start}

        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Error with request: {exc}")
            break

        if response.status_code != 200:
            print(f"Error with request: {response.status_code}")
            break

        publications = response.json().get("message", {}).get("items", [])
        if not publications:
            break

        for publication in publications:
            title = _first_title(publication)
            link = publication.get("URL")
            authors = _format_authors(publication)
            abstract = publication.get("abstract")

            # Add only if all required fields are present
            if title is None or link is None or not authors or not abstract:
                continue

            info.append({
                "Title": title,
                "Authors": authors,
                "Link": link,
                "Abstract": abstract,
                "Publication Year": _publication_year(publication),
            })

        start += results_per_page

    return info

# Run the CrossRef search with the configured query and paging limits
publication_info = search_publication_crossref(
    PUBLICATION_NAME,
    max_results=MAX_RESULTS,
    results_per_page=RESULTS_PER_PAGE,
)

# Build a table with one row per retained publication
df = pd.DataFrame(publication_info)

# Show the table in the notebook output
print(df)

# Write the table to an Excel workbook, leaving out the row index
output_file = "publication_results.xlsx"
df.to_excel(output_file, index=False)

# Download the workbook through the Colab files helper
files.download(output_file)


                                                  Title  \
0     Predicting STEM Major Choice: a Machine Learni...   
1       Conventional Machine Learning for Social Choice   
2        Machine learning for product choice prediction   
3     Design choice and machine learning model perfo...   
4           No free theory choice from machine learning   
...                                                 ...   
1791  Volcanic Ash Classification Through Machine Le...   
1792  PARKINSON’S DISEASE PREDICTION USING MACHINE L...   
1793  Machine learning using magnetic stochastic syn...   
1794            Machine Learning for Internet of Things   
1795  Diabetes Mellitus Prediction using Machine Lea...   

                                                Authors  \
0     Chi-Ning Chang, Shuqiong Lin, Oi-Man Kwok, Gua...   
1               John Doucette, Kate Larson, Robin Cohen   
2                              Josué Martínez-Garmendia   
3     Rosa Arboretti, Riccardo Ceccato, Luca Pegorar...

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
/content/drive/MyDrive/Colab Notebooks/Web_Scraper.ipynb