In [1]:
import requests
import json
import os
import time
import threading
import gzip
import shutil

In [None]:
# The API key is read from the S2_API_KEY environment variable so that it is
# never stored in the notebook. An empty string is sent when the variable is unset.
headers = {
    'x-api-key': os.environ.get('S2_API_KEY', ''),
    'Accept': 'application/json'
}

# Fetch the metadata of the 2024-10-08 release and pretty-print it.
release_reply = requests.get(
    'https://api.semanticscholar.org/datasets/v1/release/2024-10-08',
    headers=headers,
    timeout=30,  # do not hang the kernel forever on a stalled connection
)
release_reply.raise_for_status()  # surface HTTP errors instead of printing an error payload
r1 = release_reply.json()
print(json.dumps(r1, indent=2))

In [5]:
# Single-file smoke test: stream one gzip archive and store it decompressed.
# NOTE(review): `files` is not assigned in any cell shown before this one —
# presumably a list of download URLs left over from an earlier run; confirm
# before executing on a fresh kernel.

file_url = files[0]
dataset = '2024-10-8'
file_name = 'temp_citations.json'
print(file_name)

with requests.get(file_url, headers=headers, stream=True) as r:
    r.raise_for_status()
    # gzip wraps the raw response stream, so bytes are decompressed while
    # they are being copied into the output file.
    with gzip.open(r.raw, 'rb') as gz_stream, open(file_name, 'wb') as sink:
        shutil.copyfileobj(gz_stream, sink)


temp_citations.json


In [None]:
# Download every URL in `files` one after another, decompressing each gzip
# stream to '<dataset>_file<idx>.json' in the current directory.
# NOTE(review): `files` and `headers` are expected to exist from other cells — confirm.

dataset = '2024-10-8'

# Rate limiting parameters
RATE_LIMIT = 1  # Max requests per second
MIN_REQUEST_INTERVAL = 1.0 / RATE_LIMIT  # Minimum interval between requests in seconds
last_request_time = 0  # Time of the last request

for idx, file_url in enumerate(files):
    file_name = f'{dataset}_file{idx}.json'
    # Data is written here first and renamed only after a complete download,
    # so file_name never refers to a truncated file.
    partial_name = file_name + '.part'
    print(f"Downloading file {idx + 1}/{len(files)}: {file_url}")

    success = False
    retries = 0
    max_retries = 5  # Maximum number of attempts for each file
    delay = 1  # Initial delay for exponential backoff

    while not success and retries < max_retries:
        # Rate limiting: keep at least MIN_REQUEST_INTERVAL seconds between requests
        current_time = time.time()
        elapsed = current_time - last_request_time
        if elapsed < MIN_REQUEST_INTERVAL:
            sleep_time = MIN_REQUEST_INTERVAL - elapsed
            print(f"Rate limit enforced. Sleeping for {sleep_time:.2f} seconds.")
            time.sleep(sleep_time)
        last_request_time = time.time()  # Update last request time

        try:
            with requests.get(file_url, headers=headers, stream=True) as r:
                r.raise_for_status()
                with gzip.open(r.raw, 'rb') as decompressed_stream:
                    with open(partial_name, 'wb') as out_file:
                        shutil.copyfileobj(decompressed_stream, out_file)
            os.replace(partial_name, file_name)  # publish the finished file in one step
            success = True  # Download succeeded
            print(f"Successfully downloaded {file_name}.")
        except Exception as e:
            # Discard whatever was written before the failure.
            if os.path.exists(partial_name):
                os.remove(partial_name)
            print(f"Error downloading {file_url}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff: double the delay
            else:
                print("Max retries reached. Skipping this file.")
    if not success:
        print(f"Failed to download {file_url} after {max_retries} attempts.")


In [25]:
import re

# Keyword phrases of interest and the abstracts shard used by default in this cell.
keywords = [
    'nlp',
    'large language model',
    'language model',
    'llm',
    'large language models',
    'language models',
    'llms',
]
file_path = '2024_10_8/abstracts/abstracts_file0.json'

def load_file_line_by_line(file_path):
    '''
    Lazily yield one decoded JSON value per non-blank line of a text file.

    Inputs:
    file_path: path of a file holding one JSON document per line
    return a generator; the file stays open only while the generator is consumed

    Raises json.JSONDecodeError when a non-blank line is not valid JSON.
    '''
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank or trailing empty lines carry no record
            yield json.loads(line)  # Converts each line from JSON string to a Python object

def return_matching_dict(file_path, corpusid):
    """
    Scan the file record by record and return the first one whose
    'corpusid' field equals the requested corpusid; None when no record matches.
    """
    records = load_file_line_by_line(file_path)
    # next() stops reading as soon as the first match is found.
    return next(
        (record for record in records if record['corpusid'] == corpusid),
        None,
    )

def check_abstract(abstract, keywords):
    """
    Inputs:
    abstract: text of abstract (None or empty text never matches)
    keywords: an iterable of keywords; empty entries are ignored
    return whether the abstract contains at least one keyword as a whole
    word/phrase, compared case-insensitively
    """
    if not abstract or not keywords:
        return False

    # Empty keywords are dropped: an empty alternative would match at every
    # word boundary and make any text look like a hit.
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords if keyword)
    if not alternatives:
        return False

    pattern = r'\b(?:' + alternatives + r')\b'
    return bool(re.search(pattern, abstract, re.IGNORECASE))



In [None]:
import json

# Pretty-print the first few records of a JSON Lines file for inspection.
file_path = 'temp_embedding_v1.json'  # Replace with your file path
max_preview_lines = 10  # number of lines to inspect, whether or not they parse
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # The cap is checked before parsing so that a parse error can never
        # skip it and let the loop run through the rest of the file.
        if line_number > max_preview_lines:
            break
        try:
            data = json.loads(line)
            # Print the entire JSON object
            print(f"Line {line_number}: {json.dumps(data, indent=2)}\n")
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON on line {line_number}: {e}")

In [1]:
import time
import os  # Added to handle file paths
import requests
import shutil
import gzip
import itertools
from concurrent.futures import ThreadPoolExecutor, as_completed

# Constants
dataset = '2024-10-8'
# The API key is read from the S2_API_KEY environment variable so that it is
# never stored in the notebook. An empty string is sent when the variable is unset.
headers = {
    'x-api-key': os.environ.get('S2_API_KEY', ''),
    'Accept': 'application/json'
}
RATE_LIMIT = 1  # Max requests per second
MIN_REQUEST_INTERVAL = 1.0 / RATE_LIMIT  # Minimum interval between requests in seconds

# Shared counter that hands out file indices; download_file calls next() on it
# from the worker threads.
index_counter = itertools.count()

# File download function
def download_file(file_url, dataset, headers, folder_path):
    """
    Download one gzip-compressed file and store it decompressed.

    Inputs:
    file_url: URL to fetch
    dataset: prefix used in the output name '<dataset>_file<idx>.json'
    headers: HTTP headers sent with the request
    folder_path: existing directory that receives the file

    The index comes from the module-level index_counter, so numbering follows
    the order in which calls start, not the position of the URL in any list.
    Up to 5 attempts are made, sleeping 1, 2, 4 and 8 seconds between them.

    return True when the file was downloaded, False after the last failed attempt
    """
    # Use the next value of the shared counter as the index for this file
    idx = next(index_counter)
    file_name = f'{dataset}_file{idx}.json'

    # Create the full file path in the specified folder
    file_path = os.path.join(folder_path, file_name)
    # Data goes to a temporary name first and is renamed only when complete,
    # so file_path never refers to a truncated download.
    partial_path = file_path + '.part'

    print(f"Downloading file {idx + 1}: {file_url}")

    success = False
    retries = 0
    max_retries = 5  # Maximum number of attempts for each file
    delay = 1  # Initial delay for exponential backoff
    # NOTE(review): this timestamp is local to each call, so the interval check
    # below cannot space out requests issued by other calls/threads.
    last_request_time = 0  # Time of the last request

    while not success and retries < max_retries:
        current_time = time.time()
        elapsed = current_time - last_request_time
        if elapsed < MIN_REQUEST_INTERVAL:
            sleep_time = MIN_REQUEST_INTERVAL - elapsed
            print(f"Rate limit enforced. Sleeping for {sleep_time:.2f} seconds.")
            time.sleep(sleep_time)
        last_request_time = time.time()  # Update last request time

        try:
            with requests.get(file_url, headers=headers, stream=True) as r:
                r.raise_for_status()  # Raise an exception if the download failed
                with gzip.open(r.raw, 'rb') as decompressed_stream:
                    with open(partial_path, 'wb') as out_file:
                        shutil.copyfileobj(decompressed_stream, out_file)
            os.replace(partial_path, file_path)  # publish the finished file in one step
            success = True  # Download succeeded
            print(f"Successfully downloaded {file_path}.")
        except Exception as e:
            # Discard whatever was written before the failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Error downloading {file_url}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff: double the delay
            else:
                print("Max retries reached. Skipping this file.")
    return success

# Download files with multi-threading
def download_all_files_concurrently(files, dataset, headers, folder_path):
    """
    Run download_file for every URL in files on a pool of 5 worker threads
    and print a message for each URL that fails or raises.
    """
    worker_count = 5  # Adjust based on your system's capabilities
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        # Remember which URL each submitted job belongs to.
        url_by_job = {}
        for file_url in files:
            job = pool.submit(download_file, file_url, dataset, headers, folder_path)
            url_by_job[job] = file_url

        # Report outcomes in completion order.
        for job in as_completed(url_by_job):
            source_url = url_by_job[job]
            try:
                if not job.result():
                    print(f"Failed to download: {source_url}")
            except Exception as e:
                print(f"Error with future {source_url}: {e}")

def download_multiple_categories(categories, headers, base_folder):
    """
    Download several categories in parallel, one worker thread per category.

    Inputs:
    categories: dict mapping a category name to its list of file URLs
    headers: HTTP headers passed on to download_category
    base_folder: directory passed on to download_category

    return None; progress and errors are printed. Nothing is done when
    categories is empty.
    """
    if not categories:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to schedule.
        return

    with ThreadPoolExecutor(max_workers=len(categories)) as executor:  # One thread per category
        future_to_category = {
            executor.submit(download_category, category_name, urls, headers, base_folder): category_name
            for category_name, urls in categories.items()
        }

        for future in as_completed(future_to_category):
            category = future_to_category[future]
            try:
                future.result()  # Wait for the category download to complete
                # Printed whenever download_category returned without raising.
                print(f"Category '{category}' downloaded successfully.")
            except Exception as e:
                print(f"Error downloading category '{category}': {e}")

# Function to download all files in a category
def download_category(category, urls, headers, base_folder):
    """
    Download all files of one category into '<base_folder>/<category>'.

    Inputs:
    category: category name; also used as the file-name prefix
    urls: list of file URLs belonging to the category
    headers: HTTP headers passed on to the downloader
    base_folder: parent directory of the category folder
    """
    dataset = category  # Category can be used as dataset name
    folder_path = os.path.join(base_folder, category)

    # exist_ok makes creation a single call instead of a separate check
    # followed by makedirs, so an existing folder is never an error.
    os.makedirs(folder_path, exist_ok=True)

    print(f"Downloading category: {category} with {len(urls)} files.")
    download_all_files_concurrently(urls, dataset, headers, folder_path)


In [10]:
import time
import requests

# Constants
# The API key is read from the S2_API_KEY environment variable so that it is
# never stored in the notebook. An empty string is sent when the variable is unset.
headers = {
    'x-api-key': os.environ.get('S2_API_KEY', ''),
    'Accept': 'application/json'
}

# List of dataset endpoints, built from the release URL and each dataset's path segment
release_url = 'https://api.semanticscholar.org/datasets/v1/release/2024-10-08'
dataset_slugs = [
    'abstracts',
    'authors',
    'citations',
    'embeddings-specter_v1',
    'embeddings-specter_v2',
    'paper-ids',
    'papers',
    'publication-venues',
    's2orc',
    'tldrs',
]
endpoints = [f'{release_url}/dataset/{slug}' for slug in dataset_slugs]

# Dictionary to store the results for each dataset
categories = {}
category_names = ['abstracts', 'authors', 'citations', 'embeddings_v1', 'embeddings_v2', 'paper_ids', 'papers', 'publication_venues', 's2orc', 'tldrs']

# Loop through each endpoint, making requests with a 2-second delay
for idx, endpoint in enumerate(endpoints):
    try:
        reply = requests.get(endpoint, headers=headers, timeout=30)
        # Without this check an error payload would be parsed as JSON and
        # recorded as a successful fetch with an empty file list.
        reply.raise_for_status()
        response = reply.json()
        categories[category_names[idx]] = response.get('files', [])
        print(f"Successfully fetched data from {endpoint}")
    except Exception as e:
        print(f"Error fetching data from {endpoint}: {e}")

    # Wait for 2 seconds before the next request, except for the last iteration
    if idx < len(endpoints) - 1:
        time.sleep(2)


Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/abstracts
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/authors
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/citations
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/embeddings-specter_v1
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/embeddings-specter_v2
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/paper-ids
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/papers
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/release/2024-10-08/dataset/publication-venues
Successfully fetched data from https://api.semanticscholar.org/datasets/v1/r

In [11]:
# Show how many file URLs were collected for each category.
for category_name, url in categories.items():
    file_count = len(url)  # `url` holds the category's list of file URLs
    print(f'{category_name}: {file_count}')

abstracts: 60
authors: 30
citations: 215
embeddings_v1: 941
embeddings_v2: 913
paper_ids: 30
papers: 60
publication_venues: 1
s2orc: 277
tldrs: 30


In [12]:
# Create one output folder per category under the release folder.
folder_path1 = '2024_10_8'
for category, url in categories.items():
    folder_path = os.path.join(folder_path1, category)
    # exist_ok replaces the separate existence check, so re-running the cell is safe.
    os.makedirs(folder_path, exist_ok=True)

In [None]:
import time
import os  # Added to handle file paths
import requests
import shutil
import gzip
import itertools
from concurrent.futures import ThreadPoolExecutor, as_completed


# Constants
# The API key is read from the S2_API_KEY environment variable so that it is
# never stored in the notebook. An empty string is sent when the variable is unset.
headers = {
    'x-api-key': os.environ.get('S2_API_KEY', ''),
    'Accept': 'application/json'
}

# List of dataset endpoints, built from the release URL and each dataset's path segment
release_url = 'https://api.semanticscholar.org/datasets/v1/release/2024-10-08'
dataset_slugs = [
    'abstracts',
    'authors',
    'citations',
    'embeddings-specter_v1',
    'embeddings-specter_v2',
    'paper-ids',
    'papers',
    'publication-venues',
    's2orc',
    'tldrs',
]
endpoints = [f'{release_url}/dataset/{slug}' for slug in dataset_slugs]

# Dictionary to store the results for each dataset
categories = {}
category_names = ['abstracts', 'authors', 'citations', 'embeddings_v1', 'embeddings_v2', 'paper_ids', 'papers', 'publication_venues', 's2orc', 'tldrs']

# Loop through each endpoint, making requests with a 2-second delay
for idx, endpoint in enumerate(endpoints):
    try:
        reply = requests.get(endpoint, headers=headers, timeout=30)
        # Without this check an error payload would be parsed as JSON and
        # recorded as a successful fetch with an empty file list.
        reply.raise_for_status()
        response = reply.json()
        categories[category_names[idx]] = response.get('files', [])
        print(f"Successfully fetched data from {endpoint}")
    except Exception as e:
        print(f"Error fetching data from {endpoint}: {e}")

    # Wait for 2 seconds before the next request, except for the last iteration
    if idx < len(endpoints) - 1:
        time.sleep(2)

# Show how many file URLs were collected for each category.
for category_name, url in categories.items():
    print(f'{category_name}: {len(url)}')

RATE_LIMIT = 1  # Max requests per second
MIN_REQUEST_INTERVAL = 1.0 / RATE_LIMIT  # Minimum interval between requests in seconds


# File download function
def download_file(file_url, dataset, headers, folder_path, index_counter):
    """
    Download one gzip-compressed file and store it decompressed.

    Inputs:
    file_url: URL to fetch
    dataset: prefix used in the output name '<dataset>_file<idx>.json'
    headers: HTTP headers sent with the request
    folder_path: existing directory that receives the file
    index_counter: iterator whose next value becomes the file index

    The index is taken when the call starts, so numbering follows the order in
    which calls start, not the position of the URL in any list.
    Up to 5 attempts are made, sleeping 1, 2, 4 and 8 seconds between them.

    return True when the file was downloaded, False after the last failed attempt
    """
    # Use the next value of the shared counter as the index for this file
    idx = next(index_counter)
    file_name = f'{dataset}_file{idx}.json'

    # Create the full file path in the specified folder
    file_path = os.path.join(folder_path, file_name)
    # Data goes to a temporary name first and is renamed only when complete,
    # so file_path never refers to a truncated download.
    partial_path = file_path + '.part'

    print(f"Downloading file {idx + 1}: {file_url}")

    success = False
    retries = 0
    max_retries = 5  # Maximum number of attempts for each file
    delay = 1  # Initial delay for exponential backoff
    # NOTE(review): this timestamp is local to each call, so the interval check
    # below cannot space out requests issued by other calls/threads.
    last_request_time = 0  # Time of the last request

    while not success and retries < max_retries:
        current_time = time.time()
        elapsed = current_time - last_request_time
        if elapsed < MIN_REQUEST_INTERVAL:
            sleep_time = MIN_REQUEST_INTERVAL - elapsed
            print(f"Rate limit enforced. Sleeping for {sleep_time:.2f} seconds.")
            time.sleep(sleep_time)
        last_request_time = time.time()  # Update last request time

        try:
            with requests.get(file_url, headers=headers, stream=True) as r:
                r.raise_for_status()  # Raise an exception if the download failed
                with gzip.open(r.raw, 'rb') as decompressed_stream:
                    with open(partial_path, 'wb') as out_file:
                        shutil.copyfileobj(decompressed_stream, out_file)
            os.replace(partial_path, file_path)  # publish the finished file in one step
            success = True  # Download succeeded
            print(f"Successfully downloaded {file_path}.")
        except Exception as e:
            # Discard whatever was written before the failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Error downloading {file_url}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff: double the delay
            else:
                print("Max retries reached. Skipping this file.")
    return success

# Download files with multi-threading
def download_all_files_concurrently(files, dataset, headers, folder_path, index_counter, max_workers=50):
    """
    Run download_file for every URL in files on a thread pool.

    Inputs:
    files: iterable of file URLs
    dataset: file-name prefix passed on to download_file
    headers: HTTP headers passed on to download_file
    folder_path: output directory passed on to download_file
    index_counter: shared index iterator passed on to download_file
    max_workers: size of the thread pool (defaults to the previous fixed value, 50)

    return the list of URLs whose download failed or raised, in completion
    order; empty when everything succeeded
    """
    failed_urls = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(download_file, file_url, dataset, headers, folder_path, index_counter): file_url for file_url in files}

        for future in as_completed(future_to_file):
            file_url = future_to_file[future]
            try:
                success = future.result()
            except Exception as e:
                print(f"Error with future {file_url}: {e}")
                failed_urls.append(file_url)
                continue
            if not success:
                print(f"Failed to download: {file_url}")
                failed_urls.append(file_url)
    # Returned so callers can retry or report without scanning the printed log.
    return failed_urls


# Download every category into its own folder under the release folder.
folder_path1 = '2024_10_8'
for category, url in categories.items():
    folder_path = os.path.join(folder_path1, category)
    # exist_ok replaces the separate existence check, so re-running the cell is safe.
    os.makedirs(folder_path, exist_ok=True)

    # Fresh counter per category so file indices restart at 0 in each folder
    index_counter = itertools.count()
    download_all_files_concurrently(url, category, headers, folder_path, index_counter)
