In [3]:
import pandas as pd
import requests
import matplotlib.pyplot as plt

# Fetch JSON on update-nature
def fetch_update_nature(url, timeout=30):
    """Fetch the Crossref Labs update entries for a single work.

    Args:
        url: Full request URL of the work record.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` can block forever on a stalled connection and
            freeze the surrounding per-DOI loop.

    Returns:
        The list stored under ``message -> cr-labs-updates`` in the JSON
        response. An empty list is returned when the request fails, the
        status code is not 200, the body is not valid JSON, or the key is
        missing or null.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Handle network failures like any other unsuccessful lookup so that
        # one bad request does not abort a run over many DOIs.
        print(f"Request failed for {url}: {exc}")
        return []
    if response.status_code != 200:
        return []
    try:
        json_data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return []
    # `or` also covers explicit nulls in the payload.
    return (json_data.get('message') or {}).get('cr-labs-updates') or []

# Load the publication metadata (row count is capped via read_lines)
read_lines = 47018  # for testing
csv_filename = "OpenAlex_retractions.csv"
df = pd.read_csv(csv_filename, nrows=read_lines)

# Query Crossref once per DOI and keep the 'update-nature' of every update returned
update_nature_list = [
    update.get('update-nature')
    for doi in df['doi']
    for update in fetch_update_nature(
        f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu"
    )
]

# Count the collected values and draw a bar chart, or report that nothing was found
if not update_nature_list:
    print("No data available for plotting.")
else:
    update_nature_df = pd.DataFrame({'update-nature': update_nature_list})
    update_nature_counts = update_nature_df['update-nature'].value_counts()

    # Bar chart of how often each update-nature value occurs
    update_nature_counts.plot(kind='bar')
    plt.xticks(rotation=45)
    plt.ylabel('Counts')
    plt.xlabel('Update Nature')
    plt.title('Counts of Update Nature')
    plt.show()


KeyboardInterrupt: 

In [2]:
# Print the metadata frame as plain text.
# NOTE(review): relies on `df` already existing in the kernel from another cell — confirm execution order.
print(df)

                                 id  \
0  https://openalex.org/W3001195213   
1  https://openalex.org/W1974047233   
2  https://openalex.org/W2118796952   
3  https://openalex.org/W3010930696   
4  https://openalex.org/W2166000498   

                                        display_name publication_date  \
0  Detection of 2019 novel coronavirus (2019-nCoV...       2020-01-23   
1  DNA methylation age of human tissues and cell ...       2013-01-01   
2  Chimeric Antigen Receptor T Cells for Sustaine...       2014-10-16   
3  Hydroxychloroquine and azithromycin as a treat...       2020-07-01   
4  Primary Prevention of Cardiovascular Disease w...       2013-04-04   

                primary_location_id  \
0    https://openalex.org/S50690046   
1  https://openalex.org/S4210206758   
2    https://openalex.org/S62468778   
3     https://openalex.org/S5248501   
4    https://openalex.org/S62468778   

                   primary_location_display_name  \
0                               Eurosur

In [None]:
import pandas as pd
import requests
import matplotlib.pyplot as plt

# Fetch JSON on update-nature
def fetch_update_nature(url, timeout=30):
    """Return the ``cr-labs-updates`` entries found at a Crossref Labs URL.

    Args:
        url: Full request URL of the work record.
        timeout: Seconds to wait for the server. A request without a timeout
            may never return if the connection stalls, which would block the
            whole chunk being processed.

    Returns:
        The list under ``message -> cr-labs-updates``. An empty list is
        returned on a failed request, a non-200 status, an invalid JSON body,
        or a missing or null key.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A failed request counts as "no updates" so the run can continue.
        print(f"Request failed for {url}: {exc}")
        return []
    if response.status_code != 200:
        return []
    try:
        json_data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return []
    # `or` also covers explicit nulls in the payload.
    return (json_data.get('message') or {}).get('cr-labs-updates') or []

# Function to process data in chunks
def process_data_chunk(df_chunk):
    """Collect the 'update-nature' values for every DOI in one chunk.

    Args:
        df_chunk: DataFrame chunk with a 'doi' column.

    Returns:
        A flat list with one entry per update returned by
        ``fetch_update_nature``, in DOI order. An entry is ``None`` when the
        update has no 'update-nature' key.
    """
    return [
        update.get('update-nature')
        for doi in df_chunk['doi']
        for update in fetch_update_nature(
            f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu"
        )
    ]

# Read CSV with publication metadata in chunks of 1000 lines
def read_csv_in_chunks(csv_filename, chunksize=1000):
    """Stream the CSV chunk by chunk and draw one bar chart per chunk.

    Each chunk of ``chunksize`` rows is handed to ``process_data_chunk``. The
    returned update-nature values are counted and plotted. A chunk that
    yields no values prints a notice instead of a chart.

    Args:
        csv_filename: Path of the CSV file to read.
        chunksize: Number of rows per chunk.
    """
    for chunk in pd.read_csv(csv_filename, chunksize=chunksize):
        update_nature_list = process_data_chunk(chunk)
        if not update_nature_list:
            print("No data available for plotting.")
            continue

        # Frequency of each distinct update-nature value within this chunk
        update_nature_df = pd.DataFrame({'update-nature': update_nature_list})
        update_nature_counts = update_nature_df['update-nature'].value_counts()

        update_nature_counts.plot(kind='bar')
        plt.xticks(rotation=45)
        plt.ylabel('Counts')
        plt.xlabel('Update Nature')
        plt.title('Counts of Update Nature')
        plt.show()


In [None]:
# Main function to call read_csv_in_chunks
def main():
    """Run the chunked update-nature analysis on OpenAlex_retractions.csv."""
    read_csv_in_chunks("OpenAlex_retractions.csv")


# Run the analysis when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [4]:
import pandas as pd
import requests
import matplotlib.pyplot as plt

# Fetch JSON on update-nature
def fetch_update_nature(url, timeout=30):
    """Look up one work on the Crossref Labs API and return its update entries.

    Args:
        url: Full request URL of the work record.
        timeout: Seconds to wait for the server. Without it a stalled
            connection would block the progress loop indefinitely.

    Returns:
        The list under ``message -> cr-labs-updates``. An empty list is
        returned if the request fails, the status is not 200, the body is not
        valid JSON, or the key is missing or null.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Keep going on network errors; the caller treats [] as "no updates".
        print(f"Request failed for {url}: {exc}")
        return []
    if response.status_code != 200:
        return []
    try:
        json_data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return []
    # `or` also covers explicit nulls in the payload.
    return (json_data.get('message') or {}).get('cr-labs-updates') or []

# Function to process data in chunks
def process_data_chunk(df_chunk, total_lines):
    """Collect 'update-nature' values for one chunk, printing progress per DOI.

    Args:
        df_chunk: DataFrame chunk with a 'doi' column.
        total_lines: Number of rows handled in earlier chunks. It is used
            only for the running totals in the progress message.

    Returns:
        A flat list with one 'update-nature' entry per update returned by
        ``fetch_update_nature``. An entry is ``None`` when the key is absent.
    """
    chunk_len = len(df_chunk)
    update_nature_list = []
    for position, doi in enumerate(df_chunk['doi'], start=1):
        updates = fetch_update_nature(
            f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu"
        )
        update_nature_list.extend(update.get('update-nature') for update in updates)
        # Progress: position within this chunk, then the running total so far
        print(f"Processed {position}/{chunk_len} lines. Total processed: {position + total_lines}/{total_lines + chunk_len}.")
    return update_nature_list

# Read CSV with publication metadata in chunks of 1000 lines
def read_csv_in_chunks(csv_filename, chunksize=1000):
    """Stream the CSV chunk by chunk and chart the update-nature counts per chunk.

    The number of rows already handled is tracked and passed to
    ``process_data_chunk`` so its progress output can show a running total.
    A chunk that yields no values prints a notice instead of a chart.

    Args:
        csv_filename: Path of the CSV file to read.
        chunksize: Number of rows per chunk.
    """
    rows_done = 0
    for chunk in pd.read_csv(csv_filename, chunksize=chunksize):
        update_nature_list = process_data_chunk(chunk, rows_done)
        rows_done += len(chunk)
        if not update_nature_list:
            print("No data available for plotting.")
            continue

        # Frequency of each distinct update-nature value within this chunk
        update_nature_df = pd.DataFrame({'update-nature': update_nature_list})
        update_nature_counts = update_nature_df['update-nature'].value_counts()

        update_nature_counts.plot(kind='bar')
        plt.xticks(rotation=45)
        plt.ylabel('Counts')
        plt.xlabel('Update Nature')
        plt.title('Counts of Update Nature')
        plt.show()

# Main function to call read_csv_in_chunks
def main():
    """Run the chunked update-nature analysis on OpenAlex_retractions.csv."""
    read_csv_in_chunks("OpenAlex_retractions.csv")


# Run the analysis when this code is executed as the main module.
if __name__ == "__main__":
    main()


Processed 1/1000 lines. Total processed: 1/1000.
Processed 2/1000 lines. Total processed: 2/1000.
Processed 3/1000 lines. Total processed: 3/1000.
Processed 4/1000 lines. Total processed: 4/1000.
Processed 5/1000 lines. Total processed: 5/1000.
Processed 6/1000 lines. Total processed: 6/1000.
Processed 7/1000 lines. Total processed: 7/1000.
Processed 8/1000 lines. Total processed: 8/1000.
Processed 9/1000 lines. Total processed: 9/1000.
Processed 10/1000 lines. Total processed: 10/1000.
Processed 11/1000 lines. Total processed: 11/1000.
Processed 12/1000 lines. Total processed: 12/1000.
Processed 13/1000 lines. Total processed: 13/1000.
Processed 14/1000 lines. Total processed: 14/1000.
Processed 15/1000 lines. Total processed: 15/1000.
Processed 16/1000 lines. Total processed: 16/1000.
Processed 17/1000 lines. Total processed: 17/1000.
Processed 18/1000 lines. Total processed: 18/1000.
Processed 19/1000 lines. Total processed: 19/1000.
Processed 20/1000 lines. Total processed: 20/1000

KeyboardInterrupt: 

In [6]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor

# Fetch JSON on update-nature
def fetch_update_nature(doi, timeout=30):
    """Fetch the Crossref Labs update entries for one DOI.

    Args:
        doi: Value from the 'doi' column. It is inserted verbatim into the
            request URL.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would tie up a worker thread indefinitely.

    Returns:
        The list under ``message -> cr-labs-updates``. An empty list is
        returned if the request fails, the status is not 200, the body is not
        valid JSON, or the key is missing or null.
    """
    url = f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # An exception raised here would surface while iterating
        # executor.map() and abort the whole chunk, so report it and
        # carry on with an empty result instead.
        print(f"Request failed for {url}: {exc}")
        return []
    if response.status_code != 200:
        return []
    try:
        json_data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return []
    # `or` also covers explicit nulls in the payload.
    return (json_data.get('message') or {}).get('cr-labs-updates') or []

# Function to process data in chunks
def process_data_chunk(df_chunk):
    """Fetch the update entries for all DOIs of a chunk using a thread pool.

    Args:
        df_chunk: DataFrame chunk with a 'doi' column.

    Returns:
        A flat list of the raw update entries returned by
        ``fetch_update_nature``. These are the entries themselves, not their
        'update-nature' values. They are in DOI order because
        ``executor.map`` yields results in input order.
    """
    total_dois = len(df_chunk)
    update_nature_list = []
    with ThreadPoolExecutor(max_workers=20) as executor:  # Adjust max_workers as needed
        # Lookups run concurrently; results are consumed in input order
        fetched = executor.map(fetch_update_nature, df_chunk['doi'])
        for completed_dois, result in enumerate(fetched, start=1):
            # Falsy results contribute nothing
            update_nature_list.extend(result or [])
            print(f"Processed {completed_dois}/{total_dois} DOIs.")
    return update_nature_list

# Read CSV with publication metadata in chunks of 1000 lines
def read_csv_in_chunks(csv_filename, chunksize=1000):
    """Stream the CSV chunk by chunk and chart the update-nature counts per chunk.

    Each chunk is handed to ``process_data_chunk``. The 'update-nature'
    column built from its result is counted and plotted. A chunk that yields
    nothing prints a notice instead of a chart.

    Args:
        csv_filename: Path of the CSV file to read.
        chunksize: Number of rows per chunk.
    """
    for chunk in pd.read_csv(csv_filename, chunksize=chunksize):
        update_nature_list = process_data_chunk(chunk)
        if not update_nature_list:
            print("No data available for plotting.")
            continue

        # Restrict the frame built from the chunk's results to 'update-nature'
        update_nature_df = pd.DataFrame(update_nature_list, columns=['update-nature'])
        update_nature_counts = update_nature_df['update-nature'].value_counts()

        update_nature_counts.plot(kind='bar')
        plt.xticks(rotation=45)
        plt.ylabel('Counts')
        plt.xlabel('Update Nature')
        plt.title('Counts of Update Nature')
        plt.show()

# Main function to call read_csv_in_chunks
def main():
    """Run the chunked update-nature analysis on OpenAlex_retractions.csv."""
    read_csv_in_chunks("OpenAlex_retractions.csv")


# Run the analysis when this code is executed as the main module.
if __name__ == "__main__":
    main()


Processed 1/1000 DOIs.
Processed 2/1000 DOIs.
Processed 3/1000 DOIs.
Processed 4/1000 DOIs.
Processed 5/1000 DOIs.
Processed 6/1000 DOIs.
Processed 7/1000 DOIs.
Processed 8/1000 DOIs.
Processed 9/1000 DOIs.
Processed 10/1000 DOIs.
Processed 11/1000 DOIs.
Processed 12/1000 DOIs.
Processed 13/1000 DOIs.
Processed 14/1000 DOIs.
Processed 15/1000 DOIs.
Processed 16/1000 DOIs.
Processed 17/1000 DOIs.
Processed 18/1000 DOIs.
Processed 19/1000 DOIs.
Processed 20/1000 DOIs.
Processed 21/1000 DOIs.
Processed 22/1000 DOIs.
Processed 23/1000 DOIs.
Processed 24/1000 DOIs.
Processed 25/1000 DOIs.
Processed 26/1000 DOIs.
Processed 27/1000 DOIs.
Processed 28/1000 DOIs.
Processed 29/1000 DOIs.
Processed 30/1000 DOIs.
Processed 31/1000 DOIs.
Processed 32/1000 DOIs.
Processed 33/1000 DOIs.
Processed 34/1000 DOIs.
Processed 35/1000 DOIs.
Processed 36/1000 DOIs.
Processed 37/1000 DOIs.
Processed 38/1000 DOIs.
Processed 39/1000 DOIs.
Processed 40/1000 DOIs.
Processed 41/1000 DOIs.
Processed 42/1000 DOIs.
P

KeyboardInterrupt: 

In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt

# Fetch JSON on update-nature
def fetch_update_nature(urls, timeout=30):
    """Collect the 'update-nature' values for a batch of Crossref Labs URLs.

    Args:
        urls: Iterable of full request URLs, one per work.
        timeout: Seconds to wait for each response. Without a timeout a
            single stalled connection would block the whole batch.

    Returns:
        A flat list with one 'update-nature' entry per update found, in URL
        order. An entry is ``None`` when the update has no such key. URLs
        whose request fails, returns a non-200 status, or returns an invalid
        JSON body contribute nothing.
    """
    update_nature_list = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Skip this URL so one network error does not abort the batch.
            print(f"Request failed for {url}: {exc}")
            continue
        if response.status_code != 200:
            continue
        try:
            json_data = response.json()
        except ValueError:
            # The body was not valid JSON.
            continue
        # `or` also covers explicit nulls in the payload.
        updates = (json_data.get('message') or {}).get('cr-labs-updates') or []
        update_nature_list.extend(update.get('update-nature') for update in updates)
    return update_nature_list

# Stream the publication metadata in fixed-size batches
csv_filename = "OpenAlex_retractions.csv"
batch_size = 1000
batches = pd.read_csv(csv_filename, chunksize=batch_size)

# For each batch: query Crossref, then save the counts and a chart
for i, batch in enumerate(batches):
    print(f"Processing batch {i + 1}")
    urls = [f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu" for doi in batch['doi']]
    update_nature_list = fetch_update_nature(urls)

    if not update_nature_list:
        print(f"No data available for batch {i + 1}")
        continue

    # Frequency of each distinct update-nature value within this batch
    update_nature_df = pd.DataFrame({'update-nature': update_nature_list})
    update_nature_counts = update_nature_df['update-nature'].value_counts()

    # Save the counts to a per-batch CSV file
    output_filename = f"update_nature_counts_batch_{i + 1}.csv"
    update_nature_counts.to_csv(output_filename, header=True)

    # Save the bar chart to a per-batch PNG and close the figure
    update_nature_counts.plot(kind='bar')
    plt.xticks(rotation=45)
    plt.ylabel('Counts')
    plt.xlabel('Update Nature')
    plt.title(f'Counts of Update Nature - Batch {i + 1}')
    plt.tight_layout()
    plt.savefig(f"update_nature_plot_batch_{i + 1}.png")
    plt.close()

    

Processing batch 1
Processing batch 2
Processing batch 3


KeyboardInterrupt: 

In [2]:
import pandas as pd

# Load only the columns needed from the CSV export
csv_filename = "OpenAlex_retractions.csv"
columns_to_keep = [
    'id',
    'display_name',
    'doi',
    'crossref_call',
    'primary_topic_domain_display_name',
    'primary_location_display_name',
]
openalexexport_df = pd.read_csv(csv_filename, usecols=columns_to_keep)

# Show the first five rows as a quick sanity check
print(openalexexport_df.head(n=5))

                                 id  \
0  https://openalex.org/W3001195213   
1  https://openalex.org/W1974047233   
2  https://openalex.org/W2118796952   
3  https://openalex.org/W3010930696   
4  https://openalex.org/W2166000498   

                                        display_name  \
0  Detection of 2019 novel coronavirus (2019-nCoV...   
1  DNA methylation age of human tissues and cell ...   
2  Chimeric Antigen Receptor T Cells for Sustaine...   
3  Hydroxychloroquine and azithromycin as a treat...   
4  Primary Prevention of Cardiovascular Disease w...   

                   primary_location_display_name  \
0                               Eurosurveillance   
1              GenomeBiology.com (London. Print)   
2            The New England Journal of Medicine   
3  International Journal of Antimicrobial Agents   
4            The New England Journal of Medicine   

                                                 doi  \
0  https://doi.org/10.2807/1560-7917.es.2020.25.3...   
1  

In [None]:
import pandas as pd
import requests
import time

# Function to fetch JSON on update-nature
def fetch_update_nature(doi, timeout=30):
    """Return the 'update-nature' values for one DOI as a ';'-joined string.

    Args:
        doi: Value from the 'doi' column. It is inserted verbatim into the
            request URL.
            NOTE(review): the previewed 'doi' values are full
            https://doi.org/... URLs — confirm the API accepts that form.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole ``apply`` over the DOI column.

    Returns:
        The 'update-nature' values of all updates joined with ';'. An empty
        string is returned when the request fails, the status is not 200, the
        body is not valid JSON, or no update carries the key.
    """
    url = f"https://api.labs.crossref.org/works/{doi}?mailto=christian.hauschke@tib.eu"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the error and carry on so one bad request does not lose the batch.
        print(f"Request failed for {url}: {exc}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        json_data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return ""
    # `or` also covers explicit nulls in the payload.
    updates = (json_data.get('message') or {}).get('cr-labs-updates') or []
    # Skip updates without the key: str.join raises TypeError on None.
    natures = (update.get('update-nature') for update in updates)
    return ";".join(str(nature) for nature in natures if nature is not None)

# Process the CSV in batches, resuming after `start_line` data rows
csv_filename = "OpenAlex_retractions.csv"
start_line = 0  # Number of data rows to skip, i.e. rows already processed
batch_size = 100

# Export one CSV file per batch of `batch_size` DOIs until the input is exhausted
while True:
    # File line 0 is the header and data row k sits on file line k + 1, so
    # skipping lines 1..start_line drops exactly the first `start_line` data
    # rows. The earlier range(1, start_line) stopped one line short, which
    # made every batch after the first repeat the previous batch's last row.
    openalexexport_df = pd.read_csv(
        csv_filename,
        skiprows=range(1, start_line + 1),
        nrows=batch_size,
    )
    if openalexexport_df.empty:
        break

    # Extend DataFrame with update_nature column (one API request per DOI)
    openalexexport_df['update_nature'] = openalexexport_df['doi'].apply(fetch_update_nature)

    # The file name carries the cumulative number of rows covered so far
    export_filename = f"retraction_metadata_{start_line + len(openalexexport_df)}.csv"
    openalexexport_df.to_csv(export_filename, index=False)
    print(f"Exported to {export_filename}")
    time.sleep(2)  # Wait for 2 seconds before starting the next batch
    start_line += batch_size



Exported to retraction_metadata_100.csv
Exported to retraction_metadata_200.csv
Exported to retraction_metadata_300.csv
Exported to retraction_metadata_400.csv
Exported to retraction_metadata_500.csv
Exported to retraction_metadata_600.csv
Exported to retraction_metadata_700.csv
Exported to retraction_metadata_800.csv
Exported to retraction_metadata_900.csv
Exported to retraction_metadata_1000.csv
Exported to retraction_metadata_1100.csv
Exported to retraction_metadata_1200.csv
Exported to retraction_metadata_1300.csv
Exported to retraction_metadata_1400.csv
