In [None]:
import pandas as pd
import requests
from itertools import chain
import pickle
import numpy as np
# import matplotlib.pyplot as plt
import time

In [None]:
# Get works that cite the AlphaFold paper (OpenAlex work W3177828909), following
# the API's cursor pagination until no next cursor is returned.
url_with_cursor = "https://api.openalex.org/works?filter=cites:W3177828909,type:article&cursor={}"

# Collect one frame per page and concatenate once at the end; growing `df`
# with pd.concat on every page re-copies all earlier rows each time.
pages = []
cursor = '*'
while cursor:
    url = url_with_cursor.format(cursor)
    http_response = requests.get(url)
    # Fail on an HTTP error instead of on a confusing KeyError further down.
    http_response.raise_for_status()
    response = http_response.json()
    pages.append(pd.DataFrame(response['results']))
    cursor = response['meta']['next_cursor']

df = pd.concat(pages) if pages else pd.DataFrame()

In [None]:
# Replace the per-page index left by the concatenation with a fresh 0..n-1
# index; drop=True discards the old index instead of inserting it as a column.
df = df.reset_index(drop=True)

In [None]:
# For rows flagged `is_authors_truncated`, request the work individually and
# overwrite the row's `authorships` with the list from that response.
truncated_index = df[df['is_authors_truncated']==True].index
for i in truncated_index:
    # The short work ID is the last path segment of the stored ID.
    # (Named `work_id` so the builtin `id` is not rebound.)
    work_id = df.loc[i, 'id'].split('/')[-1]
    url = f"https://api.openalex.org/works/{work_id}"
    http_response = requests.get(url)
    # Fail on an HTTP error instead of on a missing 'authorships' key.
    http_response.raise_for_status()
    response = http_response.json()
    authorships = response['authorships']
    df.at[i, 'authorships'] = authorships

In [None]:
# Distinct author IDs found in the `authorships` of any row of `df`.
# The set removes duplicates, so the order of the resulting list is arbitrary.
af_authors = list({
    author['author']['id']
    for authorships in df['authorships']
    for author in authorships
})

In [None]:
# I can filter using multiple IDs by using the '|' operator. (Up to 50 values)

def build_author_works_url(id_list):
    """Build the OpenAlex `works` URL that filters on a group of author IDs.

    Args:
        id_list: iterable of author ID strings; they are joined with '|'.

    Returns:
        The URL string with an `author.id` filter and a `type:article` filter.
    """
    author_filter = 'author.id:' + '|'.join(id_list)

    # 'type:article' excludes book-chapter, dissertation, book, dataset, paratext,
    # other, reference-entry, report, peer-review, standard, editorial, erratum,
    # grant, letter
    filter_param = ','.join([author_filter, 'type:article'])

    return 'https://api.openalex.org/works?filter=' + filter_param

In [None]:
def get_publications(id_longlist, done_ids=None, start=0):
    """Download the articles of many authors in chunks of 50 and pickle each chunk.

    For every chunk of up to 50 author IDs, all result pages are fetched via
    cursor pagination and written to `af_authors_works_0724_interim_<n>.pkl`.
    After each saved chunk the list of finished IDs is written to
    `done_ids.pkl`, so an interrupted run can be resumed.

    Args:
        id_longlist: author IDs to fetch.
        done_ids: IDs already fetched; they are skipped. The list is extended
            in place with the IDs finished by this call.
        start: number of interim files that already exist; numbering of new
            files continues from `start + 1`.

    Returns:
        `done_ids`, extended with the IDs of every chunk that was saved.
    """
    if done_ids is None:
        done_ids = []

    id_longlist = list(set(id_longlist) - set(done_ids))
    id_chunks = [id_longlist[i:i + 50] for i in range(0, len(id_longlist), 50)]
    counter = start
    authors_finished = 0

    for id_list in id_chunks:
        pages = []  # one DataFrame per result page of this chunk
        url_with_cursor = build_author_works_url(id_list) + "&cursor={}"
        cursor = '*'
        chunk_complete = True

        while cursor:
            url = url_with_cursor.format(cursor)
            try:
                response = requests.get(url)
                response.raise_for_status()
                response_json = response.json()
                pages.append(pd.DataFrame(response_json['results']))
                cursor = response_json['meta']['next_cursor']
            except (requests.RequestException, ValueError, KeyError) as e:
                print(f"Error fetching data: {e}")
                chunk_complete = False
                break

        if not chunk_complete:
            # Do not save a partial chunk or mark its authors as done;
            # otherwise their missing works could never be re-fetched.
            print(f'Skipped a chunk of {len(id_list)} authors; they are not marked as done')
            continue

        data = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

        counter += 1
        authors_finished += len(id_list)
        print(f'Finished {authors_finished} out of {len(id_longlist)} authors')
        print(f"Size of data before saving: {data.memory_usage().sum() / 1e6} MB")

        # Save the dataframe for this chunk
        with open(f'af_authors_works_0724_interim_{counter}.pkl', 'wb') as file:
            pickle.dump(data, file)

        done_ids.extend(id_list)

        # Save done_ids after each chunk
        with open('done_ids.pkl', 'wb') as file:
            pickle.dump(done_ids, file)

    return done_ids

In [None]:
# Fetch the works of every author in af_authors; the call writes the interim
# pickles as a side effect and returns the IDs it finished.
done_ids = get_publications(af_authors)

In [None]:
# Load the first interim file back as a spot check.
# NOTE(review): pickle.load can execute code from the file; only load files you produced yourself.
with open('af_authors_works_0724_interim_1.pkl', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Load `truncated_works` from disk; the file is not created in this notebook.
# NOTE(review): presumably a DataFrame of works with truncated author lists — confirm against its producer.
with open('truncated_works_short.pkl', 'rb') as file:
    truncated_works = pickle.load(file)

In [None]:
def extract_author_ids(authorships):
    """Return the `['author']['id']` value of every entry in `authorships`, in order."""
    return [entry['author']['id'] for entry in authorships]

def extract_journal(location):
    """Return a two-element Series `[source id, source display name]` for a location.

    Both elements are NaN when `location` is NaN, `None`, or its `'source'`
    entry is `None`.
    """
    missing = pd.Series([np.nan,np.nan])
    if isinstance(location, float) and np.isnan(location):
        return missing
    if location is None or location['source'] is None:
        return missing
    source = location['source']
    return pd.Series([source['id'], source['display_name']])

def extract_topic(topic):
    """Return a four-element Series of display names: topic, subfield, field, domain.

    All four elements are NaN when `topic` is NaN or `None`.
    """
    if topic is None or (isinstance(topic, float) and np.isnan(topic)):
        return pd.Series([np.nan] * 4)
    names = [topic['display_name']]
    for level in ('subfield', 'field', 'domain'):
        names.append(topic[level]['display_name'])
    return pd.Series(names)

In [None]:
# Load the IDs of the works to re-fetch; later cells split each one on '/' and
# use the last segment as the work ID.
with open('truncated_works_ids.pkl', 'rb') as file:
    truncated_works_ids = pickle.load(file)

In [None]:
# Trial run on the first ID: fetch that single work and extract its author IDs.
i = truncated_works_ids[0]
# Last path segment of the stored ID. NOTE(review): this rebinds the builtin
# `id`, and the next cell reads this name.
id = i.split('/')[-1]
url = f"https://api.openalex.org/works/{id}"
response = requests.get(url).json()
time.sleep(0.01)
authorships = response['authorships']
author_ids = extract_author_ids(authorships)

In [None]:
# Trial: start an empty (id, author_ids) frame and append the one row fetched above.
# The list is wrapped in another list so it is stored as a single cell value.
df = pd.DataFrame(columns=['id', 'author_ids'])
df = pd.concat([df, pd.DataFrame({'id': [id], 'author_ids': [author_ids]})], ignore_index=True)

In [None]:
# Fetch the complete author list of every work in `truncated_works_ids`.
# Rows are collected in a list and turned into a DataFrame once at the end
# (appending with pd.concat per row re-copies the whole frame each time).
# Works whose request fails are queued in `notdonelist` for another attempt.
records = []
notdonelist = []
for i in truncated_works_ids:
    # Named `work_id` so the builtin `id` is not rebound.
    work_id = i.split('/')[-1]
    url = f"https://api.openalex.org/works/{work_id}"
    try:
        http_response = requests.get(url)
        http_response.raise_for_status()
        response = http_response.json()
        time.sleep(0.01)
        authorships = response['authorships']
        author_ids = extract_author_ids(authorships)
        records.append({'id': work_id, 'author_ids': author_ids})
        print(f'{work_id} done')
    # Only request/parse failures are retried; a bare `except` would also
    # swallow KeyboardInterrupt and make the loop impossible to stop.
    except (requests.RequestException, ValueError, KeyError, TypeError) as error:
        print(f'{work_id} skipped: {error}')
        notdonelist.append(i)

df = pd.DataFrame(records, columns=['id', 'author_ids'])

In [None]:
# Second attempt for the works that failed in the first pass. Successful rows
# are gathered first and appended to `df` with a single concat.
retry_records = []
for i in notdonelist:
    # Named `work_id` so the builtin `id` is not rebound.
    work_id = i.split('/')[-1]
    url = f"https://api.openalex.org/works/{work_id}"
    try:
        http_response = requests.get(url)
        http_response.raise_for_status()
        response = http_response.json()
        time.sleep(0.01)
        authorships = response['authorships']
        author_ids = extract_author_ids(authorships)
        retry_records.append({'id': work_id, 'author_ids': author_ids})
        print(f'{work_id} done')
    # Catch only request/parse failures so KeyboardInterrupt still stops the loop.
    except (requests.RequestException, ValueError, KeyError, TypeError) as error:
        print(f'{work_id} skipped: {error}')

if retry_records:
    df = pd.concat([df, pd.DataFrame(retry_records, columns=['id', 'author_ids'])], ignore_index=True)

In [None]:
# Persist the (id, author_ids) frame built by the two fetch passes above.
with open('truncated_authorships.pkl','wb') as file:
    pickle.dump(df, file)

In [None]:
# Re-fetch the full author list for every row of `truncated_works`. Rows whose
# request fails stay in `truncated_works` and are retried in the next pass; the
# loop ends when nothing is left or when two passes fixed no row at all.
truncated_works = truncated_works.copy()
# 1 once the row's authorships were re-fetched. This must exist before the
# loop: the filters below select on it, and a missing value would match
# neither `== 1` nor `== 0`, silently dropping the row.
truncated_works['fixed'] = 0
if 'author_ids' not in truncated_works.columns:
    truncated_works['author_ids'] = None
# `.at` can only store a list in an object-dtype column.
truncated_works['author_ids'] = truncated_works['author_ids'].astype(object)

count = 0  # number of passes that fixed nothing
fixed_batches = []  # rows fixed in each pass, concatenated once after the loop
while len(truncated_works)>0:
    left = len(truncated_works)
    for position, i in enumerate(truncated_works.index, start=1):
        # Named `work_id` so the builtin `id` is not rebound.
        work_id = truncated_works.loc[i, 'id'].split('/')[-1]
        url = f"https://api.openalex.org/works/{work_id}"
        try:
            http_response = requests.get(url)
            http_response.raise_for_status()
            response = http_response.json()
            time.sleep(0.01)
            authorships = response['authorships']
            authors_count = len(authorships)
            author_ids = extract_author_ids(authorships)
            truncated_works.at[i, 'author_ids'] = author_ids
            truncated_works.at[i, 'authors_count'] = authors_count
            truncated_works.at[i, 'fixed'] = 1
            # Report the position within this pass, not the index label.
            print(f'{work_id} done ({position}/{left})')
        # Catch only request/parse failures so KeyboardInterrupt still stops the loop.
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            print(f'{work_id} skipped: {error}')
    fixed_mask = truncated_works['fixed']==1
    fixed_batches.append(truncated_works[fixed_mask])
    # .copy() so the next pass writes into an independent frame, not a slice.
    truncated_works = truncated_works[~fixed_mask].copy()
    print(f'cycle finished: {len(truncated_works)} left')
    if len(truncated_works) == left:
        count += 1
        if count == 2:
            with open('truncated_works_unfixed.pkl', 'wb') as file:
                pickle.dump(truncated_works, file)
            break

# Defined even when the loop body never ran (empty input).
truncated_works_fixed = pd.concat(fixed_batches) if fixed_batches else truncated_works.iloc[:0]

with open('truncated_works_fixed.pkl', 'wb') as file:
    pickle.dump(truncated_works_fixed, file)

In [None]:
# Debug cell: request the first work still left in `truncated_works` and keep
# the raw response object for inspection.
# NOTE(review): .iloc[0] raises IndexError when `truncated_works` is empty.
test_id = truncated_works.iloc[0]['id'].split('/')[-1]
url = f"https://api.openalex.org/works/{test_id}"
response = requests.get(url)

In [None]:
# Replace the in-memory `af_authors` with the version stored on disk.
# NOTE(review): presumably the author-ID list built earlier in this notebook, saved on a later date — confirm.
with open('af_authors_0725.pkl', 'rb') as file:
    af_authors = pickle.load(file)

In [None]:
# Flag works that have at least one author ID contained in `af_authors`.
# Membership is tested against a set: `in` on the original list scans the whole
# list for every single author.
af_authors_set = set(af_authors)
truncated_works_fixed['has_af_authors'] = truncated_works_fixed['author_ids'].apply(
    lambda ids: any(author_id in af_authors_set for author_id in ids)
)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def check_authors(author_ids, af_authors_set, chunk_size=1000):
    """Return a NumPy array with one flag per entry of `author_ids`.

    A flag is True when at least one ID of that entry is in `af_authors_set`.
    `chunk_size` is accepted but not used by this function.
    """
    flags = []
    for ids in author_ids:
        hit = False
        for author_id in ids:
            if author_id in af_authors_set:
                hit = True
                break
        flags.append(hit)
    return np.array(flags)

def process_dataframe(df, af_authors, chunk_size=1000):
    """Add a boolean `has_af_authors` column to `df` and return `df`.

    A row is True when at least one entry of its `author_ids` is contained in
    `af_authors`. The frame is modified in place: `author_ids` values stored as
    strings are parsed into Python objects, and `has_af_authors` is
    (re)created.

    Args:
        df: DataFrame with an `author_ids` column holding lists of IDs, or the
            string representation of such lists.
        af_authors: iterable of author IDs to test against.
        chunk_size: number of rows handled per progress-bar step.

    Returns:
        The same `df` object, with the `has_af_authors` column filled in.
    """
    import ast  # local import: only needed to parse stringified lists

    # Convert af_authors to a set for faster lookup
    af_authors_set = set(af_authors)

    # A column of lists and a column of their string representations both have
    # object dtype, so the need for parsing is decided per value rather than by
    # dtype. literal_eval only accepts Python literals, so it cannot run code
    # the way eval does.
    df['author_ids'] = df['author_ids'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

    # Initialize the result column
    df['has_af_authors'] = False
    flag_col = df.columns.get_loc('has_af_authors')

    # Process the dataframe in chunks
    total_chunks = (len(df) + chunk_size - 1) // chunk_size

    with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
        for i in range(0, len(df), chunk_size):
            chunk = df.iloc[i:i+chunk_size]
            chunk_result = check_authors(chunk['author_ids'], af_authors_set)
            # Write back by position so duplicate index labels cannot
            # misdirect the assignment.
            df.iloc[i:i+chunk_size, flag_col] = chunk_result
            pbar.update(1)

    return df

# Usage
# truncated_works_fixed = process_dataframe(truncated_works_fixed, af_authors)

In [None]:
# Recompute `has_af_authors` with the chunked, set-based helper (shows a progress bar).
truncated_works_fixed = process_dataframe(truncated_works_fixed, af_authors)

In [None]:
# Keep only the works flagged `has_af_authors`. `.copy()` makes this an
# independent frame, so the columns added to it in later cells do not trigger
# pandas' SettingWithCopyWarning.
truncated_works_has_af_authors = truncated_works_fixed[truncated_works_fixed['has_af_authors']].copy()

In [None]:
# For each work, keep only those of its author IDs that are in `af_authors`.
af_authors_set = set(af_authors)


def _keep_af_authors(ids):
    """Return the entries of `ids` that are in `af_authors_set`, in their original order."""
    return [author_id for author_id in ids if author_id in af_authors_set]


truncated_works_has_af_authors['af_authors'] = truncated_works_has_af_authors['author_ids'].apply(_keep_af_authors)

In [None]:
# Number of matched authors per work.
truncated_works_has_af_authors['af_authors_count'] = truncated_works_has_af_authors['af_authors'].apply(len)

In [None]:
# Flatten the per-work ID lists into one list (duplicates are kept at this stage).
# chain.from_iterable runs in linear time and yields [] for an empty frame;
# Series.sum() builds a new list per row and returns the integer 0 when empty.
af_authors_with_truncated_works = list(chain.from_iterable(truncated_works_has_af_authors['af_authors']))

In [None]:
# Print the number of author occurrences, drop duplicate IDs, then print the
# number of distinct authors. The set conversion does not preserve order.
print(len(af_authors_with_truncated_works))
af_authors_with_truncated_works = list(set(af_authors_with_truncated_works))
print(len(af_authors_with_truncated_works))

In [None]:
# Map every author ID to the sub-frame of works that list that author.
# One pass over the frame records, per author, the row positions of their
# works; filtering the whole frame once per author would rescan every row for
# every author.
positions_by_author = {}
for position, row_authors in enumerate(truncated_works_has_af_authors['af_authors']):
    # set() so a work listing the same author twice is selected only once.
    for author_id in set(row_authors):
        positions_by_author.setdefault(author_id, []).append(position)

af_truncated_works_dict = {
    # An author without any matching row gets an empty frame.
    author_id: truncated_works_has_af_authors.iloc[positions_by_author.get(author_id, [])]
    for author_id in af_authors_with_truncated_works
}

with open('af_truncated_works_dict.pkl', 'wb') as file:
    pickle.dump(af_truncated_works_dict, file)

In [None]:
# Load the source IDs to look up; the list is passed to get_journal_info below.
# NOTE(review): the file is not created in this notebook — confirm its producer.
with open('journal_list.pkl', 'rb') as file:
    journal_list = pickle.load(file)

In [None]:
# Several IDs can be combined in one filter with the '|' operator (up to 50 values).
def build_url(id_list):
    """Build the OpenAlex `sources` URL that filters on a group of IDs.

    Args:
        id_list: iterable of ID strings; they are joined with '|'.

    Returns:
        The URL string with an `openalex` filter and `per-page=50`.
    """
    joined_ids = '|'.join(id_list)
    return 'https://api.openalex.org/sources?filter=openalex:' + joined_ids + '&per-page=50'


In [None]:
def get_journal_info(id_longlist):
    """Fetch the source records for a list of IDs, 50 IDs per request.

    Args:
        id_longlist: list of source IDs.

    Returns:
        A DataFrame with the `results` of all requests concatenated (each
        request's own index is kept); an empty DataFrame for an empty input.

    Raises:
        requests.HTTPError: when a request returns an error status.
    """
    # split the list into chunks of 50
    id_chunks = [id_longlist[i:i + 50] for i in range(0, len(id_longlist), 50)]
    # Actual number of requests; len(id_longlist)//50 undercounts whenever the
    # length is not a multiple of 50.
    chunks = len(id_chunks)
    pages = []

    for count, id_list in enumerate(id_chunks, start=1):
        url = build_url(id_list)
        response = requests.get(url)
        response.raise_for_status()
        pages.append(pd.DataFrame(response.json()['results']))
        print(f'{count}/{chunks}')

    if not pages:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the frame on every request.
    return pd.concat(pages)

In [None]:
# Download the source records for every ID in journal_list.
journal_df = get_journal_info(journal_list)

In [None]:
# Save the full download before the column subset is taken in the next cell.
with open('journal_df.pkl', 'wb') as file:
    pickle.dump(journal_df, file)

In [None]:
# Keep only the columns used below. NOTE(review): this rebinds `journal_df`,
# so the dropped columns are only available again from journal_df.pkl.
journal_df = journal_df[['id', 'issn_l', 'issn', 'display_name', 'type', 'is_oa', 'apc_usd', 'country_code']]

In [None]:
# Inspect the distinct values of the `type` column.
journal_df['type'].unique()

In [None]:
# Show the sources that have a non-null `issn_l`.
journal_df[journal_df['issn_l'].notnull()]

In [None]:
# Load scie.csv; later cells read its `ISSN` and `eISSN` columns.
scie = pd.read_csv('scie.csv')

In [None]:
# All ISSNs and eISSNs of the SCIE list, ISSNs first. Missing values are
# dropped: a NaN left in the list makes `isin` match every row whose own value
# is missing.
scie_issns = pd.concat([scie['ISSN'], scie['eISSN']]).dropna().tolist()

In [None]:
# Show the sources whose `issn_l` appears in `scie_issns`.
journal_df[journal_df['issn_l'].isin(scie_issns)]

In [None]:
# One ISSN per journal: the print ISSN when present, otherwise the eISSN.
# fillna does this column-wise; the row-wise `x[0]` / `x[1]` lookup relied on
# positional access to a label-indexed Series, which pandas has deprecated.
scie['issn'] = scie['ISSN'].fillna(scie['eISSN'])

In [None]:
# Inspect the combined `issn` column.
scie['issn']

In [None]:
# Load esci.csv; later cells read its `ISSN` and `eISSN` columns.
esci = pd.read_csv('esci.csv')

In [None]:
# One ISSN per journal: the print ISSN when present, otherwise the eISSN.
# fillna does this column-wise; the row-wise `x[0]` / `x[1]` lookup relied on
# positional access to a label-indexed Series, which pandas has deprecated.
esci['issn'] = esci['ISSN'].fillna(esci['eISSN'])

In [None]:
# Distinct ISSNs across both lists. Rows where both ISSN and eISSN are missing
# are dropped, because a NaN in this list would break the ';'.join that writes
# the chunk files.
issn_list = list(set(scie['issn'].dropna().tolist() + esci['issn'].dropna().tolist()))

In [None]:
# Save issn_list to text files of at most 600 ISSNs each, with the ISSNs separated by semicolons.
# NOTE(review): open() does not create the 'issn_chunks' directory, so it has to exist already.
issn_chunks = [issn_list[i:i + 600] for i in range(0, len(issn_list), 600)]

for i, chunk in enumerate(issn_chunks):
    with open(f'issn_chunks/issn_chunk_{i}.txt', 'w') as file:
        file.write(';'.join(chunk))

In [None]:
import io
import csv

In [None]:
def custom_csv_reader(file_path, skip_rows=2, skip_footer=3):
    """Read a CSV that has extra lines before the header and after the data.

    Args:
        file_path: path of the CSV file.
        skip_rows: number of leading lines to discard before the header line.
        skip_footer: number of trailing lines to discard; 0 keeps all lines.

    Returns:
        A DataFrame parsed from the header line and the data lines, where
        trailing commas have been stripped from the data lines only.
    """
    with open(file_path, 'r') as handle:
        # Discard the preamble.
        for _ in range(skip_rows):
            next(handle)
        body = handle.readlines()

    # Discard the footer.
    if skip_footer:
        body = body[:-skip_footer]

    # The header keeps its commas; only surrounding whitespace is removed.
    cleaned = [body[0].strip() + '\n']
    # Data lines lose trailing whitespace and then any trailing commas.
    for row in body[1:]:
        cleaned.append(row.rstrip().rstrip(',') + '\n')

    return pd.read_csv(io.StringIO(''.join(cleaned)))

In [None]:
# Read the 30 JCR export files (numbered 1 to 30) and stack them with a single
# concat. Using a comprehension leaves the notebook-level `df` untouched.
jcr_frames = [
    custom_csv_reader(f'jcr_chunks/KyuhunLee_JCR_JournalResults_08_2024-{i}.csv', skip_rows=2, skip_footer=2)
    for i in range(1, 31)
]
scie_esci = pd.concat(jcr_frames)

In [None]:
# ISSNs and eISSNs present in the JCR results loaded so far (missing values are not filtered here).
downloaded_issn_list = scie_esci['ISSN'].tolist() + scie_esci['eISSN'].tolist()

In [None]:
# ISSNs from issn_list that do not occur in the JCR results; the set difference does not preserve order.
not_downloaded_issn_list = list(set(issn_list) - set(downloaded_issn_list))

In [None]:
# Rows of each list whose combined `issn` or whose `eISSN` is still missing from the JCR results.
scie_left = scie[(scie['issn'].isin(not_downloaded_issn_list)) | (scie['eISSN'].isin(not_downloaded_issn_list))]
esci_left = esci[(esci['issn'].isin(not_downloaded_issn_list)) | (esci['eISSN'].isin(not_downloaded_issn_list))]

In [None]:
# Distinct `issn` and `eISSN` values of the remaining rows; may still contain NaN, removed in the next cell.
new_issn_list = list(set(scie_left['issn'].tolist() + scie_left['eISSN'].tolist() + esci_left['issn'].tolist() + esci_left['eISSN'].tolist()))

In [None]:
# Remove missing values so that every remaining entry can be joined into a string.
new_issn_list = [issn for issn in new_issn_list if pd.notnull(issn)]

In [None]:
# Save new_issn_list to text files of at most 600 ISSNs each, with the ISSNs separated by semicolons.
# NOTE(review): open() does not create the 'issn_chunks' directory, so it has to exist already.
new_issn_chunks = [new_issn_list[i:i + 600] for i in range(0, len(new_issn_list), 600)]

for i, chunk in enumerate(new_issn_chunks):
    with open(f'issn_chunks/new_issn_chunk_{i}.txt', 'w') as file:
        file.write(';'.join(chunk))

In [None]:
# Append the four additional export files (names ending in 'n') to scie_esci.
# NOTE(review): this extends scie_esci, so re-running the cell appends the same rows again.
for i in range(1,5):
    df = custom_csv_reader(f'jcr_chunks/KyuhunLee_JCR_JournalResults_08_2024-{i}n.csv', skip_rows=2, skip_footer=2)
    scie_esci = pd.concat([scie_esci, df])

In [None]:
# Save the combined JCR table.
with open('jcr.pkl', 'wb') as file:
    pickle.dump(scie_esci, file)