In [7]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pyalex
import pandas as pd 
from tqdm import tqdm

# Set the contact email on pyalex's global config.
# NOTE(review): presumably this identifies the client to the OpenAlex API —
# confirm. Consider loading the address from an environment variable instead
# of committing a personal email in the notebook.
pyalex.config.email = "hi387@nyu.edu"

In [8]:
def clean_authorships(authorships):
    """Return the number of authorship records attached to a work.

    Args:
        authorships: Sized collection of authorship records.

    Returns:
        The length of `authorships`.
    """
    author_count = len(authorships)
    return author_count

def clean_concepts(concepts):
    """Pick a single representative concept name for a work.

    The first concept whose 'level' is 0 wins; when none has level 0 the
    first concept in the list is used instead.

    Args:
        concepts: List of dicts, each with 'level' and 'display_name' keys.

    Returns:
        The chosen concept's 'display_name', or None when `concepts` is
        empty or None (previously an empty list raised IndexError).
    """
    # Guard the empty case so a work without concepts no longer raises.
    if not concepts:
        return None

    chosen = next(
        (concept for concept in concepts if concept['level'] == 0),
        concepts[0],
    )
    return chosen['display_name']


def clean_counts_by_year(publication_year, counts_by_year, horizons=(1, 5, 10, 25, 50)):
    """Sum citation counts up to several horizons after publication.

    For each horizon `h`, adds up 'cited_by_count' over every entry whose
    'year' is at most `publication_year + h`. There is no lower bound, so
    entries dated before the publication year are included too.

    Args:
        publication_year: Year the work was published.
        counts_by_year: List of dicts with 'year' and 'cited_by_count' keys.
            None is treated like an empty list.
        horizons: Year offsets to report, in output order. The default
            reproduces the original five values (1, 5, 10, 25, 50).

    Returns:
        A list with one cumulative citation total per horizon.

    NOTE(review): only the years present in `counts_by_year` are summed. If
    the API returns a limited window of years, older works will be
    undercounted at every horizon — confirm against the data source.
    """
    yearly = counts_by_year or []
    totals = []
    for horizon in horizons:
        cutoff_offset = horizon
        totals.append(
            sum(
                item['cited_by_count']
                for item in yearly
                # Computed lazily so an empty list never touches publication_year.
                if item['year'] <= publication_year + cutoff_offset
            )
        )
    return totals

def clean_entry(entry, current_year=2023):
    """Flatten one work record into a 13-item feature row.

    Args:
        entry: Mapping with the keys 'id', 'publication_year', 'authorships',
            'countries_distinct_count', 'institutions_distinct_count',
            'concepts', 'counts_by_year' and 'cited_by_count'.
        current_year: Year from which the age of the work is measured.
            Defaults to 2023, the value that was previously hard-coded.

    Returns:
        A list ordered as: id, publication year, years since publication,
        number of authorships, distinct country count, distinct institution
        count, representative concept, cumulative citations at the 1, 5, 10,
        25 and 50 year horizons, and total citations.

    Raises:
        KeyError: If `entry` lacks one of the keys listed above.
        TypeError: If 'publication_year' is None, since the age cannot be
            computed.
    """
    # `work_id` rather than `id`, to avoid shadowing the builtin.
    work_id = entry['id']
    publication_year = entry['publication_year']
    years_since_publication = current_year - publication_year

    author_count = clean_authorships(entry['authorships'])
    top_concept = clean_concepts(entry['concepts'])
    year_1, year_5, year_10, year_25, year_50 = clean_counts_by_year(
        publication_year, entry['counts_by_year']
    )

    return [
        work_id,
        publication_year,
        years_since_publication,
        author_count,
        entry['countries_distinct_count'],
        entry['institutions_distinct_count'],
        top_concept,
        year_1,
        year_5,
        year_10,
        year_25,
        year_50,
        entry['cited_by_count'],
    ]

In [10]:
# Column order matches the list returned by clean_entry.
COLUMNS = ['id', 'pub_year', 'years_since_pub', 'authorships', 'countries_distinct_count', 'institutions_distinct_count', 'concepts', 'year_1', 'year_5', 'year_10', 'year_25', 'year_50', 'total_citations']
DATA_PATH = './data.csv'
N_BATCHES = 7000        # number of sampling requests to issue
BATCH_SIZE = 100        # works requested per sample
CHECKPOINT_EVERY = 100  # write the CSV every this many batches


def _flush(collected, pending):
    """Append the pending batch frames to `collected` with a single concat."""
    frames = [frame for frame in (collected, *pending) if not frame.empty]
    return pd.concat(frames, ignore_index=True) if frames else collected


# Resume from an earlier run when the CSV exists; otherwise start empty.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    df = pd.DataFrame(columns=COLUMNS)

# Batches are buffered and merged only at checkpoints, so the growing frame
# is not re-copied on every iteration.
pending = []
for i in tqdm(range(N_BATCHES)):
    try:
        # NOTE(review): confirm that .get() returns all BATCH_SIZE sampled
        # works rather than only the first page of results.
        works = Works().sample(BATCH_SIZE).get()
        rows = [clean_entry(work) for work in works]
        pending.append(pd.DataFrame(rows, columns=COLUMNS))
    except Exception as exc:
        # Still best-effort: a failed batch is skipped, but the reason is
        # reported and Ctrl-C is no longer swallowed.
        tqdm.write(f"batch {i} skipped: {exc!r}")

    if i % CHECKPOINT_EVERY == 0:
        df = _flush(df, pending)
        pending = []
        df.to_csv(DATA_PATH, index=False)

df = _flush(df, pending)
df.to_csv(DATA_PATH, index=False)

100%|██████████| 7000/7000 [4:16:09<00:00,  2.20s/it]   


In [13]:
# Reload the collected rows from disk and preview the first few.
df = pd.read_csv('./data.csv')
df.head()

Unnamed: 0,id,pub_year,years_since_pub,authorships,countries_distinct_count,institutions_distinct_count,concepts,year_1,year_5,year_10,year_25,year_50,total_citations
0,https://openalex.org/W153152856,2007,16,10,0,0,Medicine,0,0,0,0,0,0
1,https://openalex.org/W2418723612,2002,21,2,1,1,Medicine,0,0,0,0,0,0
2,https://openalex.org/W2092897175,2010,13,9,1,1,Medicine,0,9,18,22,22,22
3,https://openalex.org/W2620772774,2013,10,4,2,2,Medicine,0,0,0,0,0,0
4,https://openalex.org/W2064953989,1926,97,1,0,0,Art,0,0,0,0,0,2


In [14]:
# Row count before and after collapsing repeated work ids; the first
# occurrence of each id is the one kept.
print(df.shape[0])
dropped = df.drop_duplicates(subset='id', keep='first')
print(dropped.shape[0])

566667
565973


In [16]:
# Number of de-duplicated works published in 1945 or later that have at
# least one authorship.
len(dropped[dropped['pub_year'].ge(1945) & dropped['authorships'].ge(1)])

501834

In [4]:
# Reload the CSV and preview it again.
# NOTE(review): this repeats an earlier cell and its execution count is lower
# than the cells before it, so it looks like it was run in a fresh kernel —
# confirm the notebook still works under Restart & Run All.
df = pd.read_csv('./data.csv')
df.head()

Unnamed: 0,id,pub_year,years_since_pub,authorships,countries_distinct_count,institutions_distinct_count,concepts,year_1,year_5,year_10,year_25,year_50,total_citations
0,https://openalex.org/W153152856,2007,16,10,0,0,Medicine,0,0,0,0,0,0
1,https://openalex.org/W2418723612,2002,21,2,1,1,Medicine,0,0,0,0,0,0
2,https://openalex.org/W2092897175,2010,13,9,1,1,Medicine,0,9,18,22,22,22
3,https://openalex.org/W2620772774,2013,10,4,2,2,Medicine,0,0,0,0,0,0
4,https://openalex.org/W2064953989,1926,97,1,0,0,Art,0,0,0,0,0,2


In [6]:
# Look up a single work by its OpenAlex id (the id in the first previewed
# row) and display its raw authorships list.
w = Works()["W153152856"]
w['authorships']

[{'author_position': 'first',
  'author': {'id': 'https://openalex.org/A5029506025',
   'display_name': '健二 桧垣',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': '健二 桧垣',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5050466832',
   'display_name': '亮子 小野',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': '亮子 小野',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5089954329',
   'display_name': '彰一郎 大谷',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': '彰一郎 大谷',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5011472373',
   'display_name