In [None]:
import matplotlib.pyplot as plt 
import pandas as pd

In [None]:
# Directory holding the prepared tables; read_prepared_data() looks for
# "<source>.txt.gz" (gzip-compressed, tab-separated) inside this directory.
data_path = '/scratch/network/science-of-science/data/openalex-202201'

In [None]:
def read_prepared_data(path: str, source: str, cols: list, types: dict) -> pd.DataFrame:
    """Load selected columns of ``{path}/{source}.txt.gz`` into one DataFrame.

    The file is parsed as gzip-compressed, tab-separated text in chunks of
    10,000 rows. Lines pandas considers malformed are skipped, and the
    chunks are concatenated in reading order.

    Args:
        path: Directory that contains the table file.
        source: Table name, i.e. the file name without the ``.txt.gz`` suffix.
        cols: Columns to keep (forwarded to ``usecols``).
        types: Column-name -> dtype mapping (forwarded to ``dtype``).

    Returns:
        A DataFrame holding the requested columns of every parsed row.
    """
    chunk_reader = pd.read_csv(
        f"{path}/{source}.txt.gz",
        sep='\t',
        compression='gzip',
        usecols=cols,
        dtype=types,
        chunksize=10000,
        on_bad_lines='skip',
        low_memory=False,
    )
    return pd.concat([chunk for chunk in chunk_reader])

In [None]:
# Load only the PaperId and Year columns of the "Papers" table. Both are read
# as strings; numeric conversion is left to a later step.
papers_df = read_prepared_data(
    path=data_path,
    source="Papers",
    cols=['PaperId', 'Year'],
    types={"PaperId": str, "Year": str},
)

In [None]:
# Turn the string columns into nullable integers: anything that does not parse
# as a number becomes missing (errors='coerce') and is stored as <NA> in Int64.
for numeric_col in ('PaperId', 'Year'):
    parsed = pd.to_numeric(papers_df[numeric_col], errors='coerce', downcast='integer')
    papers_df[numeric_col] = parsed.astype('Int64')

# Keep only the rows where both PaperId and Year are present.
papers_without_nulls_df = papers_df.dropna()

In [None]:
# Check the range of years: attach to every row the number of rows that share
# its Year, sort by Year, then keep only years inside 1800-2024.
#
# .assign() builds a new frame instead of setting a column on the result of
# dropna(), which can trigger pandas' SettingWithCopyWarning. The name is
# rebound so later cells still find PapersCount on papers_without_nulls_df.
papers_without_nulls_df = papers_without_nulls_df.assign(
    PapersCount=papers_without_nulls_df.groupby('Year')['Year'].transform('size')
)
sorted_papers_df = papers_without_nulls_df.sort_values(by=['Year'])
# between() is inclusive on both ends, i.e. 1800 <= Year <= 2024.
sorted_papers_within_reasonable_years_df = sorted_papers_df[sorted_papers_df['Year'].between(1800, 2024)]
sorted_papers_within_reasonable_years_df

In [None]:
# PapersCount was produced with a groupby transform, so it is repeated on every
# row of a given Year. Plotting the frame as-is draws one marker per row, all
# stacked on the same point for each year; de-duplicating first yields the same
# picture with one marker per (Year, PapersCount) pair.
papers_per_year_1800_2024_df = (
    sorted_papers_within_reasonable_years_df[['Year', 'PapersCount']].drop_duplicates()
)
ax = papers_per_year_1800_2024_df.plot.scatter(x='Year', y='PapersCount', color='green')
ax.set_title('Number of Papers Published from 1800 to 2024')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
plt.show()

In [None]:
# Keep only the rows whose year has at least 100,000 papers.
sufficient_num_papers_per_year_df = sorted_papers_within_reasonable_years_df[
    sorted_papers_within_reasonable_years_df['PapersCount'] >= 100000
]
# Sort the *filtered* frame. Sorting sorted_papers_within_reasonable_years_df
# here instead would silently discard the >= 100000 filter applied just above.
sorted_sufficient_num_papers_per_year_df = sufficient_num_papers_per_year_df.sort_values(by=['Year'])
sorted_sufficient_num_papers_per_year_df  # count citations of papers published/cited between 1930 and 2022

In [None]:
# PapersCount is a per-year value repeated on every row of that year, so the
# raw frame would draw one marker per row, all overlapping. Reduce it to one
# row per (Year, PapersCount) pair before plotting; the rendered points are
# the same.
sufficient_papers_per_year_points_df = (
    sorted_sufficient_num_papers_per_year_df[['Year', 'PapersCount']].drop_duplicates()
)
ax = sufficient_papers_per_year_points_df.plot.scatter(x='Year', y='PapersCount', color='green')
ax.set_title('Number of Papers Published from 1930 to 2022')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
plt.show()