In [None]:
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import powerlaw
import scipy

In [None]:
# Directory that holds the gzipped, tab-separated OpenAlex tables read below.
data_path = "/scratch/network/science-of-science/data/openalex-202201"

In [None]:
def read_prepared_data(path: str, source: str, cols: list, types: dict) -> pd.DataFrame:
    """Load selected columns of a gzipped, tab-separated table into one DataFrame.

    The file ``{path}/{source}.txt.gz`` is read in chunks of 10,000 rows and the
    chunks are concatenated; lines pandas considers malformed are skipped.

    Args:
        path: Directory containing the table.
        source: Table name, i.e. the file name without the ``.txt.gz`` suffix.
        cols: Column names to keep (forwarded as ``usecols``).
        types: Mapping of column name to dtype (forwarded as ``dtype``).

    Returns:
        A single DataFrame holding every chunk that was read.
    """
    chunk_reader = pd.read_csv(
        f"{path}/{source}.txt.gz",
        sep='\t',
        compression='gzip',
        usecols=cols,
        dtype=types,
        chunksize=10000,
        on_bad_lines='skip',
        low_memory=False,
    )
    return pd.concat(list(chunk_reader))
    
def fitting_procedure_yearly(year: int, col: pd.Series):
    """Fit a discrete power law to one year's citation counts and plot the fit.

    The empirical PDF and the fitted power-law PDF are drawn on the axes that
    ``fit.plot_pdf`` returns, together with a legend, axis labels and a title.

    Args:
        year: Year the counts belong to; only used in the legend label.
        col: Citation counts to fit.

    Returns:
        Tuple ``(k_min_fit, gamma)``: the fitted lower cutoff (xmin) and the
        power-law exponent (alpha), each rounded to one decimal place.
    """
    # powerlaw.Fit searches for the optimal xmin itself when none is supplied,
    # so the result is read from fit.xmin rather than calling find_xmin() a
    # second time and repeating the most expensive step of the fit.
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)

    # plot_pdf returns the Axes it drew on; the fitted curve is overlaid on it.
    ax = fit.plot_pdf(linewidth=3, label=f'Empirical Data: Year={year}, Gamma={gamma}')
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, loc=3)
    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citation Frequency")
    ax.set_title(r"Probability Density Function of Citation Frequency for Papers Cited this Year in Journals")
    return k_min_fit, gamma

def fitting_procedure_total(col: pd.Series):
    """Fit a discrete power law to citation counts and plot the fit.

    Draws the empirical PDF and the fitted power-law PDF on the axes returned
    by ``fit.plot_pdf`` and sets the axis labels and title. No legend is added.

    Args:
        col: Citation counts to fit.

    Returns:
        Tuple ``(k_min_fit, gamma)``: the fitted lower cutoff (xmin) and the
        power-law exponent (alpha), each rounded to one decimal place.
    """
    # powerlaw.Fit searches for the optimal xmin itself when none is supplied,
    # so the result is read from fit.xmin rather than calling find_xmin() a
    # second time and repeating the most expensive step of the fit.
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)

    # plot_pdf returns the Axes it drew on; the fitted curve is overlaid on it.
    ax = fit.plot_pdf(linewidth=3)
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')
    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citation Frequency")
    ax.set_title(r"Probability Density Function of Citation Frequency for Papers Cited this Year in Journals from 1819 to 1999")
    return k_min_fit, gamma

In [None]:
# PaperReferences: (PaperId, PaperReferenceId) pairs, read directly as nullable integers.
cited_ids_df = read_prepared_data(
    path=data_path,
    source="PaperReferences",
    cols=['PaperId', 'PaperReferenceId'],
    types={"PaperId": 'Int64', "PaperReferenceId": 'Int64'},
)
# Papers: id, year and journal are read as strings here and converted to numbers afterwards.
papers_df = read_prepared_data(
    path=data_path,
    source="Papers",
    cols=['PaperId', 'Year', 'JournalId'],
    types={"PaperId": str, "Year": str, "JournalId": str},
)

In [None]:
# The Papers columns were loaded as strings; convert each one to a nullable
# integer column, turning anything unparseable into <NA> instead of raising.
for column in ('PaperId', 'Year', 'JournalId'):
    papers_df[column] = pd.to_numeric(papers_df[column], errors='coerce', downcast='integer').astype('Int64')

# Index the reference table by PaperId. The guard keeps the cell re-runnable:
# once PaperId is the index it is no longer a column, and an unconditional
# set_index would raise KeyError on a second execution.
if 'PaperId' in cited_ids_df.columns:
    cited_ids_df = cited_ids_df.set_index('PaperId')

In [None]:
# Keep only the rows where PaperId, Year and JournalId are all present.
papers_without_nulls_df = papers_df.dropna(axis='index', how='any')

In [None]:
# Restrict to papers whose Year is strictly before 2000.
papers_1800s_1900s_df = papers_without_nulls_df.loc[papers_without_nulls_df['Year'].lt(2000)]

In [None]:
# Attach Year and JournalId to every reference row whose PaperId is among the pre-2000 papers.
cited_ids_and_years_df = cited_ids_df.merge(papers_1800s_1900s_df, how='inner', on='PaperId')

In [None]:
# For every row, store how many rows share its (PaperReferenceId, Year, JournalId) combination.
cited_ids_and_years_df['CitationCount'] = (
    cited_ids_and_years_df
    .groupby(by=['PaperReferenceId', 'Year', 'JournalId'])
    .transform('size')
)

In [None]:
# PaperId is no longer needed once the counts exist; without it, rows of the
# same group are identical and collapse to a single row.
cited_ids_and_years_df = (
    cited_ids_and_years_df
    .drop('PaperId', axis='columns')
    .drop_duplicates(keep='first')
)

In [None]:
# Drop the referenced-paper id, leaving only Year, JournalId and CitationCount.
cited_ids_and_years_df = cited_ids_and_years_df.drop('PaperReferenceId', axis='columns')

In [None]:
# Sum the citation counts per (JournalId, Year); the keys stay as regular columns.
cited_papers_and_journals_df = (
    cited_ids_and_years_df
    .groupby(["JournalId", "Year"], as_index=False)
    .agg(CitationCount=('CitationCount', 'sum'))
)

In [None]:
# Years that are left out of the per-year fits.
# NOTE(review): presumably these years have too little data to fit — confirm.
years_to_remove = [1802, 1803, 1805, 1806, 1807, 1809, 1810, 1811, 1813, 1814, 1815, 1816, 1820, 1821, 1823, 1824, 1825, 1826, 1829, 1831, 1837, 1838, 1839, 1843]

# Filter instead of calling list.remove(): remove() raises ValueError as soon as
# one of the listed years is absent from the data.
excluded_years = set(years_to_remove)
unique_years_cited_papers = [
    year
    for year in sorted(cited_papers_and_journals_df.Year.unique())
    if year not in excluded_years
]

# Map each remaining year to its rows; Year is dropped because it is constant
# within each per-year frame.
cited_papers_dict = {
    year: cited_papers_and_journals_df.loc[cited_papers_and_journals_df.Year == year].drop(columns=['Year'])
    for year in unique_years_cited_papers
}

In [None]:
# Fit and plot each year separately; plt.show() after every fit gives each year its own figure.
for year, yearly_counts_df in cited_papers_dict.items():
    yearly_counts = yearly_counts_df['CitationCount']
    k_min, gamma = fitting_procedure_yearly(year, yearly_counts)
    print(f"{year}, {gamma}")
    plt.show()

In [None]:
# Run the "total" fitting helper on every year's counts and print the rounded exponent.
# NOTE(review): there is no plt.show() in this loop, so successive plots presumably
# accumulate on the same axes, and the helper's title says "from 1819 to 1999" although
# each call receives a single year's data — confirm this is intended.
for year, yearly_counts_df in cited_papers_dict.items():
    yearly_counts = yearly_counts_df['CitationCount']
    k_min, gamma = fitting_procedure_total(yearly_counts)
    print(f"{year}, {gamma}")