In [None]:
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import powerlaw

In [None]:
# Directory holding the `<source>.txt.gz` tables that read_prepared_data loads.
# NOTE(review): absolute, machine-specific path (presumably the OpenAlex 2022-01
# snapshot, judging by the directory name) — adjust when running elsewhere.
data_path = '/scratch/network/science-of-science/data/openalex-202201'

In [None]:
def read_prepared_data(path: str, source: str, cols: list, types: dict) -> pd.DataFrame:
    """Load selected columns of ``{path}/{source}.txt.gz`` into a single DataFrame.

    The file is parsed as gzip-compressed, tab-separated text in chunks of
    10,000 rows, and the chunks are concatenated in reading order.

    Args:
        path: Directory that contains the file.
        source: File name without the ``.txt.gz`` suffix.
        cols: Column names to keep (forwarded as ``usecols``).
        types: Column-name -> dtype mapping (forwarded as ``dtype``).

    Returns:
        One DataFrame holding every parsed chunk.
    """
    chunk_reader = pd.read_csv(
        f"{path}/{source}.txt.gz",
        sep='\t',
        compression='gzip',
        usecols=cols,
        dtype=types,
        chunksize=10000,
        on_bad_lines='skip',  # malformed lines are dropped instead of raising
        low_memory=False,
    )
    return pd.concat(list(chunk_reader))
    
def fitting_procedure_yearly(year: int, col: pd.Series):
    """Fit a discrete power law to ``col`` and plot the empirical PDF with the fit.

    Args:
        year: Value interpolated into the legend label of the empirical curve.
        col: Data handed to ``powerlaw.Fit`` (callers pass one year's
            ``CitationCount`` column).

    Returns:
        The fitted ``alpha`` rounded to one decimal place.
    """
    power_fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    gamma = round(power_fit.alpha, 1)

    # plot_pdf's return value is used as the Matplotlib axes for everything below.
    empirical_label = f'Empirical Data: Year={year}, Gamma={gamma}'
    ax = power_fit.plot_pdf(linewidth=3, label=empirical_label)
    power_fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')

    ax.set_title("Probability Density Function of Citation Frequency for Journal Papers Cited this Year")
    ax.set_xlabel("Citations (X)")
    ax.set_ylabel("p(X)")

    # Legend built from both curves; loc=3 is Matplotlib's code for "lower left".
    ax.legend(*ax.get_legend_handles_labels(), loc=3)
    return gamma

def fitting_procedure_total(col: pd.Series, color):
    """Fit a discrete power law to ``col`` and draw its PDF and fit in one colour.

    No legend is created here; the caller invokes this once per year with a
    different ``color`` each time.

    Args:
        col: Data handed to ``powerlaw.Fit`` (callers pass one year's
            ``CitationCount`` column).
        color: Colour used for both the empirical curve and the dashed fit.

    Returns:
        The fitted ``alpha`` rounded to one decimal place.
    """
    power_fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    gamma = round(power_fit.alpha, 1)

    # plot_pdf's return value is used as the Matplotlib axes for everything below.
    ax = power_fit.plot_pdf(linewidth=3, color=color)
    power_fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit', color=color)

    ax.set_title("Probability Density Function of Citation Frequency for Journal Papers Cited by Year (2012 to 2017)")
    ax.set_xlabel("Citations (X)")
    ax.set_ylabel("p(X)")
    return gamma

In [None]:
# PaperId / PaperReferenceId pairs, parsed directly as nullable Int64.
cited_ids_df = read_prepared_data(
    path=data_path,
    source="PaperReferences",
    cols=['PaperId', 'PaperReferenceId'],
    types={"PaperId": 'Int64', "PaperReferenceId": 'Int64'},
)

# Paper metadata is loaded as strings here; numeric conversion happens in a later cell.
papers_df = read_prepared_data(
    path=data_path,
    source="Papers",
    cols=['PaperId', 'Year', 'JournalId'],
    types={"PaperId": str, "Year": str, "JournalId": str},
)

In [None]:
# Parse every column to numbers (unparseable entries become missing because of
# errors='coerce') and store the result as nullable Int64.
for column_name in ('PaperId', 'Year', 'JournalId'):
    papers_df[column_name] = pd.to_numeric(
        papers_df[column_name], errors='coerce', downcast='integer'
    ).astype('Int64')

# Keep only the rows in which no column is missing.
papers_without_nulls_df = papers_df.dropna()

In [None]:
# Move PaperId into the index of the reference table. The guard keeps this cell
# re-runnable: once PaperId is the index, a second set_index('PaperId') would
# raise KeyError because the column no longer exists.
if cited_ids_df.index.name != 'PaperId':
    cited_ids_df = cited_ids_df.set_index('PaperId')

# Papers whose Year lies in 2012-2017 (both ends included).
papers_2012_to_2017_df = papers_without_nulls_df[
    (papers_without_nulls_df['Year'] >= 2012) & (papers_without_nulls_df['Year'] <= 2017)
]

# Inner join on PaperId: every surviving reference row gains the Year and JournalId
# of the paper it was matched to.
# NOTE(review): Year/JournalId therefore belong to the row matched on PaperId
# (presumably the citing paper), not to PaperReferenceId — confirm this is the
# intended meaning of "journal papers cited".
cited_ids_and_years_df = pd.merge(cited_ids_df, papers_2012_to_2017_df, on='PaperId')

# Number of joined rows per (PaperReferenceId, Year, JournalId), broadcast back to
# each row. A single column is selected so the transform always yields a Series
# instead of depending on how a frame-wide transform('size') is shaped.
cited_ids_and_years_df['CitationCount'] = (
    cited_ids_and_years_df
    .groupby(['PaperReferenceId', 'Year', 'JournalId'])['PaperId']
    .transform('size')
)

# Rows sharing (PaperReferenceId, Year, JournalId) carry the same CitationCount, so
# once PaperId is dropped, drop_duplicates leaves one row per group and each group
# is counted only once in the sum below.
cited_ids_and_years_df = (
    cited_ids_and_years_df
    .drop(columns=['PaperId'])
    .drop_duplicates()
    .drop(columns=['PaperReferenceId'])
)

# Total CitationCount per (JournalId, Year).
cited_papers_and_journals_df = (
    cited_ids_and_years_df
    .groupby(by=["JournalId", "Year"], as_index=False)
    .agg({'CitationCount': 'sum'})
)

In [None]:
# Distinct years present in the aggregated table, in ascending order.
unique_years_cited_papers = sorted(cited_papers_and_journals_df.Year.unique())

# Map each year to its rows of the aggregated table, minus the Year column
# (which is constant within each slice).
cited_papers_dict = {
    year: cited_papers_and_journals_df[cited_papers_and_journals_df.Year == year].drop(columns=['Year'])
    for year in unique_years_cited_papers
}

In [None]:
# One fit and one rendered figure per year; "<year>, <gamma>" is printed before
# each figure is shown.
for year, year_df in cited_papers_dict.items():
    gamma = fitting_procedure_yearly(year, year_df['CitationCount'])
    print(f"{year}, {gamma}")
    plt.show()

In [None]:
# One RdBu colour per year, sampled evenly across the colormap.
num_colors = len(cited_papers_dict)
colors = cm.RdBu(np.linspace(0, 1, num_colors))

# Fit every year in turn; plt.show() is called once after the loop rather than
# once per year.
for (year, year_df), year_color in zip(cited_papers_dict.items(), colors):
    gamma = fitting_procedure_total(year_df['CitationCount'], year_color)
    print(f"{year}, {gamma}")

plt.show()