In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import powerlaw
import scipy
import scipy.stats

In [None]:
# Directory containing the gzipped, tab-separated "<source>.txt.gz" tables that
# read_prepared_data() loads. NOTE(review): absolute cluster path; the name
# suggests an OpenAlex 2022-01 snapshot -- confirm and adjust for your machine.
data_path = '/scratch/network/science-of-science/data/openalex-202201'

In [None]:
def read_prepared_data(path: str, source: str, cols: list, types: dict) -> pd.DataFrame:
    """Load selected columns of ``{path}/{source}.txt.gz`` into one DataFrame.

    The file is parsed as gzip-compressed, tab-separated text in chunks of
    10,000 rows; malformed lines are skipped rather than raising.

    Parameters
    ----------
    path : directory that contains the table.
    source : table name, i.e. the file name without the ``.txt.gz`` suffix.
    cols : column names to keep (passed to ``usecols``).
    types : mapping of column name to dtype (passed to ``dtype``).

    Returns
    -------
    pd.DataFrame
        All chunks concatenated in file order, with the row index produced
        by the chunked reader.
    """
    # The context manager guarantees the reader (and the underlying gzip
    # handle) is closed even if parsing fails partway through the file.
    with pd.read_csv(
        f"{path}/{source}.txt.gz",
        chunksize=10000,
        usecols=cols,
        sep='\t',
        compression='gzip',
        dtype=types,
        on_bad_lines='skip',
        low_memory=False,
    ) as reader:
        chunks = list(reader)
    return pd.concat(chunks)
    
def fitting_procedure_yearly(year: int, col: pd.Series) -> tuple:
    """Fit a discrete power law to one year's values and plot the result.

    Draws the empirical PDF and the fitted power-law PDF on the current
    pyplot axes (no ``ax`` is passed to the first ``plot_pdf`` call), adds a
    legend, axis labels and a title. The caller is responsible for showing
    or closing the figure.

    Parameters
    ----------
    year : year used in the legend label of the empirical curve.
    col : values to fit (the notebook passes a ``CitationCount`` column).

    Returns
    -------
    tuple
        ``(k_min_fit, gamma)``: the fitted lower cutoff ``xmin`` and the
        exponent ``alpha``, each rounded to one decimal place.
    """
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    # No xmin was supplied, so the constructor has already searched for the
    # optimal one; read the stored value instead of calling find_xmin() again
    # and repeating the whole (expensive) search.
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)
    # plot_pdf returns the matplotlib Axes it drew on.
    ax = fit.plot_pdf(linewidth=3, label=f'Empirical Data: Year={year}, Gamma={gamma}')
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')
    ax.legend(loc=3)  # loc=3 is matplotlib's 'lower left'
    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citation Frequency")
    ax.set_title(r"Probability Density Function of Citation Frequency for Cited Papers Published this Year in Journals")
    return k_min_fit, gamma

def fitting_procedure_total(col: pd.Series) -> tuple:
    """Fit a discrete power law to ``col`` and plot it without a legend.

    Draws the empirical PDF and the fitted power-law PDF on the current
    pyplot axes (no ``ax`` is passed to the first ``plot_pdf`` call), so
    repeated calls without an intervening ``plt.show()`` accumulate on the
    same axes. Only axis labels and a title are set; no legend is drawn.

    Parameters
    ----------
    col : values to fit (the notebook passes a ``CitationCount`` column).

    Returns
    -------
    tuple
        ``(k_min_fit, gamma)``: the fitted lower cutoff ``xmin`` and the
        exponent ``alpha``, each rounded to one decimal place.
    """
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    # No xmin was supplied, so the constructor has already searched for the
    # optimal one; read the stored value instead of calling find_xmin() again
    # and repeating the whole (expensive) search.
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)
    # plot_pdf returns the matplotlib Axes it drew on.
    ax = fit.plot_pdf(linewidth=3)
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')
    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citation Frequency")
    ax.set_title(r"Probability Density Function of Citation Frequency for Cited Papers Published in Journals from 1800 to 2022")
    return k_min_fit, gamma

In [None]:
# Load only the three columns the analysis needs, all as raw strings; they are
# converted to numbers in a later step.
papers_df = read_prepared_data(
    data_path,
    "Papers",
    cols=['Year', 'JournalId', 'CitationCount'],
    types={"Year": str, "JournalId": str, "CitationCount": str},
)

In [None]:
# Parse each string column to numbers (unparseable entries become missing),
# then store it as nullable Int64 so missing values survive alongside integers.
for column_name in ('CitationCount', 'Year', 'JournalId'):
    parsed = pd.to_numeric(papers_df[column_name], errors='coerce', downcast='integer')
    papers_df[column_name] = parsed.astype('Int64')

In [None]:
# Keep only rows where Year, JournalId and CitationCount are all present.
papers_without_nulls_df = papers_df.dropna(axis='index', how='any')

In [None]:
# Discard papers whose citation count is exactly zero.
has_citations = papers_without_nulls_df['CitationCount'].ne(0)
papers_without_nulls_and_zeros_df = papers_without_nulls_df.loc[has_citations]

In [None]:
# Total citation count per (journal, publication year); the grouping keys stay
# as ordinary columns rather than becoming the index.
cited_papers_and_journals_df = (
    papers_without_nulls_and_zeros_df
    .groupby(["JournalId", "Year"], as_index=False)
    .agg(CitationCount=('CitationCount', 'sum'))
)

In [None]:
# Sorted list of publication years present in the aggregated table.
unique_years_cited_papers = sorted(cited_papers_and_journals_df.Year.unique())

# One frame per publication year (keys in ascending order, as groupby sorts
# them), each without the now-constant 'Year' column. A single groupby pass
# replaces scanning the whole table once per year and rebuilding the dict.
cited_papers_dict = {
    year: year_df.drop(columns=['Year'])
    for year, year_df in cited_papers_and_journals_df.groupby('Year')
}

In [None]:
# Fit and plot every year separately. Only years from 1890 onward are collected
# into x (year) and y (gamma) for the later trend regression.
x, y = [], []
for key in cited_papers_dict:
    df = cited_papers_dict[key]
    k_min, gamma = fitting_procedure_yearly(key, df['CitationCount'])
    if key >= 1890:
        x += [key]
        y += [gamma]
    print(f"{key}, {gamma}")
    plt.show()  # render this year's figure before the next fit draws a new one

In [None]:
# Fit every year again, this time without calling plt.show() between
# iterations, so the curves accumulate on the current axes.
# NOTE(review): the overlay of all years in one figure looks intentional -- confirm.
for key in cited_papers_dict:
    df = cited_papers_dict[key]
    k_min, gamma = fitting_procedure_total(df['CitationCount'])
    print(f"{key}, {gamma}")

In [None]:
# Linear trend of the fitted exponent over time: least-squares line through
# the collected (year, gamma) pairs.
x = np.asarray(x)
y = np.asarray(y)
# Relies on the explicit `import scipy.stats` in the import cell: a bare
# `import scipy` does not load the stats submodule on older SciPy releases.
res = scipy.stats.linregress(x, y)

fig, ax = plt.subplots()
ax.plot(x, y, 'o', label='results (year, gamma)')
ax.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
ax.set_title('Change in Gamma (1890 to 2022)')
ax.set_xlabel('Year')
ax.set_ylabel('Gamma')
ax.legend()
plt.show()
print("Slope:", res.slope)