In [None]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import powerlaw
import scipy
from scipy import stats

In [None]:
# Directory holding the prepared, gzip-compressed tab-separated tables;
# read_prepared_data() resolves "<data_path>/<source>.txt.gz" beneath it.
# NOTE(review): judging by the directory name this is presumably the
# January 2022 OpenAlex snapshot -- confirm before citing it as provenance.
data_path = '/scratch/network/science-of-science/data/openalex-202201'

In [None]:
def read_prepared_data(path: str, source: str, cols: list, types: dict) -> pd.DataFrame:
    """Load selected columns of ``{path}/{source}.txt.gz`` into one DataFrame.

    The file is parsed as gzip-compressed, tab-separated text in chunks of
    10,000 rows; malformed lines are skipped. The chunks are concatenated in
    file order.

    Args:
        path: Directory that contains the table.
        source: Table name, i.e. the file name without ``.txt.gz``.
        cols: Column names to keep.
        types: Mapping of column name to the dtype to parse it as.

    Returns:
        A single DataFrame holding the requested columns of every parsed row.
    """
    chunk_reader = pd.read_csv(
        f"{path}/{source}.txt.gz",
        sep='\t',
        compression='gzip',
        usecols=cols,
        dtype=types,
        chunksize=10000,
        on_bad_lines='skip',
        low_memory=False,
    )
    # Materialise every chunk first, then stitch them together in one concat.
    return pd.concat(list(chunk_reader))
    
def fitting_procedure_yearly(year: int, col: pd.Series) -> tuple:
    """Fit a discrete power law to one year's citation counts and plot it.

    Draws the empirical PDF and the fitted power-law PDF on the same axes,
    then adds a legend, axis labels and a title. The figure is not shown
    here; the caller decides when to call ``plt.show()``.

    Args:
        year: Publication year; only used in the legend label.
        col: Citation counts to fit.

    Returns:
        ``(k_min_fit, gamma)``: the fitted lower cutoff (xmin) and the
        power-law exponent (alpha), each rounded to one decimal place.
    """
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    # No xmin was supplied, so Fit() has already searched for the best one.
    # Read the stored value instead of calling find_xmin() again, which would
    # repeat the whole scan over candidate cutoffs.
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)

    # plot_pdf returns the matplotlib Axes it drew on; reuse it for the fit
    # line so both curves share one plot.
    ax = fit.plot_pdf(linewidth=3, label=f'Empirical Data: Year={year}, Gamma={gamma}')
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit')

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, loc=3)  # loc=3 is matplotlib's 'lower left'
    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citations (X)")
    ax.set_title(r"Probability Density Function of Citation Frequency for Cited Journal Papers Published this Year")
    return k_min_fit, gamma

def fitting_procedure_total(col: pd.Series, color) -> tuple:
    """Fit a discrete power law to citation counts and draw it in one colour.

    Draws the empirical PDF and the fitted power-law PDF in ``color`` and sets
    the axis labels and title. No legend is added and the figure is not
    shown, so repeated calls can overlay several fits before the caller runs
    ``plt.show()``.

    Args:
        col: Citation counts to fit.
        color: Any matplotlib colour specification, applied to both curves.

    Returns:
        ``(k_min_fit, gamma)``: the fitted lower cutoff (xmin) and the
        power-law exponent (alpha), each rounded to one decimal place.
    """
    fit = powerlaw.Fit(col, discrete=True, fit_method='KS')
    # No xmin was supplied, so Fit() has already searched for the best one.
    # Read the stored value instead of calling find_xmin() again, which would
    # repeat the whole scan over candidate cutoffs.
    k_min_fit = round(fit.xmin, 1)
    gamma = round(fit.alpha, 1)

    # plot_pdf returns the matplotlib Axes it drew on; reuse it for the fit
    # line so both curves share one plot.
    ax = fit.plot_pdf(linewidth=3, color=color)
    fit.power_law.plot_pdf(ax=ax, linestyle='--', label='Power law fit', color=color)

    ax.set_ylabel("p(X)")
    ax.set_xlabel(r"Citations (X)")
    ax.set_title(r"Probability Density Function of Citation Frequency for Cited Journal Papers Published 1930 to 2022")
    return k_min_fit, gamma

In [None]:
# Each row of PaperReferences contributes the ID of one cited paper. The column
# is renamed to PaperId so it shares a key name with the Papers table.
cited_ids_df = read_prepared_data(
    data_path,
    "PaperReferences",
    ['PaperReferenceId'],
    {"PaperReferenceId": 'Int64'},
).rename(columns={"PaperReferenceId": "PaperId"})

# All three Papers columns are read as plain strings at this stage.
papers_df = read_prepared_data(
    data_path,
    "Papers",
    ['PaperId', 'Year', 'JournalId'],
    {"PaperId": str, "Year": str, "JournalId": str},
)

In [None]:
# Turn the string columns into nullable integers; anything that does not
# parse as a number becomes <NA>. Every column is converted from its OWN
# values -- JournalId was previously overwritten with the Year values, which
# made every later group-by on JournalId a group-by on Year.
for column in ('PaperId', 'Year', 'JournalId'):
    papers_df[column] = pd.to_numeric(papers_df[column], errors='coerce').astype('Int64')

# Keep only papers with an ID, a year and a journal. Rows without a PaperId
# cannot be matched to a citation, and rows without a JournalId are not
# journal papers, so they are removed before the year filter is applied.
papers_without_nulls_df = papers_df.dropna()

# Publication years 1930 through 2022, both ends included.
papers_1930_to_2022_df = papers_without_nulls_df[
    (papers_without_nulls_df['Year'] > 1929) & (papers_without_nulls_df['Year'] < 2023)
]

In [None]:
# Collapse the reference list to one row per cited paper: index = PaperId,
# single column CitationCount = number of references pointing at that paper.
#
# Aggregating with size() avoids building a count column as long as the raw
# reference table only to throw most of it away with drop_duplicates().
# sort=False keeps papers in order of first appearance. References whose ID
# is missing are left out of the counts (groupby drops NA keys by default).
#
# The guard makes the cell safe to re-run: once the counts exist, running it
# again must not count the already-aggregated rows a second time.
if 'CitationCount' not in cited_ids_df.columns:
    cited_ids_df = (
        cited_ids_df.groupby('PaperId', sort=False)
        .size()
        .to_frame('CitationCount')
    )

In [None]:
# Join citation counts to the 1930-2022 papers on PaperId (default inner
# join, so only papers present on both sides remain).
cited_ids_and_years_df = cited_ids_df.merge(papers_1930_to_2022_df, on='PaperId')

# The paper ID is not needed once the rows are matched.
cited_ids_and_years_without_paperid_df = cited_ids_and_years_df.drop(columns=['PaperId'])

# Sum the citation counts within every (JournalId, Year) pair; the grouping
# keys stay as ordinary columns.
journal_year_groups = cited_ids_and_years_without_paperid_df.groupby(
    by=["JournalId", "Year"],
    as_index=False,
)
cited_papers_and_journals_df = journal_year_groups.agg({'CitationCount': 'sum'})

In [None]:
# Build one DataFrame of citation counts per publication year.
# `subset` is given as a list because a bare string is only accepted by newer
# pandas releases.
cited_papers_df = cited_papers_and_journals_df.dropna(subset=['Year'])

unique_years_cited_papers = sorted(cited_papers_df.Year.unique())

# Split the table in a single pass instead of running one full boolean scan
# per year. groupby yields the years in ascending order, and each group keeps
# its original rows and index; the now-constant Year column is dropped.
cited_papers_dict = {
    year: year_df.drop(columns=['Year'])
    for year, year_df in cited_papers_df.groupby('Year')
}

In [None]:
# Fit every publication year separately, showing one figure per year, and
# collect the (year, gamma) pairs for the trend regression.
x, y = [], []
for key, df in cited_papers_dict.items():
    k_min, gamma = fitting_procedure_yearly(key, df['CitationCount'])
    x += [key]
    y += [gamma]
    print(f"{key}, {gamma}")
    plt.show()

In [None]:
# One colour per publication year, sampled evenly across the RdBu colormap.
num_colors = len(cited_papers_dict)
colors = cm.RdBu(np.linspace(0, 1, num_colors))

# Draw every year's fit before showing anything, so all curves land in a
# single figure.
for color, (key, df) in zip(colors, cited_papers_dict.items()):
    k_min, gamma = fitting_procedure_total(df['CitationCount'], color)
    print(f"{key}, {gamma}")

plt.show()

In [None]:
# Linear trend of the fitted exponent (gamma) over publication year.
x = np.asarray(x)
y = np.asarray(y)

# Use the explicitly imported `stats` submodule: a bare `import scipy` does not
# expose `scipy.stats` on SciPy releases that do not lazy-load submodules.
res = stats.linregress(x, y)

plt.plot(x, y, 'o', label='results (year, gamma)')
plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
plt.title('Change in Gamma (1930 to 2022)')
plt.xlabel('Year')
plt.ylabel('Gamma')
plt.legend()
plt.show()
print("Slope:", res.slope)