In [1]:
import glob, ast, dispindiffs
import polars as pl
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# ABDC Journal Quality List 2022, from https://abdc.edu.au/abdc-journal-quality-list/ - "Export full list".
# header=8: the column names are taken from the ninth row of the sheet.
abdc = pd.read_excel(
    "./dat/ABDC-JQL-2022-v3-100523.xlsx",
    sheet_name="2022 JQL",
    header=8,
)

# Normalise text fields: trim the rating, and strip tabs plus surrounding whitespace from both ISSN columns.
abdc["2022 rating"] = abdc["2022 rating"].str.strip()
for issn_col in ("ISSN", "ISSN Online"):
    abdc[issn_col] = abdc[issn_col].str.replace("\t", "").str.strip()

# Keep journals that have at least one ISSN, then drop exact duplicate rows.
has_any_issn = abdc["ISSN"].notna() | abdc["ISSN Online"].notna()
abdc = abdc[has_any_issn].reset_index(drop=True).drop_duplicates()

# ISSN -> rating and ISSN -> FoR lookups. Each journal is keyed by a single ISSN:
# the print ISSN when it is present and non-blank, otherwise the online ISSN.
abdc_issn_to_rating = {}
abdc_issn_to_for = {}
for _row_label, journal in abdc.iterrows():
    issn_key = None
    for candidate in (journal["ISSN"], journal["ISSN Online"]):
        if pd.notna(candidate) and candidate != "":
            # Only the first 9 characters are kept (the length of an NNNN-NNNN ISSN).
            issn_key = candidate[:9]
            break
    if issn_key is None:
        continue
    abdc_issn_to_rating[issn_key] = journal["2022 rating"]
    abdc_issn_to_for[issn_key] = journal["FoR"]
print(len(abdc_issn_to_rating))

2675


In [3]:
all_sources = pl.read_parquet("./dat/sciscinet_sources.parquet")
sourceid_to_name = {
    sid: name
    for sid, name in zip(all_sources["sourceid"], all_sources["display_name"])
}

# Match each source to the ABDC list through any of its ISSNs.
sourceid_to_FoR = {}
sourceid_to_rating = {}
for record in all_sources.iter_rows(named=True):
    raw_issn = record["issn"]
    # Sources without ISSNs carry the literal string "null".
    if raw_issn == "null":
        continue
    # The column stores a Python-literal collection of ISSNs as text.
    matched = [code for code in ast.literal_eval(raw_issn) if code in abdc_issn_to_rating]
    if not matched:
        continue
    # When several ISSNs of one source are in the ABDC list, the last one decides.
    deciding_issn = matched[-1]
    sourceid_to_FoR[record["sourceid"]] = abdc_issn_to_for[deciding_issn]
    sourceid_to_rating[record["sourceid"]] = abdc_issn_to_rating[deciding_issn]

# Sources found in the ABDC list, annotated with their FoR code and rating.
abdc_sources = (
    all_sources
    .filter(pl.col("sourceid").is_in(sourceid_to_FoR.keys()))
    .with_columns(
        pl.col("sourceid").replace_strict(sourceid_to_FoR, default="").alias("FoR"),
        pl.col("sourceid").replace_strict(sourceid_to_rating, default="").alias("rating"),
    )
)
# The full source table is no longer needed once the lookups exist.
del all_sources

In [4]:
# Field-of-Research (FoR) code -> human-readable field description.
# Used to label the per-FoR rows of the final evaluation table.
for_to_desc = {
    3501: "Accounting, auditing and accountability",
    3502: "Banking, finance and investment",
    3503: "Business systems in context",
    3504: "Commercial services",
    3505: "Human resources and industrial relations",
    3506: "Marketing",
    3507: "Strategy, management and organisational behaviour",
    3508: "Tourism",
    3509: "Transportation, logistics and supply chains",
    3599: "Other commerce, management, tourism and services",
    3801: "Applied economics",
    3802: "Econometrics",
    3803: "Economic theory",
    3899: "Other economics",
    4609: "Information systems",
    4801: "Commercial law",
    4905: "Statistics"
}


def abdc_eval_prep(elist: pl.DataFrame, source: str, target: str,
                   sourceid_to_name: dict,
                   sourceid_to_FoR: dict,
                   sourceid_to_rating: dict) -> pl.DataFrame:
    """Restrict an edge list to same-field ABDC journal pairs and annotate it.

    Parameters
    ----------
    elist : pl.DataFrame
        Edge list; it is not modified (every step returns a new frame).
    source, target : str
        Names of the columns in ``elist`` holding the two endpoint source ids.
    sourceid_to_name : dict
        Source id -> display name.
    sourceid_to_FoR : dict
        Source id -> FoR code. Its keys define which journals are kept.
    sourceid_to_rating : dict
        Source id -> rating.

    Returns
    -------
    pl.DataFrame
        Rows of ``elist`` whose two endpoints are both keys of
        ``sourceid_to_FoR`` and share the same FoR code, with the extra
        columns ``source_FoR``, ``target_FoR``, ``source_rating``,
        ``target_rating``, ``source_name`` and ``target_name``.
        Ids missing from a mapping yield null in the corresponding column.
    """
    # Pass an explicit key list rather than the dict itself, so the membership
    # test does not depend on how polars treats a mapping argument.
    abdc_ids = list(sourceid_to_FoR)

    annotated = (
        elist
        .filter(pl.col(source).is_in(abdc_ids) & pl.col(target).is_in(abdc_ids))
        # Look up FoR / rating / name for both endpoints via dictionary replacement.
        .with_columns(
            pl.col(source).replace_strict(sourceid_to_FoR, default=None).alias("source_FoR"),
            pl.col(target).replace_strict(sourceid_to_FoR, default=None).alias("target_FoR"),
            pl.col(source).replace_strict(sourceid_to_rating, default=None).alias("source_rating"),
            pl.col(target).replace_strict(sourceid_to_rating, default=None).alias("target_rating"),
            pl.col(source).replace_strict(sourceid_to_name, default=None).alias("source_name"),
            pl.col(target).replace_strict(sourceid_to_name, default=None).alias("target_name"),
        )
    )

    # Keep only edges whose endpoints belong to the same field.
    return annotated.filter(pl.col("source_FoR") == pl.col("target_FoR"))

In [5]:
# Journal citation edge list: citing journal, cited journal, edge weight.
edge_columns = ["citing_sourceid", "cited_sourceid", "w_ij"]
elist = pl.read_parquet("./dat/final_jnl_elist.parquet").select(edge_columns)

# Network size: distinct journals appearing at either end of an edge, and number of edge rows.
citing_ids = set(elist["citing_sourceid"].unique())
cited_ids = set(elist["cited_sourceid"].unique())
N_T = len(citing_ids.union(cited_ids))
E_T = elist.height
print(N_T, E_T)

90872 76512317


In [None]:
%%time

# Build the backbone-extraction object on the full journal edge list
# (citing journal -> cited journal, weighted by w_ij).
Journals = dispindiffs.DisparityInDifferences(elist, source="citing_sourceid", target="cited_sourceid", weight="w_ij")

# Run both scoring passes up front; the extr_*_backbone calls in later cells are made on this object.
# NOTE(review): presumably these compute the disparity-filter and disparity-in-differences
# edge statistics respectively - confirm against the dispindiffs package.
Journals.calc_disp()
Journals.calc_disp_in_diffs()

Merging bilateral relations
Generating pre-sampled values from beta distributions
Calculating statistical significance


### Disparity Filter

In [None]:
# Sweep thresholds 10**-20 ... 10**0 (exponent lowered in steps of 0.25) and
# record the size of the disparity backbone at each one.
sweep_thresholds = [10**(-k) for k in np.arange(20, -0.1, -0.25)]

n_nodes_edges_by_th = []
for candidate_th in tqdm(sweep_thresholds):
    # The extractor hands back the threshold alongside the node and edge counts;
    # the reported threshold (not the loop value) is what gets stored.
    _backbone, reported_th, node_count, edge_count = Journals.extr_disp_backbone(th=candidate_th)
    n_nodes_edges_by_th.append((reported_th, node_count, edge_count))

sweep_table = pd.DataFrame(n_nodes_edges_by_th, columns=["th", "n_nodes", "n_edges"])
sweep_table.to_csv("./outputs/journals_disp_info_by_th.csv", index=False)

In [None]:
# Disparity backbone at th=0.01. The extractor returns (backbone, th, n_nodes, n_edges);
# take the first element by index rather than unpacking the rest into `_`, because
# assigning to `_` in a notebook overrides IPython's last-output variable.
disp_backbone = Journals.extr_disp_backbone(th=0.01)[0]

### Disparity-in-Differences

In [None]:
# Same threshold grid as for the disparity filter: 10**-20 ... 10**0, exponent lowered in
# steps of 0.25. Records the size of the disparity-in-differences backbone at each threshold.
sweep_thresholds = [10**(-k) for k in np.arange(20, -0.1, -0.25)]

n_nodes_edges_by_th = []
for candidate_th in tqdm(sweep_thresholds):
    # Store the threshold as reported back by the extractor, with the node and edge counts.
    _backbone, reported_th, node_count, edge_count = Journals.extr_disp_in_diffs_backbone(th=candidate_th)
    n_nodes_edges_by_th.append((reported_th, node_count, edge_count))

sweep_table = pd.DataFrame(n_nodes_edges_by_th, columns=["th", "n_nodes", "n_edges"])
sweep_table.to_csv("./outputs/journals_disp_in_diffs_info_by_th.csv", index=False)

In [None]:
# Disparity-in-differences backbone at th=0.01. The extractor returns
# (backbone, th, n_nodes, n_edges); take the first element by index rather than unpacking
# the rest into `_`, because assigning to `_` in a notebook overrides IPython's last-output variable.
disp_in_diffs_backbone = Journals.extr_disp_in_diffs_backbone(th=0.01)[0]

### Validation

In [None]:
# Annotate the disparity backbone with FoR / rating / name columns, keep same-field ABDC pairs,
# and switch to pandas for the cross-tabulation that follows.
# NOTE: this overwrites the polars backbone with its pandas form, so re-running the cell
# would hand a pandas frame to abdc_eval_prep, which expects polars.
disp_backbone = abdc_eval_prep(
    disp_backbone,
    source="source",
    target="target",
    sourceid_to_name=sourceid_to_name,
    sourceid_to_FoR=sourceid_to_FoR,
    sourceid_to_rating=sourceid_to_rating,
).to_pandas()

# Give both rating columns the same fixed category order.
rating_levels = ["A*", "A", "B", "C"]
for rating_col in ("source_rating", "target_rating"):
    disp_backbone[rating_col] = pd.Categorical(disp_backbone[rating_col], categories=rating_levels)

In [None]:
# Per-FoR summary of how backbone edges sit relative to the rating order A*, A, B, C:
#   misaligned rate = share of edges strictly above the diagonal of the source x target rating
#                     matrix (source rating listed before the target rating),
#   diagonal rate   = share of edges whose endpoints have the same rating.
rating_levels = ["A*", "A", "B", "C"]

eval_disparity = []
for for_code in sorted(disp_backbone["source_FoR"].unique()):
    bb_disparity_for = disp_backbone[disp_backbone["source_FoR"] == for_code]

    # crosstab omits rating categories that do not occur in this field, which can leave a
    # non-square or shifted matrix. Reindex to the full ordered grid so that row i and
    # column i always refer to the same rating before taking triangle/diagonal sums.
    mat = (
        pd.crosstab(bb_disparity_for["source_rating"], bb_disparity_for["target_rating"])
        .reindex(index=rating_levels, columns=rating_levels, fill_value=0)
    )

    mat_np = np.array(mat)
    total = mat_np.sum()
    if total == 0:
        # No rated edges in this field: both rates are undefined.
        e = diag = float("nan")
    else:
        e = np.triu(mat_np, k=1).sum() / total
        diag = np.diag(mat_np).sum() / total

    eval_disparity.append((for_code, e, diag))

In [None]:
# Tabulate the per-FoR results and show the unweighted mean of each rate across fields.
rate_columns = ["misaligned_rate", "diag_rate"]
eval_disparity = pd.DataFrame(eval_disparity, columns=["FoR"] + rate_columns)
tuple(eval_disparity[rate_col].mean() for rate_col in rate_columns)

In [None]:
# Inspect the disparity-in-differences backbone before it is annotated for validation.
disp_in_diffs_backbone

In [None]:
# Annotate the disparity-in-differences backbone with FoR / rating / name columns, keep
# same-field ABDC pairs, and switch to pandas for the cross-tabulation that follows.
# NOTE: this overwrites the polars backbone with its pandas form, so re-running the cell
# would hand a pandas frame to abdc_eval_prep, which expects polars.
disp_in_diffs_backbone = abdc_eval_prep(
    disp_in_diffs_backbone,
    source="source",
    target="target",
    sourceid_to_name=sourceid_to_name,
    sourceid_to_FoR=sourceid_to_FoR,
    sourceid_to_rating=sourceid_to_rating,
).to_pandas()

# Give both rating columns the same fixed category order.
rating_levels = ["A*", "A", "B", "C"]
for rating_col in ("source_rating", "target_rating"):
    disp_in_diffs_backbone[rating_col] = pd.Categorical(
        disp_in_diffs_backbone[rating_col], categories=rating_levels
    )

disp_in_diffs_backbone.shape

In [None]:
# Per-FoR summary of how backbone edges sit relative to the rating order A*, A, B, C:
#   misaligned rate = share of edges strictly above the diagonal of the source x target rating
#                     matrix (source rating listed before the target rating),
#   diagonal rate   = share of edges whose endpoints have the same rating.
rating_levels = ["A*", "A", "B", "C"]

eval_disp_in_diffs = []
for for_code in sorted(disp_in_diffs_backbone["source_FoR"].unique()):
    bb_for = disp_in_diffs_backbone[disp_in_diffs_backbone["source_FoR"] == for_code]

    # crosstab omits rating categories that do not occur in this field, which can leave a
    # non-square or shifted matrix. Reindex to the full ordered grid so that row i and
    # column i always refer to the same rating before taking triangle/diagonal sums.
    mat = (
        pd.crosstab(bb_for["source_rating"], bb_for["target_rating"])
        .reindex(index=rating_levels, columns=rating_levels, fill_value=0)
    )

    mat_np = np.array(mat)
    total = mat_np.sum()
    if total == 0:
        # No rated edges in this field: both rates are undefined.
        e = diag = float("nan")
    else:
        e = np.triu(mat_np, k=1).sum() / total
        diag = np.diag(mat_np).sum() / total

    eval_disp_in_diffs.append((for_code, e, diag))

In [None]:
# Tabulate the per-FoR results and show the unweighted mean of each rate across fields.
rate_columns = ["misaligned_rate", "diag_rate"]
eval_disp_in_diffs = pd.DataFrame(eval_disp_in_diffs, columns=["FoR"] + rate_columns)
tuple(eval_disp_in_diffs[rate_col].mean() for rate_col in rate_columns)

In [None]:
# Put both methods side by side per FoR code; the suffixes say which method each rate came from.
eval_agg = eval_disparity.merge(
    eval_disp_in_diffs,
    on="FoR",
    suffixes=("_disp", "_disp_in_diffs"),
)

# Only the misaligned rates are rounded; the diagonal rates keep full precision.
for rate_col in ("misaligned_rate_disp", "misaligned_rate_disp_in_diffs"):
    eval_agg[rate_col] = eval_agg[rate_col].round(3)

# Attach the field description for each FoR code and persist the table.
eval_agg["desc"] = eval_agg["FoR"].map(for_to_desc)
eval_agg.to_csv("./outputs/abdc_eval_agg.csv", index=False)
eval_agg