In [190]:
from os.path import join
import os
import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save as alt_save

# Turn off Altair's max-rows guard so the charts below can be built directly from
# the full DataFrames (NOTE(review): frame sizes are not checked in this notebook).
# The trailing semicolon suppresses the call's return value in the cell output.
alt.data_transformers.disable_max_rows();

In [191]:
# Directory layout (relative to the working directory): query results are read
# from data/queries and rendered figures are written to data/plots.
data_root = "data"
queries_dir = os.path.join(data_root, "queries")
plots_dir = os.path.join(data_root, "plots")

In [192]:
# Make sure the plot output directory exists before any chart is saved;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(plots_dir, exist_ok=True)

In [193]:
# Load the query result; the first CSV column is used as the row index.
l1_papers_csv = join(queries_dir, "q06_get_l1_papers_fields_v2_with_cp.csv")
l1_df = pd.read_csv(l1_papers_csv, index_col=0)

In [194]:
# Inspect the distinct values of the `source` column; the next cell splits on them.
l1_df["source"].unique()

array(['s2-fos-model', 'external'], dtype=object)

In [195]:
# Partition the rows by the value of their `source` column.
source_col = l1_df["source"]
external_l1_df = l1_df.loc[source_col == "external"]
internal_l1_df = l1_df.loc[source_col == "s2-fos-model"]

In [196]:
# Preview the first rows of the "s2-fos-model" subset to check the column layout.
internal_l1_df.head()

Unnamed: 0,field,source,year,citation_count,title,venue,corpus_id,doi,cited_corpus_id,method_acronym,cp_in,cp_ex
0,Computer Science,s2-fos-model,2015,0,An enhanced dimensionality reduction for multi...,"International Conference on Communication, Com...",10422622,10.1109/CCOMS.2015.7562894,5987139,LLE,36.22651,0.0
3,Computer Science,s2-fos-model,2015,48,PSF: A Unified Patient Similarity Evaluation F...,IEEE journal of biomedical and health informatics,17767462,10.1109/JBHI.2015.2425365,5987139,LLE,94.88113,94.758077
5,Medicine,s2-fos-model,2015,48,PSF: A Unified Patient Similarity Evaluation F...,IEEE journal of biomedical and health informatics,17767462,10.1109/JBHI.2015.2425365,5987139,LLE,93.2935,93.099363
7,Computer Science,s2-fos-model,2017,0,Online visual tracking with high-order pooling,IEEE International Conference on Multimedia an...,37501101,10.1109/ICME.2017.8019349,1089627,I-PCA,28.198149,0.0
8,Biology,s2-fos-model,2013,154,Differences in Adaptation Rates after Virtual ...,Journal of Neuroscience,17936211,10.1523/JNEUROSCI.0122-13.2013,4428232,NMF,97.580366,97.553337


In [197]:
# Heatmap over the external-source rows: one cell per (field, method) pair,
# coloured by the number of rows in that cell on a log scale.
heatmap_title = {
    "text": "Count of papers citing dimensionality reduction methods",
    "subtitle": "2013 - 2023",
    "fontSize": 16,
    "fontWeight": 500,
    "subtitleColor": "black",
    "subtitleFontSize": 12,
}
heatmap_x = alt.X("field:N", axis=alt.Axis(title="Field"))
heatmap_y = alt.Y("method_acronym:N", axis=alt.Axis(title="Method"))
heatmap_color = alt.Color("count()", scale=alt.Scale(type="log"))

plot = (
    alt.Chart(external_l1_df)
    .mark_rect()
    .encode(x=heatmap_x, y=heatmap_y, color=heatmap_color)
    .properties(title=heatmap_title, width=400, height=800)
)

# Write the same chart to disk twice: once as PNG, once as SVG.
for heatmap_filename in ("00_heatmap.png", "00_heatmap.svg"):
    plot.save(join(plots_dir, heatmap_filename))

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [198]:
# Number of external-source rows per method acronym, as a one-column ("count") frame
# indexed by the acronym.
rows_per_method = external_l1_df.groupby(by=["method_acronym"]).size()
method_df = rows_per_method.to_frame(name="count")
method_df

Unnamed: 0_level_0,count
method_acronym,Unnamed: 1_level_1
AE,2070
CCA,101
CHL,56
CLM,10
CuCA,320
...,...
T-SVD,139
TF,2543
UMAP,5131
VQ,16


In [199]:
# Rank the methods by row count. After sorting in descending order the row labels
# are renumbered from 0 and then materialised as an explicit "index" column, so
# "index" holds each method's rank; the acronym becomes the row label again.
methods_by_count = method_df.reset_index().sort_values(by="count", ascending=False)
ranked_methods = methods_by_count.reset_index(drop=True).reset_index()
sorted_method_df = ranked_methods.set_index("method_acronym")
sorted_method_df

Unnamed: 0_level_0,index,count
method_acronym,Unnamed: 1_level_1,Unnamed: 2_level_1
T-SNE,0,29930
LLE,1,10935
NMF,2,10169
ISO,3,9376
PCA,4,7919
...,...,...
KLP,71,14
CLM,72,10
RBF-MP,73,10
NL-ICA,74,4


In [200]:
# Number of external-source rows per field, as a one-column ("count") frame
# indexed by the field name.
rows_per_field = external_l1_df.groupby(by=["field"]).size()
field_df = rows_per_field.to_frame(name="count")
field_df

Unnamed: 0_level_0,count
field,Unnamed: 1_level_1
Art,45
Biology,4430
Business,330
Chemistry,859
Computer Science,74494
Economics,446
Engineering,3987
Environmental Science,672
Geography,680
Geology,611


In [201]:
# Number of external-source rows per (field, method) pair, flattened to the
# columns field / method_acronym / count.
rows_per_field_method = external_l1_df.groupby(by=["field", "method_acronym"]).size()
field_method_df = rows_per_field_method.reset_index(name="count")
field_method_df

Unnamed: 0,field,method_acronym,count
0,Art,AE,1
1,Art,DM,1
2,Art,FA,1
3,Art,H-LLE,1
4,Art,ISO,3
...,...,...,...
802,Sociology,SMA,1
803,Sociology,SNE,2
804,Sociology,T-SNE,35
805,Sociology,TF,2


In [202]:
# Add two derived columns to field_method_df:
#   fraction_in_field - this (field, method) row count divided by the total row
#                       count of its field (taken from field_df).
#   method_or_other   - the method acronym, replaced by "Other" when the method
#                       accounts for 10% or less of its field.
# Both are computed column-wise instead of with a Python lambda per row; this is
# faster and also works on an empty frame. Every field in field_method_df is
# expected to be present in field_df, since both are grouped from external_l1_df.
field_totals = field_method_df["field"].map(field_df["count"])
field_method_df["fraction_in_field"] = field_method_df["count"] / field_totals
field_method_df["method_or_other"] = field_method_df["method_acronym"].mask(
    field_method_df["fraction_in_field"] <= 0.1, "Other"
)

In [203]:
field_method_df

Unnamed: 0,field,method_acronym,count,fraction_in_field,method_or_other
0,Art,AE,1,0.022222,Other
1,Art,DM,1,0.022222,Other
2,Art,FA,1,0.022222,Other
3,Art,H-LLE,1,0.022222,Other
4,Art,ISO,3,0.066667,Other
...,...,...,...,...,...
802,Sociology,SMA,1,0.005917,Other
803,Sociology,SNE,2,0.011834,Other
804,Sociology,T-SNE,35,0.207101,T-SNE
805,Sociology,TF,2,0.011834,Other


In [204]:
# Bar chart of each method's share within its field, one bar per field, coloured
# by the individual method acronym. The y axis is pinned to [0, 1].
# The title now says "Fraction" (it previously said "Count") so that it matches the
# plotted quantity, fraction_in_field.
# NOTE: this cell needs the `method_acronym` column, which the following cell drops
# when it reassigns field_method_df; run it before that cell.
alt.Chart(field_method_df).mark_bar().encode(
    x=alt.X("field:N", axis=alt.Axis(title="Field")),
    y=alt.Y('fraction_in_field:Q', axis=alt.Axis(title="Proportion"), scale=alt.Scale(domain=[0.0, 1.0])),
    color=alt.Color("method_acronym:N", legend=alt.Legend(title="Method"))
).properties(
    title={
        "text": "Fraction of papers citing dimensionality reduction methods",
        "subtitle": "2013 - 2023",
        "fontSize": 16,
        "fontWeight": 500,
        "subtitleColor": "black",
        "subtitleFontSize": 12
    },
    width=400,
    height=400
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [205]:
# Collapse the per-method rows into one row per (field, method_or_other) label,
# summing both the counts and the fractions. This rebinds field_method_df, so the
# `method_acronym` column is no longer available afterwards.
grouping_keys = ["field", "method_or_other"]
summed_columns = ["count", "fraction_in_field"]
field_method_df = (
    field_method_df[grouping_keys + summed_columns]
    .groupby(by=grouping_keys)
    .sum()
    .reset_index()
)

In [228]:
def _rank_of_label(label):
    """Return the rank stored in sorted_method_df for a label; "Other" ranks after every method."""
    if label == "Other":
        return sorted_method_df.shape[0]
    return sorted_method_df.at[label, "index"]


# Attach each label's rank, then order the rows by it (lowest rank first).
field_method_df["method_or_other_index"] = field_method_df["method_or_other"].apply(_rank_of_label)
field_method_df = field_method_df.sort_values(by="method_or_other_index", ascending=True)

In [229]:
# Distinct labels that remain after grouping, ordered by their rank in
# sorted_method_df; "Other" is given a rank one past the last method so it sorts last.
other_position = sorted_method_df.shape[0]
method_ordering = field_method_df["method_or_other"].unique().tolist()
method_ordering.sort(
    key=lambda label: other_position if label == "Other" else sorted_method_df.at[label, "index"]
)
method_ordering

['T-SNE', 'LLE', 'NMF', 'ISO', 'PCA', 'LDA', 'UMAP', 'N-MDS', 'FA', 'Other']

In [233]:
# Stacked bar chart of row counts per field, split by the method / "Other" label.
# The colour domain follows method_ordering, and the bar segments are ordered by
# method_or_other_index (descending).
count_chart_title = {
    "text": "Count of papers citing dimensionality reduction methods",
    "subtitle": "2013 - 2023",
    "fontSize": 16,
    "fontWeight": 500,
    "subtitleColor": "black",
    "subtitleFontSize": 12,
}
count_chart_color = alt.Color(
    "method_or_other:N",
    scale=alt.Scale(domain=method_ordering),
    legend=alt.Legend(title="Method"),
)
# Sort the segments of the bars by this field
count_chart_order = alt.Order("method_or_other_index", sort="descending")

(
    alt.Chart(field_method_df)
    .mark_bar()
    .encode(
        x=alt.X("field:N", axis=alt.Axis(title="Field")),
        y=alt.Y("count:Q", axis=alt.Axis(title="Count")),
        color=count_chart_color,
        order=count_chart_order,
    )
    .properties(title=count_chart_title, width=400, height=500)
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [236]:
# Stacked bar chart of each label's share within its field (fraction_in_field),
# split by the method / "Other" label. The colour domain follows method_ordering,
# and the bar segments are ordered by method_or_other_index (descending).
# The y axis is titled "Proportion" (it previously said "Count") because the plotted
# quantity is a fraction, not a count.
alt.Chart(field_method_df).mark_bar().encode(
    x=alt.X("field:N", axis=alt.Axis(title="Field")),
    y=alt.Y('fraction_in_field:Q', axis=alt.Axis(title="Proportion")),
    color=alt.Color("method_or_other:N", scale=alt.Scale(domain=method_ordering), legend=alt.Legend(title="Method")),
    order=alt.Order(
      # Sort the segments of the bars by this field
      'method_or_other_index',
      sort='descending'
    )
).properties(
    title={
        "text": "Fraction of papers citing dimensionality reduction methods",
        "subtitle": "2013 - 2023",
        "fontSize": 16,
        "fontWeight": 500,
        "subtitleColor": "black",
        "subtitleFontSize": 12
    },
    width=400,
    height=400
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# TODO: add versions of the plots above that exclude the CS, Math, and Eng fields

In [None]:
# TODO: add row for total number of citing papers in the field
# TODO: add row for total number of papers in the field in the database