In [48]:
import pandas as pd
import biblib.bib
import re
import graphviz
import os
import sys
import math

In [12]:
def build_cite_key(authors, year, title):
    """Build a normalised cite key of the form ``<first author surname>_<year>_<title>``.

    Parameters
    ----------
    authors : str
        BibTeX-style author field. Three layouts are recognised:
        a single author written ``Name(s) Surname`` (no comma and no
        standalone word ``and``), ``Surname(s), Name(s) ...`` and
        ``Name(s) Surname and ...``.
    year : str or int
        Publication year; interpolated into the key unchanged.
    title : str
        Paper title; lower-cased, spaces and ``~`` turned into underscores and
        punctuation stripped.

    Returns
    -------
    str
        The cite key, with ``-`` replaced by ``_`` and the characters in
        ``invalid`` removed.

    Raises
    ------
    AttributeError
        If the author field matches none of the three layouts (for example a
        comma-separated list whose first surname contains characters outside
        ``a-z``), because the final regex search then returns ``None``.
    """
    # Characters stripped from the authors, the title and the final key.
    # The backslash is escaped explicitly; the value is identical to the
    # previous literal, which relied on the invalid escape sequence "\|".
    invalid = '<>:"/\\|?*{}()\''

    # Drop backslash commands (a backslash plus one character, optionally
    # followed by a space), then normalise the remaining punctuation.
    authors = re.sub(r'\\. ', '', authors)
    authors = re.sub(r'\\.', '', authors)
    authors = authors.replace('ı', 'i')
    authors = authors.replace('.', '')
    authors = authors.replace('-', ' ')
    for char in invalid: authors = authors.replace(char, '')

    title_invalid = '!.,$&#`+;’‐–‘^'
    title_proc = title.lower().replace(' ', '_')
    title_proc = title_proc.replace('~','_')
    title_proc = title_proc.replace('ı', 'i')
    title_proc = title_proc.replace('ó', 'o')
    for char in invalid: title_proc = title_proc.replace(char, '')
    for char in title_invalid: title_proc = title_proc.replace(char, '')
    # NOTE(review): '&' and ';' were already stripped above, so these two
    # replacements can never match. They are kept as-is because reordering
    # them would change keys that may already exist on disk.
    title_proc = title_proc.replace('&amp;', '')
    title_proc = title_proc.replace('amp;', '')

    def_authors_regex = re.compile(r'^[a-z]+( [a-z]+)*,')
    def_authors_match = re.match(def_authors_regex, authors.lower())

    # Look for the separator as a whole word. A plain substring test treats
    # names such as "Alexander" or "Sandy" as containing "and" and sends a
    # single author to the multi-author branch, which then fails.
    has_and_separator = re.search(r'\band\b', authors) is not None

    if((not has_and_separator) and (',' not in authors)): # SINGLE AUTHOR
        author_proc = authors.split().pop(-1).lower()
        cite_key = f"{author_proc}_{year}_{title_proc}"
    elif(bool(def_authors_match)): # SURNAME(S), NAME(S)
        author_proc = def_authors_match[0].replace(',','').replace(' ', '_')
        cite_key = f"{author_proc}_{year}_{title_proc}"
    else: # NAME(S) SURNAME(S) AND ...
        # The word directly before the first " and" is the first author's surname.
        ns_authors_regex = re.compile(r'[a-z]+ \band\b')
        author_proc = ns_authors_regex.search(authors.lower()).group().replace(' and', '')
        cite_key = f"{author_proc}_{year}_{title_proc}"

    cite_key = cite_key.replace('-', '_')
    for char in invalid: cite_key = cite_key.replace(char, '')

    return cite_key

In [13]:
def get_glossary():
    """Parse ``Glossary.bib`` from the working directory and return its entries.

    Parser messages are written to stderr. The return value is whatever
    ``get_entries()`` yields; elsewhere in this notebook it is indexed by cite
    key and then by field name (``glossary[key]['title']``).
    """
    bib_parser = biblib.bib.Parser()
    with open('Glossary.bib', 'r') as bib_file:
        return bib_parser.parse(bib_file, log_fp=sys.stderr).get_entries()

In [14]:
def standardise_keys(bs_fs, venues):
    """Rewrite the entry keys of every ``.bib`` file to the ``build_cite_key`` format.

    Scans ``../References/Snowballing/<bs_fs>/<group>/<venue>`` for each group
    and venue in ``venues``, prints a header per venue and the original key of
    every entry, and rewrites each file in place.

    Parameters
    ----------
    bs_fs : str
        Name of the snowballing folder below ``../References/Snowballing``.
    venues : dict
        Maps a group folder name to a list of venue folder names.

    Notes
    -----
    Only the key in an entry header (``@type{key,``) is replaced. A plain
    ``'{' + key`` text replacement would also rewrite longer keys sharing the
    same prefix (``{smith2017`` inside ``{smith2017a``) and any braced field
    value that happens to start with the key.
    """
    for key in venues:
        for venue in venues[key]:
            print(f'---------{venue}---------')

            with os.scandir(f'../References/Snowballing/{bs_fs}/{key}/{venue}') as it:
                for entry in it:
                    if not (entry.name.endswith(".bib") and entry.is_file()):
                        continue

                    with open(entry, 'r') as fp:
                        db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

                    # Read once, apply every key replacement in memory, write once.
                    with open(entry) as r:
                        original_text = r.read()
                    text = original_text

                    for ent in db.values():
                        print(ent.key)
                        cite_key = build_cite_key(ent['author'], ent['year'], ent['title'])

                        # Match the key only where it directly follows "@type{"
                        # and is terminated by the comma that ends the header.
                        header_key = re.compile(
                            r'(@\s*\w+\s*\{\s*)' + re.escape(ent.key) + r'(?=\s*,)'
                        )
                        # A function replacement keeps backslashes in the new
                        # key from being read as group references.
                        text = header_key.sub(lambda m, new_key=cite_key: m.group(1) + new_key, text)

                    if text != original_text:
                        with open(entry, "w") as w:
                            w.write(text)

In [15]:
# Venue folders to scan, grouped by publication type.
# standardise_keys and the graph builders use the group name and each venue
# name as nested directory names (<root>/<group>/<venue>), so the strings must
# match the folder names on disk exactly.
# NOTE(review): 'Journal of Infometrics' looks like a misspelling of
# 'Journal of Informetrics' — confirm against the actual folder name before changing it.
venues = {
    'Conferences': ['ACL','CIKM','COLING','EACL','ECIR','ECML PKDD','EMNLP','KDD','NAACL','SIGIR'],
    'Journals': ['Decision Support Systems', 'Expert Systems with Applications', 'Information Sciences', 'Journal of Infometrics', 'Knowledge-Based Systems', 'Pattern Recognition']
}

In [17]:
# Rewrite the entry keys of every .bib file under both snowballing folders.
# This modifies the files on disk in place; the printed keys are the ones read
# from each file before it is rewritten.
standardise_keys('Backward snowballing (Outgoing references)', venues)
standardise_keys('Forward snowballing (Incoming citations)', venues)

---------ACL---------
boyd_graber_2017_applications_of_topic_models
srivastava_2017_autoencoding_variational_inference_for_topic_models
lau_2017_topically_driven_neural_language_model
he_2017_efficient_correlated_topic_modeling_with_topic_embedding
benton_2018_deep_dirichlet_multinomial_regression
zhou_2016_nonparametric_bayesian_negative_binomial_factor_analysis
srivastava_2017_autoencoding_variational_inference_for_topic_models
zhao_2019_variational_autoencoders_for_sparse_and_overdispersed_discrete_data
nan_2019_topic_modeling_with_wasserstein_autoencoders
naesseth_2017_reparameterization_gradients_through_acceptance_rejection_sampling_algorithms
miao_2017_discovering_discrete_latent_topics_with_neural_variational_inference
gupta_2019_document_informed_neural_autoregressive_topic_models_with_distributional_prior
choi_2018_topicontiles_tile_based_spatio_temporal_event_analytics_via_exclusive_topic_modeling_on_social_media
giorgi_2018_the_remarkable_benefit_of_user_level_aggregation_f

In [34]:
# Cite keys of the papers selected from backward snowballing.
# Passed to build_bs_graph as `snowb_selection` and compared against the output
# of build_cite_key, so each string must match a generated key exactly.
bs_selection = [
    'xiang_2017_a_comparative_analysis_of_major_online_review_platforms_implications_for_social_media_analytics_in_hospitality_and_tourism',
    'liu_2017_an_investigation_of_brand_related_user_generated_content_on_twitter',
    'kim_2019_an_ontology_based_labeling_of_influential_topics_using_topic_network_analysis',
    'huang_2018_analyst_information_discovery_and_interpretation_roles_a_topic_modeling_approach',
    'maier_2018_applying_lda_topic_modeling_in_communication_research_toward_a_valid_and_reliable_methodology',
    'karami_2018_characterizing_diabetes_diet_exercise_and_obesity_comments_on_twitter',
    'aletras_2017_evaluating_topic_representations_for_exploring_document_collections',
    'smith_2017_evaluating_visual_representations_for_topic_understanding_and_their_effects_on_manually_generated_topic_labels',
    'savin_2021_free_associations_of_citizens_and_scientists_with_economic_and_green_growth_a_computational_linguistics_analysis',
    'syed_2017_full_text_or_abstract_examining_topic_coherence_scores_using_latent_dirichlet_allocation',
    'li_2020_global_surveillance_of_covid_19_by_mining_news_media_using_a_multi_source_dynamic_embedded_topic_model',
    'alp_2018_identifying_topical_influencers_on_twitter_based_on_user_behavior_and_network_topology',
    'morstatter_2017_in_search_of_coherence_and_consensus_measuring_the_interpretability_of_statistical_topics',
    'light_2017_managing_the_boundaries_of_taste_culture_valuation_and_computational_social_science',
    'clare_2019_modelling_research_topic_trends_in_community_forestry',
    'nerghes_2019_narratives_of_the_refugee_crisis_a_comparative_study_of_mainstream_media_and_twitter',
    'campos_2020_recommendation_system_for_knowledge_acquisition_in_moocs_ecosystems',
    'zhang_2017_scientific_evolutionary_pathways_identifying_and_visualizing_relationships_for_scientific_topics',
    'bagozzi_2018_the_politics_of_scrutiny_in_human_rights_monitoring_evidence_from_structural_topic_models_of_us_state_department_human_rights_reports',
    'dahal_2019_topic_modeling_and_sentiment_analysis_of_global_climate_change_tweets',
    'grajzl_2019_toward_understanding_17th_century_english_culture_a_structural_topic_model_of_francis_bacontextquotesingles_ideas',
    'hoang_2019_towards_autoencoding_variational_inference_for_aspect_based_opinion_summary',
    'herzog_2018_transfer_topic_labeling_with_domain_specific_knowledge_base_an_analysis_of_uk_house_of_commons_speeches_1935_2014',
    'karami_2019_twitter_speaks_a_case_of_national_disaster_situational_awareness',
    'gourru_2018_united_we_stand_using_multiple_strategies_for_topic_labeling',
    'kuhn_2018_using_structural_topic_modeling_to_identify_latent_topics_and_trends_in_aviation_incident_reports',
    'chen_2020_what_are_moocs_learners_concerns_text_analysis_of_reviews_for_computer_science_courses'
]

# Cite keys of the papers selected from forward snowballing.
# Passed to build_fs_graph as `snowb_selection` and compared against the output
# of build_cite_key, so each string must match a generated key exactly.
fs_selection = [
    'stamolampros_2019_job_satisfaction_and_employee_turnover_determinants_in_high_contact_services_insights_from_employeesonline_reviews',
    'chen_2020_detecting_latent_topics_and_trends_in_educational_technologies_over_four_decades_using_structural_topic_modeling_a_retrospective_of_all_volumes_of_computers__education',
    'stamolampros_2020_harnessing_the_wisdom_of_employees_from_online_reviews',
    'amat_lefort_2022_quality_40_big_data_analytics_to_explore_service_quality_attributes_and_their_relation_to_user_sentiment_in_airbnb_reviews',
    'ding_2020_employing_structural_topic_modelling_to_explore_perceived_service_quality_attributes_in_airbnb_accommodation',
    'yang_2021_revealing_industry_challenge_and_business_response_to_covid_19_a_text_mining_approach',
    'hagen_2019_open_data_visualizations_and_analytics_as_tools_for_policy_making',
    'monselise_2021_topics_and_sentiments_of_public_concerns_regarding_covid_19_vaccines_social_media_trend_analysis',
    'aman_2021_listen_to_e_scooter_riders_mining_rider_satisfaction_factors_from_app_store_reviews',
    'luo_2020_topic_modelling_for_theme_park_online_reviews_analysis_of_disneyland',
    'meena_2022_online_food_delivery_companies_performance_and_consumers_expectations_during_covid_19_an_investigation_using_machine_learning_approach',
    'singh_2022_modeling_the_public_attitude_towards_organic_foods_a_big_data_and_text_mining_approach',
    'wang_2020_a_topic_based_patent_analytics_approach_for_exploring_technological_trends_in_smart_manufacturing',
    'he_2021_automatic_topic_labeling_using_graph_based_pre_trained_neural_embedding',
    'chen_2020_a_structural_topic_modeling_based_bibliometric_study_of_sentiment_analysis_literature',
    'chen_2022_a_decade_of_sentic_computing_topic_modeling_and_bibliometric_analysis',
    'yoo_2023_exploring_the_nexus_between_food_and_vegn_lifestyle_via_text_mining_based_online_community_analytics',
    'chung_2022_understanding_music_streaming_services_via_text_mining_of_online_customer_reviews',
    'jebari_2021_the_use_of_citation_context_to_detect_the_evolution_of_research_topics_a_large_scale_analysis',
    'huang_2022_identification_of_topic_evolution_network_analytics_with_piecewise_linear_representation_and_word_embedding',
    'zhang_2021_topic_evolution_disruption_and_resilience_in_early_covid_19_research',
    'ebadi_2021_understanding_the_temporal_evolution_of_covid_19_research_through_machine_learning_and_natural_language_processing',
    'barrera_barrera_2022_selecting_the_appropriate_leading_journal_in_hospitality_and_tourism_research_a_guide_based_on_the_topic_journal_fit_and_the_jcr_impact_factor',
    'truica_2021_tlatr_automatic_topic_labeling_using_automatic_domain_specific_term_recognition',
    'he_2019_automatic_labeling_of_topic_models_using_graph_based_ranking',
    'rosati_2022_moving_beyond_word_lists_towards_abstractive_topic_labels_for_human_like_topics_of_scientific_documents',
    'he_2021_automatic_topic_labeling_model_with_paired_attention_based_on_pre_trained_deep_neural_network',
    'ojo_2021_what_matters_most_to_patients_on_the_core_determinants_of_patient_experience_from_free_text_feedback',
    'scelsi_2021_principled_analysis_of_energy_discourse_across_domains_with_thesaurus_based_automatic_topic_labeling',
    'symitsi_2020_the_informational_value_of_employee_online_reviews',
    'chin_2019_ondemand_recent_personal_tweets_summarization_on_mobile_devices'
]

# Matches, anchored at the start of a cite key, one or more lowercase-letter
# words each followed by '_' and then a run of digits (e.g. 'card_2018').
# The graph cells use the match as the node label and take its last four
# characters as the publication year.
cite_key_trim_regex = re.compile('(^([a-z]+_)+[0-9]+)')

In [82]:
# Initial selection + all BS papers after query
# Papers from initial selection are colored skyblue
# Selected papers from BS are colored mistyrose
# Node size increases based on nr of incoming edges
def build_bs_graph(path, venues, snowb_selection, print_rank = False, min_incoming_edges = 1, selection_only = False):
    """Print a DOT digraph of the initial selection and the papers it references.

    Every ``.bib`` file under ``<path>/<group>/<venue>`` is treated as the
    reference list of one source paper, whose cite key is the file name without
    ``.bib``. An edge ``source -> reference`` is printed for each reference
    that passes the filters.

    Parameters
    ----------
    path : str
        Root folder of the backward-snowballing ``.bib`` files.
    venues : dict
        Maps a group folder name to a list of venue folder names.
    snowb_selection : iterable of str
        Cite keys of the selected snowballing papers; these nodes are filled
        with ``mistyrose``.
    print_rank : bool
        When true, also print one ``rank = same`` group per publication year.
    min_incoming_edges : int
        A referenced paper is only drawn when at least this many references
        point to it.
    selection_only : bool
        When true, references not in ``snowb_selection`` are ignored entirely.

    Notes
    -----
    The graph is written to stdout; nothing is returned. Each node style line
    and each rank member is printed once, and rank groups only name nodes that
    were actually drawn. Naming a filtered-out paper in a rank group would make
    DOT create it as an isolated node.
    """
    selected = set(snowb_selection)

    def short_label(cite_key):
        # '<authors>_<year>' prefix of a cite key, e.g. 'card_2018'.
        return cite_key_trim_regex.search(cite_key).group()

    # Parse every .bib file once and keep (source key, eligible reference keys),
    # so both passes below see the same files in the same order.
    references_by_source = []
    for key in venues:
        for venue in venues[key]:
            with os.scandir(f'{path}/{key}/{venue}') as it:
                for entry in it:
                    if entry.name.endswith(".bib") and entry.is_file():
                        with open(entry, 'r') as fp:
                            db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

                        cited = [build_cite_key(ent['author'], ent['year'], ent['title']) for ent in db.values()]
                        if selection_only:
                            cited = [cite_key for cite_key in cited if cite_key in selected]
                        references_by_source.append((entry.name.replace('.bib',''), cited))

    # Count incoming edges for referenced papers and group every paper by year.
    # The inner dicts are used as insertion-ordered sets to avoid duplicates.
    incoming_edges = {}
    year_groups = {}
    for source, cited in references_by_source:
        if source not in incoming_edges:
            incoming_edges[source] = 0
            year_groups.setdefault(short_label(source)[-4:], {})[source] = None
        for cite_key in cited:
            incoming_edges[cite_key] = incoming_edges.get(cite_key, 0) + 1
            year_groups.setdefault(short_label(cite_key)[-4:], {})[cite_key] = None

    snowb_style = {}  # cite key -> node style line, one per referenced paper
    emitted = set()   # every node that appears in the printed graph

    print('digraph G {')
    for source, cited in references_by_source:
        targets = [cite_key for cite_key in cited if incoming_edges[cite_key] >= min_incoming_edges]
        if not targets:
            # Avoid generating initial selection nodes with no outgoing edges
            continue

        print(f'\t{source} [label={short_label(source)} color=skyblue style=filled fixedsize=true width={2} height={2}]')
        print(f'\t{source} -> {{' + ''.join(f'{cite_key}; ' for cite_key in targets) + '}')
        emitted.add(source)

        for cite_key in targets:
            emitted.add(cite_key)
            if cite_key in snowb_style:
                continue

            # Node size grows logarithmically with the number of incoming edges.
            size_increase = 1 if incoming_edges[cite_key] == 0 else math.log(incoming_edges[cite_key]) + 1
            if cite_key in selected:
                snowb_style[cite_key] = f'\t{cite_key} [label={short_label(cite_key)} color=mistyrose style=filled fixedsize=true width={2 * size_increase} height={2 * size_increase}]'
            else:
                snowb_style[cite_key] = f'\t{cite_key} [label={short_label(cite_key)} fixedsize=true width={2 * size_increase} height={2 * size_increase}]'

    # Color BS papers
    for snowb_paper_style in snowb_style.values(): print(snowb_paper_style)

    if(print_rank):
        # Sort by publication year
        for year in sorted(year_groups):
            members = [paper for paper in year_groups[year] if paper in emitted]
            if members:
                print('\t{ rank = same; ' + ''.join(f'{paper}; ' for paper in members) + '}')

    print('}')

In [88]:
# Initial selection + all FS papers after query
# Papers from initial selection are colored skyblue
# Selected papers from FS are colored lightgoldenrodyellow
# Node size increases based on nr of incoming edges
def build_fs_graph(path, venues, snowb_selection, print_rank = False, min_outgoing_edges = 1, selection_only = False):
    """Print a DOT digraph of the initial selection and the papers citing it.

    Every ``.bib`` file under ``<path>/<group>/<venue>`` is treated as the list
    of papers citing one source paper, whose cite key is the file name without
    ``.bib``. For each source with at least one eligible citing paper, a line
    ``{citing; ...} -> source`` is printed.

    Parameters
    ----------
    path : str
        Root folder of the forward-snowballing ``.bib`` files.
    venues : dict
        Maps a group folder name to a list of venue folder names.
    snowb_selection : iterable of str
        Cite keys of the selected snowballing papers; these nodes are filled
        with ``lightgoldenrodyellow``.
    print_rank : bool
        When true, also print one ``rank = same`` group per publication year.
    min_outgoing_edges : int
        A citing paper is only drawn when it cites at least this many sources.
    selection_only : bool
        When true, citing papers not in ``snowb_selection`` are ignored entirely.

    Notes
    -----
    The graph is written to stdout; nothing is returned. Each node style line
    and each rank member is printed once, and rank groups only name nodes that
    were actually drawn. Naming a filtered-out paper in a rank group would make
    DOT create it as an isolated node.
    """
    selected = set(snowb_selection)

    def short_label(cite_key):
        # '<authors>_<year>' prefix of a cite key, e.g. 'card_2018'.
        return cite_key_trim_regex.search(cite_key).group()

    # Parse every .bib file once and keep (source key, eligible citing keys),
    # so both passes below see the same files in the same order.
    citing_by_source = []
    for key in venues:
        for venue in venues[key]:
            with os.scandir(f'{path}/{key}/{venue}') as it:
                for entry in it:
                    if entry.name.endswith(".bib") and entry.is_file():
                        with open(entry, 'r') as fp:
                            db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

                        citing = [build_cite_key(ent['author'], ent['year'], ent['title']) for ent in db.values()]
                        if selection_only:
                            citing = [cite_key for cite_key in citing if cite_key in selected]
                        citing_by_source.append((entry.name.replace('.bib',''), citing))

    # Count incoming edges for initial selection, outgoing edges for citing
    # papers, and group every paper by year. The inner dicts are used as
    # insertion-ordered sets to avoid duplicates.
    incoming_edges = {}
    outgoing_edges = {}
    year_groups = {}
    for source, citing in citing_by_source:
        if source not in incoming_edges:
            incoming_edges[source] = 0
            year_groups.setdefault(short_label(source)[-4:], {})[source] = None
        for cite_key in citing:
            incoming_edges[source] += 1
            outgoing_edges[cite_key] = outgoing_edges.get(cite_key, 0) + 1
            year_groups.setdefault(short_label(cite_key)[-4:], {})[cite_key] = None

    snowb_style = {}  # cite key -> node style line, one per citing paper
    emitted = set()   # every node that appears in the printed graph

    print('digraph G {')
    for source, citing in citing_by_source:
        if incoming_edges[source] == 0:
            # Avoid generating initial selection nodes with no incoming edges
            continue

        # Node size grows logarithmically with the number of incoming edges.
        size_increase = math.log(incoming_edges[source]) + 1
        print(f'\t{source} [label={short_label(source)} color=skyblue style=filled fixedsize=true width={2 * size_increase} height={2 * size_increase}]')
        emitted.add(source)

        print(f'\t{{', end='')
        for cite_key in citing:
            if(outgoing_edges[cite_key] >= min_outgoing_edges):
                if cite_key not in snowb_style:
                    if cite_key in selected:
                        snowb_style[cite_key] = f'\t{cite_key} [label={short_label(cite_key)} color=lightgoldenrodyellow style=filled fixedsize=true width={2} height={2}]'
                    else:
                        snowb_style[cite_key] = f'\t{cite_key} [label={short_label(cite_key)} fixedsize=true width={2} height={2}]'

                emitted.add(cite_key)
                print(f'{cite_key}; ', end='')
        print(f'}} -> {source}')

    # Color FS papers
    for snowb_paper_style in snowb_style.values(): print(snowb_paper_style)

    if(print_rank):
        # Sort by publication year
        for year in sorted(year_groups):
            members = [paper for paper in year_groups[year] if paper in emitted]
            if members:
                print('\t{ rank = same; ' + ''.join(f'{paper}; ' for paper in members) + '}')

    print('}')

In [87]:
# Print the backward-snowballing graph as DOT text: every referenced paper is
# included (selection_only=False) as long as it has at least one incoming edge.
build_bs_graph('../References/Snowballing/Backward snowballing (Outgoing references)', venues, bs_selection, min_incoming_edges=1, selection_only=False)

digraph G {
	card_2018_neural_models_for_documents_with_metadata [label=card_2018 color=skyblue style=filled fixedsize=true width=2 height=2]
	card_2018_neural_models_for_documents_with_metadata -> {boyd_graber_2017_applications_of_topic_models; srivastava_2017_autoencoding_variational_inference_for_topic_models; lau_2017_topically_driven_neural_language_model; he_2017_efficient_correlated_topic_modeling_with_topic_embedding; benton_2018_deep_dirichlet_multinomial_regression; }
	wu_2020_neural_mixed_counting_models_for_dispersed_topic_discovery [label=wu_2020 color=skyblue style=filled fixedsize=true width=2 height=2]
	wu_2020_neural_mixed_counting_models_for_dispersed_topic_discovery -> {zhou_2016_nonparametric_bayesian_negative_binomial_factor_analysis; srivastava_2017_autoencoding_variational_inference_for_topic_models; zhao_2019_variational_autoencoders_for_sparse_and_overdispersed_discrete_data; nan_2019_topic_modeling_with_wasserstein_autoencoders; naesseth_2017_reparameterizatio

In [91]:
# Print the forward-snowballing graph as DOT text: every citing paper is
# included (selection_only=False) as long as it has at least one outgoing edge.
build_fs_graph('../References/Snowballing/Forward snowballing (Incoming citations)', venues, fs_selection, min_outgoing_edges=1, selection_only=False)

digraph G {
	card_2018_neural_models_for_documents_with_metadata [label=card_2018 color=skyblue style=filled fixedsize=true width=10.0507033814703 height=10.0507033814703]
	{hoyle_2021_is_automated_topic_model_evaluation_broken_the_incoherence_of_coherence; nguyen_2021_contrastive_learning_for_neural_topic_model; hoyle_2022_are_neural_topic_models_broken; zhu_2022_disentangled_learning_of_stance_and_aspect_topics_for_vaccine_attitude_detection_in_social_media; zhao_2021_adversarial_learning_of_poisson_factorisation_model_for_gauging_brand_sentiment_in_user_reviews; tripodi_2020_topic_modelling_games; zhao_2021_topic_modelling_meets_deep_neural_networks_a_survey; zhang_2022_pre_training_and_fine_tuning_neural_topic_model_a_simple_yet_effective_approach_to_incorporating_external_knowledge; panwar_2021_tan_ntm_topic_attention_networks_for_neural_topic_modeling; hoyle_2020_improving_neural_topic_models_using_knowledge_distillation; } -> card_2018_neural_models_for_documents_with_metadata
	

In [26]:
# Selected BS papers which are cited by more than one paper in the initial selection
# Papers from initial selection are colored skyblue
bs_style = []
bs_styled = set()  # cite keys that already have a line in bs_style
bs_counts = {}
glossary = get_glossary()

# Count incoming edges for bs papers
for key in venues:
    for venue in venues[key]:
        with os.scandir(f'../References/Snowballing/Backward snowballing (Outgoing references)/{key}/{venue}') as it:
            for entry in it:
                if entry.name.endswith(".bib") and entry.is_file():
                    with open(entry, 'r') as fp:
                        db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

                    for ent in db.values():
                        cite_key = build_cite_key(ent['author'], ent['year'], ent['title'])
                        if cite_key in bs_selection:
                            bs_counts[cite_key] = bs_counts.get(cite_key, 0) + 1

print('digraph G {')
for key in venues:
    for venue in venues[key]:
        with os.scandir(f'../References/Snowballing/Backward snowballing (Outgoing references)/{key}/{venue}') as it:
            for entry in it:
                if entry.name.endswith(".bib") and entry.is_file():
                    source = entry.name.replace('.bib','')
                    with open(entry, 'r') as fp:
                        db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

                    # Skip sources that reference no paper shared with another source
                    ent_count = 0
                    for ent in db.values():
                        cite_key = build_cite_key(ent['author'], ent['year'], ent['title'])
                        if (cite_key in bs_counts) and (bs_counts[cite_key] > 1): ent_count +=1
                    if ent_count == 0: continue

                    title = glossary[source]['title']
                    print(f'\t{source} [label="{cite_key_trim_regex.search(source).group()}, {title}" color=skyblue style=filled]')
                    print(f'\t{source} -> {{', end='')

                    for ent in db.values():
                        cite_key = build_cite_key(ent['author'], ent['year'], ent['title'])

                        if (cite_key in bs_counts) and (bs_counts[cite_key] > 1):
                            # Every paper drawn here is referenced more than once,
                            # so record its label line only the first time it is seen.
                            if cite_key not in bs_styled:
                                bs_styled.add(cite_key)
                                title = glossary[cite_key]['title']
                                bs_style.append(f'\t{cite_key} [label="{cite_key_trim_regex.search(cite_key).group()}, {title}"]')
                            print(f'{cite_key}; ', end='')
                    print(f'}}')
for bs_paper_style in bs_style: print(bs_paper_style)
print('}')

digraph G {
	hosseiny_marani_2022_one_rating_to_rule_them_all_evidence_of_multidimensionality_in_human_assessment_of_topic_labeling_quality [label="hosseiny_marani_2022, One {Rating} to {Rule} {Them} {All}?: {Evidence} of {Multidimensionality} in {Human} {Assessment} of {Topic} {Labeling} {Quality}" color=skyblue style=filled]
	hosseiny_marani_2022_one_rating_to_rule_them_all_evidence_of_multidimensionality_in_human_assessment_of_topic_labeling_quality -> {smith_2017_evaluating_visual_representations_for_topic_understanding_and_their_effects_on_manually_generated_topic_labels; }
	sorodoc_2017_multimodal_topic_labelling [label="sorodoc_2017, Multimodal {Topic} {Labelling}" color=skyblue style=filled]
	sorodoc_2017_multimodal_topic_labelling -> {aletras_2017_evaluating_topic_representations_for_exploring_document_collections; }
	doogan_2021_topic_model_or_topic_twaddle_re_evaluating_semantic_interpretability_measures [label="doogan_2021, Topic {Model} or {Topic} {Twaddle}? {Re}-evaluatin