In [35]:
import pandas as pd  #  type: ignore

pd.set_option("display.max_rows", 100)

from techminer2.co_occurrence_network import terms_by_cluster_frame  # type: ignore
from techminer2.thesaurus.descriptors import apply_thesaurus  # type: ignore
from techminer2.thesaurus.descriptors import check_thesaurus  # type: ignore
from techminer2.thesaurus.descriptors import clean_thesaurus  # type: ignore

#
# THESAURUS:
# Clean, check and then apply the descriptors thesaurus of the project
# rooted at "../" (same order as the log messages printed below the cell).
#
for thesaurus_step in (clean_thesaurus, check_thesaurus, apply_thesaurus):
    thesaurus_step(root_dir="../")

#
# COLUMN PARAMS:
# presumably occ_range=(4, None) keeps descriptors with 4 or more
# occurrences -- TODO confirm against the techminer2 documentation.
#
column_params = {
    "field": "descriptors",
    "top_n": None,
    "occ_range": (4, None),
    "gc_range": (None, None),
    "custom_terms": None,
}

#
# NETWORK PARAMS:
#
network_params = {
    "algorithm_or_dict": "louvain",
    "association_index": "association",
}

#
# DATABASE PARAMS:
#
database_params = {
    "root_dir": "../",
    "database": "main",
    "year_filter": (None, None),
    "cited_by_filter": (None, None),
    "document_type": [
        "Article",
        "Book chapter",
        "Conference paper",
        "Review",
    ],
}

#
# TERMS BY CLUSTER:
# The displayed frame has one column per cluster label (0, 1, 2 in the
# recorded output); only the first 10 rows are shown.
#
frame = terms_by_cluster_frame(
    **column_params,
    **network_params,
    **database_params,
)
frame.head(10)

--INFO-- The ../thesauri/descriptors.the.txt thesaurus has been cleaned.
--INFO-- Checking `descriptors.the.txt` integrity.
--INFO-- Applying `descriptors.the.txt` thesaurus to author/index keywords and abstract/title words


Unnamed: 0,0,1,2
0,CURRICULUM 39:475,STUDENTS 41:370,TEACHING 19:238
1,SKILLS 33:444,COURSES 18:082,INFORMATION_MANAGEMENT 08:189
2,EDUCATION 26:205,EXPERIENTIAL_LEARNING 14:124,ACADEMIC_PROFESSIONALS 07:031
3,INFORMATION_USE 17:089,DECISION_MAKING 08:021,PEDAGOGY 06:203
4,BUSINESS_SCHOOLS 15:171,LEARNING 07:033,COMPETITIVE_INTELLIGENCE 06:185
5,CURRICULUM_DESIGN 14:174,STUDENT_LEARNING 07:030,CURRICULUM_DEVELOPMENT 06:168
6,PROGRAMS 09:024,STUDENT_ENGAGEMENT 06:017,ELECTRONIC_LEARNING 05:012
7,EDUCATION_INSTITUTIONS 07:081,INSIGHTS 06:016,TECHNOLOGY 04:115
8,COMPETITIVE_ADVANTAGES 06:092,LEARNING_OUTCOMES 05:014,ANALYTICS_TOOLS 04:003
9,KNOWLEDGE 06:038,ANALYTICS_PROFESSIONALS 04:105,BUSINESS_DATA 04:003


In [36]:
#
# TABLE
#
from techminer2.co_occurrence_network import terms_by_cluster_summary  # type: ignore


from pathlib import Path

#
# REPORT FOLDER:
# Created on demand so the export below also works on a fresh checkout,
# where the folder may not exist yet (to_csv does not create directories).
#
report_dir = Path("../reports/sec_42a_clusters")
report_dir.mkdir(parents=True, exist_ok=True)

#
# CLUSTER SUMMARY:
# One row per cluster with the columns Cluster, Num Terms, Percentage and
# Terms (see the printed table below the cell).
#
cluster_data = terms_by_cluster_summary(
    field="descriptors",
    #
    conserve_counters=False,
    #
    top_n=None,
    occ_range=(4, None),
    gc_range=(None, None),
    custom_terms=None,
    #
    algorithm_or_dict="louvain",
    association_index="association",
    #
    root_dir="../",
    year_filter=(None, None),
    document_type=[
        "Article",
        "Book chapter",
        "Conference paper",
        "Review",
    ],
)

#
# PRESENTATION:
# "Terms" is a single "; "-separated string per cluster. Keep only its
# first 10 terms and turn each into a readable label, e.g.
# "CURRICULUM_DESIGN" -> "Curriculum Design".
#
# NOTE: this overwrites cluster_data["Terms"] in place; the next cell reads
# the truncated, title-cased column.
#
cluster_data["Terms"] = cluster_data["Terms"].map(
    lambda terms: "; ".join(
        term.strip().replace("_", " ").title() for term in terms.split("; ")[:10]
    )
)

# The file is tab-separated despite its ".csv" extension.
cluster_data.to_csv(report_dir / "clusters.csv", sep="\t", index=False)

print(cluster_data.to_string())

   Cluster  Num Terms  Percentage                                                                                                                                                                                  Terms
0        0         21        44.7                               Curriculum; Skills; Education; Information Use; Business Schools; Curriculum Design; Programs; Education Institutions; Competitive Advantages; Knowledge
1        1         15        31.9                        Students; Courses; Experiential Learning; Decision Making; Learning; Student Learning; Student Engagement; Insights; Learning Outcomes; Analytics Professionals
2        2         11        23.4  Teaching; Information Management; Academic Professionals; Pedagogy; Competitive Intelligence; Curriculum Development; Electronic Learning; Technology; Analytics Tools; Business Data


In [38]:
#
# PAPERS PER CLUSTER
#
from techminer2.documents import select_documents  # type: ignore


from pathlib import Path

#
# PREPARATION:
# Turn the readable labels stored in cluster_data["Terms"] back into
# descriptor keys, e.g. "Curriculum Design" -> "CURRICULUM_DESIGN".
# The column stays a single "; "-separated string per cluster.
#
cluster_data["Terms"] = cluster_data["Terms"].map(
    lambda terms: "; ".join(
        term.strip().replace(" ", "_").upper() for term in terms.split("; ")
    )
)


#
# NEWEST DOCUMENTS PER CLUSTER:
# No citation filter is applied (cited_by_filter=(None, None)); the only
# filter is that the record text contains "AB " -- presumably the abstract
# field tag of the record, TODO confirm against select_documents output.
#
MAX_DOCUMENTS = 100  # documents kept per cluster
GROUP_SIZE = 10  # documents written to each text file
clusters_dir = Path("../reports/sec_42a_clusters")

for i_cluster in range(cluster_data.shape[0]):

    # Positional access: the loop counter is a row position, not an index label.
    descriptors = cluster_data["Terms"].iloc[i_cluster].split("; ")

    documents = select_documents(
        #
        # DATABASE PARAMS:
        root_dir="../",
        database="main",
        year_filter=(None, None),
        cited_by_filter=(None, None),
        sort_by="date_newest",
        descriptors=descriptors,
    )

    documents = [doc for doc in documents if "AB " in doc]
    documents = documents[:MAX_DOCUMENTS]

    # Split the selection into consecutive groups of GROUP_SIZE documents.
    documents = [
        documents[start : start + GROUP_SIZE]
        for start in range(0, len(documents), GROUP_SIZE)
    ]

    # One sub-folder per cluster; open() would fail if it did not exist.
    cluster_dir = clusters_dir / str(i_cluster)
    cluster_dir.mkdir(parents=True, exist_ok=True)

    for i_group, group in enumerate(documents):

        # Documents are separated and surrounded by "---" rules.
        group_text = "---\n\n" + "\n---\n\n".join(group) + "\n\n---"

        with open(
            cluster_dir / f"year_{i_group:02d}.txt",
            "wt",
            encoding="utf-8",
        ) as f:
            print(group_text, file=f)