In [None]:
from itertools import permutations
from pathlib import Path

import numpy as np

from computations.clustering import clustering_integration
from computations.doc2vec import doc2vec_integration
from computations.tfidf import tfidf_integration
from evaluation.grid_search import tune_parameters, weight_combinations
from textbooks.data import Textbook
from textbooks.utils import (
    extract_concept_definition,
    extract_concept_name,
    extract_concept_subject,
    extract_content,
    extract_header,
)

# Textbook used as the base in every tuning run below.
base_textbook = Textbook.from_json(
    Path("textbooks-parsed/2012_Book_ModernMathematicalStatisticsWi.json")
)

# The remaining textbooks handed to tune_parameters (a single one for now).
other_textbooks = [
    Textbook.from_json(Path(json_file))
    for json_file in ("textbooks-parsed/Walpole_Probability_and_Statistics.json",)
]

# Base Models

In [None]:
# Search space for the non-iterative base models. Every key labels one
# experiment family; its value names the integration function ("fn") and the
# candidate values for each of that function's parameters (presumably expanded
# into a grid by tune_parameters).
#
# Thresholds are spelled out as literal lists: np.arange(0.2, 0.8, 0.2) returns
# FOUR values, because floating-point rounding makes the stop effectively
# inclusive, and the values carry float noise such as 0.6000000000000001. The
# literal keeps the grid that was actually being searched and makes it explicit.
# NOTE(review): drop 0.8 if the half-open range [0.2, 0.8) was what was meant.
parameter_tuning = {
    "TF-IDF (Single Attribute)": {
        "fn": tfidf_integration,
        "text_extraction_fns": {
            "header": [extract_header],
            "content": [extract_content],
        },
        "threshold": [0.2, 0.4, 0.6, 0.8],
        "iterative": [False],
    },
    "TF-IDF (Dual Attribute)": {
        "fn": tfidf_integration,
        "text_extraction_fns": {
            "header + concept name": [extract_header, extract_concept_name],
        },
        "threshold": [0.2, 0.4, 0.6, 0.8],
        # None is tried alongside the explicit two-attribute weightings.
        "weights": [None] + weight_combinations(2, step=0.2),
        "iterative": [False],
    },
    "TF-IDF (Triple Attribute)": {
        "fn": tfidf_integration,
        "text_extraction_fns": {
            "header + concept name + concept definitions": [
                extract_header,
                extract_concept_name,
                extract_concept_definition,
            ],
            "header + concept subject + concept definitions": [
                extract_header,
                extract_concept_subject,
                extract_concept_definition,
            ],
            "header + concept subject + concept name": [
                extract_header,
                extract_concept_subject,
                extract_concept_name,
            ],
            "concept definitions + concept subject + concept name": [
                extract_concept_definition,
                extract_concept_subject,
                extract_concept_name,
            ],
        },
        "threshold": [0.2, 0.4, 0.6, 0.8],
        "weights": [None] + weight_combinations(3, step=0.2),
        "iterative": [False],
    },
    "TF-IDF (Quad Attribute)": {
        "fn": tfidf_integration,
        "text_extraction_fns": {
            "header + concept name + concept definitions + concept subject": [
                extract_header,
                extract_concept_name,
                extract_concept_definition,
                extract_concept_subject,
            ]
        },
        "threshold": [0.2, 0.4, 0.6, 0.8],
        # Four extraction functions need four weights per combination, in line
        # with the dual (2) and triple (3) configurations above. Combinations
        # containing a 0.1 weight are skipped.
        "weights": [None]
        + [x for x in weight_combinations(4, step=0.1) if 0.1 not in x],
        "iterative": [False],
    },
    "Doc2Vec": {
        "fn": doc2vec_integration,
        "text_extraction_fn": {"content": extract_content},
        "threshold": [0.2, 0.4, 0.5],
        "vector_size": [50, 100, 200, 300],
        "min_count": [1, 5, 10, 20, 50],
        "epochs": [40],
        "iterative": [False],
    },
    "Clustering (Single Attribute)": {
        "fn": clustering_integration,
        "category_extraction_fns": {
            "subject": [extract_concept_subject],
            "name": [extract_concept_name],
        },
        # One cluster count per category attribute.
        "n_clusters_options": [[40], [60], [80], [100], [120]],
    },
    "Clustering (Dual Attribute)": {
        "fn": clustering_integration,
        "category_extraction_fns": {
            "subject+name": [extract_concept_subject, extract_concept_name]
        },
        "weights": [None] + weight_combinations(2, 0.2),
        # Pairs of cluster counts, one per category attribute; both orderings
        # of the mixed pairs are listed explicitly.
        "n_clusters_options": [
            [40, 40],
            [60, 60],
            [80, 80],
            [100, 100],
            [120, 120],
            [60, 100],
            [60, 80],
            [100, 80],
            [100, 60],
            [80, 60],
            [80, 100],
        ],
        "threshold": [0.2, 0.4, 0.6, 0.8],
    },
}

# NOTE(review): this run covers the TF-IDF, Doc2Vec and clustering entries
# alike, yet the path passed is "clustering.json"; the alternative
# "evaluation-data/local-base-results.json" had been left commented out.
# Confirm which file is intended before re-running.
tune_parameters(
    base_textbook,
    other_textbooks,
    parameter_tuning,
    "evaluation-data/clustering.json",
)

# Ensemble

In [None]:
from itertools import combinations

from computations.clustering import tfidf_clustering_ensemble_integration

# Search space for tfidf_clustering_ensemble_integration, built one entry at a
# time: the first entry uses a single category attribute per run, the second
# uses two.
parameter_tuning = {}

parameter_tuning["TF-IDF + Clustering (Single Attribute)"] = dict(
    fn=tfidf_clustering_ensemble_integration,
    text_extraction_fns={
        "header+concept_name+concept_definition+concept_subject": [
            extract_header,
            extract_concept_name,
            extract_concept_definition,
            extract_concept_subject,
        ]
    },
    category_extraction_fns={
        "concept name": [extract_concept_name],
        "concept subject": [extract_concept_subject],
    },
    threshold=np.arange(0.2, 1, 0.2),
    # Five weights per candidate: the first sweeps 1..6, the rest stay at 1.
    weights=[[lead] + [1] * 4 for lead in range(1, 7)],
    # [[40], [60], [80], [100], [120]]
    n_clusters_options=[[size] for size in range(40, 121, 20)],
)

parameter_tuning["TF-IDF + Clustering (Dual Attribute)"] = dict(
    fn=tfidf_clustering_ensemble_integration,
    text_extraction_fns={
        "header+concept_name+concept_definition+concept_subject": [
            extract_header,
            extract_concept_name,
            extract_concept_definition,
            extract_concept_subject,
        ]
    },
    category_extraction_fns={
        "concept name+subject": [extract_concept_name, extract_concept_subject],
    },
    threshold=np.arange(0.2, 1, 0.2),
    # Six weights per candidate: two leading odd weights (first < second, sum
    # below 10) followed by four 1s.
    weights=[
        [first, second] + [1] * 4
        for first, second in combinations(range(1, 10, 2), r=2)
        if first + second < 10
    ],
    n_clusters_options=[
        [60, 100],
        [60, 80],
        [100, 80],
        [100, 60],
        [80, 60],
        [80, 100],
    ],
)

tune_parameters(
    base_textbook,
    other_textbooks,
    parameter_tuning,
    "evaluation-data/local-ensemble-results.json",
)

# Iterative Learning

In [None]:
# Search space for the iterative variants: every entry pins "iterative" to
# [True]. Entries are added one by one in the order they should appear.
parameter_tuning = {}

parameter_tuning["TF-IDF (Single Attribute, iterative)"] = dict(
    fn=tfidf_integration,
    text_extraction_fns={"content": [extract_content]},
    # Stop of 0.61 keeps 0.6 inside the range: yields 0.2, 0.4, 0.6.
    threshold=np.arange(0.2, 0.61, 0.2),
    iterative=[True],
)

parameter_tuning["TF-IDF (Triple Attribute, iterative)"] = dict(
    fn=tfidf_integration,
    text_extraction_fns={
        "header + concept name + concept definitions": [
            extract_header,
            extract_concept_name,
            extract_concept_definition,
        ],
        "header + concept subject + concept definitions": [
            extract_header,
            extract_concept_subject,
            extract_concept_definition,
        ],
        "header + concept subject + concept name": [
            extract_header,
            extract_concept_subject,
            extract_concept_name,
        ],
        "concept definitions + concept subject + concept name": [
            extract_concept_definition,
            extract_concept_subject,
            extract_concept_name,
        ],
    },
    threshold=np.arange(0.2, 0.61, 0.2),
    iterative=[True],
)

parameter_tuning["TF-IDF (Quad Attribute, iterative)"] = dict(
    fn=tfidf_integration,
    text_extraction_fns={
        "header + concept name + concept definitions + concept subject": [
            extract_header,
            extract_concept_name,
            extract_concept_definition,
            extract_concept_subject,
        ]
    },
    threshold=np.arange(0.2, 0.61, 0.2),
    iterative=[True],
)

parameter_tuning["Doc2Vec (iterative)"] = dict(
    fn=doc2vec_integration,
    text_extraction_fn={"content": extract_content},
    threshold=[0.4, 0.5, 0.6],
    vector_size=[50, 100, 200],
    min_count=[1, 5, 10, 20],
    epochs=[40],
    iterative=[True],
)

tune_parameters(
    base_textbook,
    other_textbooks,
    parameter_tuning,
    "evaluation-data/local-iterative-results.json",
)

# Pipeline

In [None]:
from computations.pipeline import tfidf_doc2vec_pipeline

# Search space for tfidf_doc2vec_pipeline. Parameters are prefixed "tfidf_" or
# "doc2vec_" according to the stage they configure.
parameter_tuning = {
    "TF-IDF + Doc2Vec Pipeline": dict(
        fn=tfidf_doc2vec_pipeline,
        tfidf_text_extraction_fns={
            "header+concept_name+concept_definition+concept_subject": [
                extract_header,
                extract_concept_name,
                extract_concept_definition,
                extract_concept_subject,
            ]
        },
        tfidf_iterative=[True, False],
        tfidf_threshold=np.arange(0.4, 1, 0.2),
        # Every (low, high) pair drawn from 0, 0.2, ..., 0.8 with low < high.
        tfidf_uncertain_threshold=[
            (low, high)
            for low, high in permutations(np.arange(0, 1, 0.2), r=2)
            if low < high
        ],
        doc2vec_text_extraction_fn={"content": extract_content},
        doc2vec_threshold=[0.4, 0.5, 0.6],
        doc2vec_iterative=[True, False],
        doc2vec_vector_size=[50, 100, 200],
        doc2vec_min_count=[1, 5, 10, 20],
        doc2vec_epochs=[40],
    )
}

tune_parameters(
    base_textbook,
    other_textbooks,
    parameter_tuning,
    "evaluation-data/local-pipeline-results.json",
)