In [None]:
import json
from pathlib import Path

from classification.bert import run_bert
from classification.nn import preprocess_data
from computation.expert import expert_integration
from computation.pipeline import pipeline_integration
from textbooks.data import Textbook
from textbooks.utils import extract_concept_name, extract_content

# File stem (under textbooks-parsed/) of the textbook used as the base of every integration.
BASE_TEXTBOOK = "2012_Book_ModernMathematicalStatisticsWi"
# File stem of the single textbook paired with the base one for the expert and small datasets.
EXPERT_OTHER_TEXTBOOK = "Walpole_Probability_and_Statistics"

# JSON Lines file that `append` adds one result record to per call.
RESULTS_FILE = "evaluation-data/classification.jsonl"


def append(data, path=None):
    """Append one result record as a single line of a JSON Lines file.

    Parameters
    ----------
    data : dict
        Record to serialise. Values ``json`` cannot encode natively are
        replaced by their ``__name__`` (e.g. a class used as a
        hyper-parameter is stored as its class name).
    path : str or os.PathLike, optional
        Destination file. Defaults to the module-level ``RESULTS_FILE``.

    Raises
    ------
    TypeError
        If a value is neither JSON-encodable nor has a ``__name__``.
    """

    def _encode_fallback(obj):
        # Classes/functions are stored by name; anything else is reported with
        # the error type the `json` module expects from a `default` hook.
        name = getattr(obj, "__name__", None)
        if name is None:
            raise TypeError(
                f"Object of type {type(obj).__name__} is not JSON serializable"
            )
        return name

    target = Path(RESULTS_FILE if path is None else path)
    # Serialise before touching the file so a failure never leaves a partial line.
    line = json.dumps(data, default=_encode_fallback)
    # A fresh checkout may not contain the results directory yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as f:
        f.write(line + "\n")


# Settings unpacked as keyword arguments into `pipeline_integration`.
best_integration_config = dict(
    tfidf_text_extraction_fns=[extract_content, extract_concept_name],
    tfidf_threshold=0.6,
    tfidf_uncertain_threshold=0.3,
    d2v_text_extraction_fn=extract_content,
    d2v_threshold=0.3,
    d2v_vector_size=100,
    d2v_min_count=1,
    evaluate=False,
)

# Load data

In [None]:
# Expert integration of the base textbook with the textbook named by
# EXPERT_OTHER_TEXTBOOK; only the resulting dataset is kept.
expert_dataset = expert_integration(
    base_textbook=Textbook.from_json(f"textbooks-parsed/{BASE_TEXTBOOK}.json"),
    other_textbooks=(Textbook.from_json(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json"),),
).dataset

# Size summary: distinct topic labels and total number of records.
print("Number of topic labels:", len({d["topic"] for d in expert_dataset}))
print("Number of data points:", len(expert_dataset))

In [None]:
# Pipeline integration over the same two textbooks as the expert dataset,
# configured by `best_integration_config`.
integrated_textbook = pipeline_integration(
    base_textbook=Textbook.from_json(f"textbooks-parsed/{BASE_TEXTBOOK}.json"),
    other_textbooks=(Textbook.from_json(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json"),),
    **best_integration_config,
)
small_generated_dataset = integrated_textbook.dataset

# Size summary: distinct topic labels and total number of records.
print("Number of topic labels:", len({d["topic"] for d in small_generated_dataset}))
print("Number of data points:", len(small_generated_dataset))

In [None]:
# Pipeline integration of the base textbook with every other parsed textbook.
base_textbook_path = Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")
base_textbook = Textbook.from_json(base_textbook_path)
# Restrict to *.json so stray files or directories are never handed to
# `Textbook.from_json`, and sort so the order of `other_textbooks` does not
# depend on the filesystem's directory ordering.
other_textbooks = [
    Textbook.from_json(p)
    for p in sorted(Path("textbooks-parsed").glob("*.json"))
    if p != base_textbook_path
]

integrated_textbook = pipeline_integration(
    base_textbook,
    other_textbooks,
    **best_integration_config,
)
large_generated_dataset = integrated_textbook.dataset

print("Number of topic labels:", len(set(d["topic"] for d in large_generated_dataset)))
print("Number of data points:", len(large_generated_dataset))
# Keep a copy of the generated dataset on disk.
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(large_generated_dataset, f)

# Fine-tuning & cross-validation

In [None]:
## Fine tuning
from keras.layers import LSTM, SimpleRNN

from classification.grid_search import grid_search_neural_networks
from classification.nn import reshape
from utils import performance_metrics

# Search space passed as `param_grid` to `advanced_language_model_cv` for the
# expert and small generated datasets.
wide_param_grid = dict(
    model__units=[100, 125, 150, 200],
    model__dropout_rate=[0.4, 0.6, 0.8, 0.9],
    model__model_type=[LSTM, SimpleRNN],
    batch_size=[32, 64, 128],
)

In [None]:
def advanced_language_model_cv(dataset, param_grid, with_concepts, n_splits=5):
    """Grid-search a neural classifier on `run_bert` output and score the winner.

    `run_bert(dataset, with_concepts)` provides an (X, y) pair, which
    `preprocess_data` turns into the class count plus train/test parts.
    `grid_search_neural_networks` is run on the reshaped training part with
    `param_grid` and `n_splits`; the returned search object then predicts on
    the reshaped test part.

    Returns
    -------
    dict
        The search's `best_params_` merged with the output of
        `performance_metrics` for the test-part predictions.
    """
    features, labels = run_bert(dataset, with_concepts)
    num_classes, X_train, X_test, y_train, y_test = preprocess_data(features, labels)

    search = grid_search_neural_networks(
        num_classes=num_classes,
        X_train=reshape(X_train),
        y_train=y_train,
        param_grid=param_grid,
        n_splits=n_splits,
    )

    predictions = search.predict(reshape(X_test))
    # NOTE(review): called as (predictions, truth) here, whereas `bertopic_cv`
    # calls (truth, predictions) — confirm against `performance_metrics`.
    metrics = performance_metrics(predictions, y_test)
    return search.best_params_ | metrics

In [None]:
## Expert data with concepts
results = advanced_language_model_cv(
    dataset=expert_dataset,
    param_grid=wide_param_grid,
    with_concepts=True,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "expert", "concepts": True})
results

In [None]:
## Expert data without concepts
results = advanced_language_model_cv(
    dataset=expert_dataset,
    param_grid=wide_param_grid,
    with_concepts=False,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "expert", "concepts": False})
results

In [None]:
## Small generated data with concepts
results = advanced_language_model_cv(
    dataset=small_generated_dataset,
    param_grid=wide_param_grid,
    with_concepts=True,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "small", "concepts": True})
results

In [None]:
## Small generated data without concepts
results = advanced_language_model_cv(
    dataset=small_generated_dataset,
    param_grid=wide_param_grid,
    with_concepts=False,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "small", "concepts": False})
results

In [None]:
# Reduced search space (SimpleRNN only, fewer unit and batch sizes) used for
# the large generated dataset.
narrow_param_grid = dict(
    model__units=[100, 125, 150],
    model__dropout_rate=[0.4, 0.6, 0.8, 0.9],
    model__model_type=[SimpleRNN],
    batch_size=[32, 64],
)

In [None]:
## Large generated data with concepts
results = advanced_language_model_cv(
    dataset=large_generated_dataset,
    param_grid=narrow_param_grid,
    with_concepts=True,
    n_splits=3,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "large", "concepts": True})
results

In [None]:
## Large generated data without concepts
results = advanced_language_model_cv(
    dataset=large_generated_dataset,
    param_grid=narrow_param_grid,
    with_concepts=False,
    n_splits=3,
)
# Tag the record with the configuration it belongs to before logging it.
append({**results, "dataset": "large", "concepts": False})
results

# BERTopic Baseline

In [None]:
import pandas as pd
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from utils import performance_metrics


def bertopic_cv(dataset, with_concepts, n_splits=5):
    """Cross-validate a supervised BERTopic classifier on `dataset`.

    Parameters
    ----------
    dataset : iterable of dict
        Records with a "topic" and a "content" entry, plus a "concepts"
        entry when `with_concepts` is true.
    with_concepts : bool
        If true, each document is the record's concepts followed by its
        content; otherwise only the content is used.
    n_splits : int, default 5
        Number of shuffled K-fold splits (fixed random_state of 42).

    Returns
    -------
    pandas.DataFrame
        One row per fold: the 1-based "fold" number plus whatever
        `performance_metrics` returns for that fold.
    """
    labels = LabelEncoder().fit_transform([d["topic"] for d in dataset])
    docs = [
        f"{d['concepts']} {d['content']}" if with_concepts else d["content"]
        for d in dataset
    ]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []

    for fold, (train_index, test_index) in enumerate(kf.split(docs), start=1):
        X_train = [docs[i] for i in train_index]
        X_test = [docs[i] for i in test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Supervised BERTopic: no dimensionality reduction, and a classifier
        # in place of the clustering model.
        topic_model = BERTopic(
            umap_model=BaseDimensionalityReduction(),
            hdbscan_model=LogisticRegression(),
            ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        )
        topic_model.fit(X_train, y=y_train)
        predicted_topics, _ = topic_model.transform(X_test)

        # BERTopic renumbers topics after fitting, so `transform` returns topic
        # ids rather than the encoded labels passed as `y`. Invert the
        # label -> topic mapping so predictions and `y_test` share one label
        # space; ids missing from the mapping are passed through unchanged.
        topic_to_label = {
            topic: label
            for label, topic in topic_model.topic_mapper_.get_mappings().items()
        }
        y_pred = [topic_to_label.get(topic, topic) for topic in predicted_topics]

        # NOTE(review): called as (truth, predictions) here, whereas
        # `advanced_language_model_cv` calls (predictions, truth) — confirm
        # against `performance_metrics`.
        results.append({"fold": fold} | performance_metrics(y_test, y_pred))

    return pd.DataFrame(results)

In [None]:
## Expert data with concepts
cv_results_expert = bertopic_cv(dataset=expert_dataset, with_concepts=True)
# Average every metric over the folds; the fold number itself is not a metric.
results = cv_results_expert.drop(columns="fold").mean().to_dict()
append(
    {**results, "model__model_type": "BERTopic", "dataset": "expert", "concepts": True}
)
results

In [None]:
## Expert data without concepts
cv_results_expert = bertopic_cv(dataset=expert_dataset, with_concepts=False)
# Average every metric over the folds; the fold number itself is not a metric.
results = cv_results_expert.drop(columns="fold").mean().to_dict()
append(
    {**results, "model__model_type": "BERTopic", "dataset": "expert", "concepts": False}
)
results

In [None]:
## Large generated data with concepts
cv_results_large = bertopic_cv(dataset=large_generated_dataset, with_concepts=True)
# Average every metric over the folds; the fold number itself is not a metric.
results = cv_results_large.drop(columns="fold").mean().to_dict()
append(
    {**results, "model__model_type": "BERTopic", "dataset": "large", "concepts": True}
)
results

In [None]:
## Large generated data without concepts
cv_results_large = bertopic_cv(dataset=large_generated_dataset, with_concepts=False)
# Average every metric over the folds; the fold number itself is not a metric.
results = cv_results_large.drop(columns="fold").mean().to_dict()
append(
    {**results, "model__model_type": "BERTopic", "dataset": "large", "concepts": False}
)
results

In [None]:
## Small generated data with concepts
cv_results_small = bertopic_cv(dataset=small_generated_dataset, with_concepts=True)
# Average every metric over the folds; the fold number itself is not a metric.
results = cv_results_small.drop(columns="fold").mean().to_dict()
append(
    {**results, "model__model_type": "BERTopic", "dataset": "small", "concepts": True}
)
results

In [None]:
## Small generated data without concepts
cv_results_small_generated = bertopic_cv(small_generated_dataset, with_concepts=False)
# Average the folds of THIS run. Reading `cv_results_small` here would re-log
# the with-concepts metrics under the concepts=False tag.
results = cv_results_small_generated.drop(columns="fold").mean().to_dict()
append(
    results | {"model__model_type": "BERTopic", "dataset": "small", "concepts": False}
)
results