In [None]:
import json

import numpy as np

from computation.expert import expert_integration
from computation.pipeline import pipeline_integration
from textbooks.data import Textbook
from textbooks.utils import extract_concept_name, extract_content

# File stems of the parsed textbooks; cells below load f"textbooks-parsed/{stem}.json".
BASE_TEXTBOOK = "2012_Book_ModernMathematicalStatisticsWi"
EXPERT_OTHER_TEXTBOOK = "Walpole_Probability_and_Statistics"

# Keyword arguments unpacked into the pipeline_integration(...) calls below.
best_integration_config = dict(
    tfidf_text_extraction_fns=[extract_content, extract_concept_name],
    tfidf_threshold=0.6,
    tfidf_uncertain_threshold=0.3,
    d2v_text_extraction_fn=extract_content,
    d2v_threshold=0.3,
    d2v_vector_size=100,
    d2v_min_count=1,
    evaluate=False,
)

# Load data

In [None]:
# Run expert_integration on the base textbook plus one other textbook and keep
# only the resulting `.dataset`.
expert_dataset = expert_integration(
    base_textbook=Textbook.from_json(f"textbooks-parsed/{BASE_TEXTBOOK}.json"),
    other_textbooks=(
        Textbook.from_json(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json"),
    ),
).dataset

# Quick size check: distinct topic labels vs. total data points.
print("Number of topic labels:", len({item["topic"] for item in expert_dataset}))
print("Number of data points:", len(expert_dataset))

In [None]:
# Run the automatic pipeline on the same two textbooks used for the expert dataset.
integrated_textbook = pipeline_integration(
    base_textbook=Textbook.from_json(f"textbooks-parsed/{BASE_TEXTBOOK}.json"),
    other_textbooks=(
        Textbook.from_json(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json"),
    ),
    **best_integration_config,
)
small_generated_dataset = integrated_textbook.dataset

# Quick size check: distinct topic labels vs. total data points.
print("Number of topic labels:", len({item["topic"] for item in small_generated_dataset}))
print("Number of data points:", len(small_generated_dataset))

# Persist the dataset, then reload it: the JSON copy read back from disk
# replaces the in-memory one.
with open("small-dataset.json", "w") as out_file:
    json.dump(small_generated_dataset, fp=out_file)

with open("small-dataset.json") as in_file:
    small_generated_dataset = json.loads(in_file.read())

In [None]:
from pathlib import Path

base_textbook_path = Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")
base_textbook = Textbook.from_json(base_textbook_path)

# Integrate every other parsed textbook into the base one.
# - "*.json" skips stray non-textbook entries (hidden files, sub-directories)
#   that Textbook.from_json is not meant to load.
# - sorted(...) pins the order: glob() yields entries in arbitrary,
#   filesystem-dependent order, which would make the run non-reproducible.
other_textbooks = [
    Textbook.from_json(p)
    for p in sorted(Path("textbooks-parsed").glob("*.json"))
    if p != base_textbook_path
]

integrated_textbook = pipeline_integration(
    base_textbook,
    other_textbooks,
    **best_integration_config,
)
large_generated_dataset = integrated_textbook.dataset

# Quick size check: distinct topic labels vs. total data points.
print("Number of topic labels:", len(set(d["topic"] for d in large_generated_dataset)))
print("Number of data points:", len(large_generated_dataset))

# Persist the dataset, then reload it: the JSON copy read back from disk
# replaces the in-memory one.
with open("large-dataset.json", "w") as f:
    json.dump(large_generated_dataset, f)

with open("large-dataset.json") as f:
    large_generated_dataset = json.load(f)

# Fine-tuning & cross-validation

In [None]:
## Fine tuning
from keras.layers import LSTM, SimpleRNN

from classification.validation import advanced_language_model_cv


def grid_search_results_serializer(obj):
    """``default=`` hook for ``json.dump``/``json.dumps``.

    Converts the objects the stock encoder rejects:

    * NumPy arrays become (nested) Python lists.
    * NumPy scalars (e.g. ``np.int64``, ``np.float32``, ``np.bool_``) become
      the equivalent Python scalar.
    * Anything exposing ``__name__`` (classes, functions) is stored as that name.

    Raises:
        TypeError: for any other object, with the same wording the stock JSON
            encoder uses.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # NumPy scalars are not ndarrays and have no ``__name__`` on the instance,
    # so without this branch they would fall through to the TypeError below.
    if isinstance(obj, np.generic):
        return obj.item()
    if hasattr(obj, "__name__"):
        return obj.__name__
    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")


# Parameter grid handed to advanced_language_model_cv for the expert and
# small generated datasets.
wide_param_grid = dict(
    model__units=[100, 125, 150, 200],
    model__dropout_rate=[0.4, 0.6, 0.8, 0.9],
    model__model_type=[LSTM, SimpleRNN],
    batch_size=[32, 64, 128],
)

In [None]:
## Expert data, with concepts
results_summary, cv_results = advanced_language_model_cv(
    expert_dataset,
    wide_param_grid,
    with_concepts=True,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/expert-with.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

In [None]:
## Expert data, without concepts
results_summary, cv_results = advanced_language_model_cv(
    expert_dataset,
    wide_param_grid,
    with_concepts=False,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/expert-without.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

In [None]:
## Small generated data, with concepts
results_summary, cv_results = advanced_language_model_cv(
    small_generated_dataset,
    wide_param_grid,
    with_concepts=True,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/small-with.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

In [None]:
## Small generated data, without concepts
results_summary, cv_results = advanced_language_model_cv(
    small_generated_dataset,
    wide_param_grid,
    with_concepts=False,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/small-without.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

In [None]:
# Smaller grid used for the large generated dataset: SimpleRNN only, with fewer
# unit and batch-size options than wide_param_grid.
narrow_param_grid = dict(
    model__units=[100, 125, 150],
    model__dropout_rate=[0.4, 0.6, 0.8, 0.9],
    model__model_type=[SimpleRNN],
    batch_size=[32, 64],
)

In [None]:
## Large generated data, with concepts
results_summary, cv_results = advanced_language_model_cv(
    large_generated_dataset,
    narrow_param_grid,
    with_concepts=True,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/large-with.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

In [None]:
## Large generated data, without concepts
results_summary, cv_results = advanced_language_model_cv(
    large_generated_dataset,
    narrow_param_grid,
    with_concepts=False,
)
# Store the full CV results; the summary is shown as the cell output.
with open("results/phase2/large-without.json", "w") as results_file:
    json.dump(cv_results, fp=results_file, default=grid_search_results_serializer)
results_summary

# BERTopic Baseline

In [None]:
import pandas as pd
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from utils import performance_metrics


def append(data):
    """Append ``data`` as one JSON line to ``results/phase2/bertopic.jsonl``.

    Values the JSON encoder cannot handle are passed through
    ``grid_search_results_serializer``.
    """
    with open("results/phase2/bertopic.jsonl", "a") as log_file:
        record = json.dumps(data, default=grid_search_results_serializer)
        log_file.write(f"{record}\n")


def bertopic_cv(dataset, with_concepts, n_splits=5):
    """Cross-validate a supervised BERTopic classifier on ``dataset``.

    Args:
        dataset: sequence of mappings providing ``"topic"`` and ``"content"``
            (and ``"concepts"`` when ``with_concepts`` is true).
        with_concepts: if true, each document is its ``concepts`` value
            followed by a space and its ``content``; otherwise ``content`` only.
        n_splits: number of shuffled K-fold splits (seeded with 42).

    Returns:
        ``pandas.DataFrame`` with one row per fold, built from whatever
        ``performance_metrics(y_test, y_pred)`` returns.
    """
    topics = [d["topic"] for d in dataset]
    # Integer-encode the topic labels once so every fold shares the same ids.
    y = LabelEncoder().fit_transform(topics)

    docs = [
        f"{d['concepts']} {d['content']}" if with_concepts else d["content"]
        for d in dataset
    ]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []

    for train_index, test_index in kf.split(docs):
        X_train, X_test = [docs[i] for i in train_index], [docs[i] for i in test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Supervised BERTopic set-up: no dimensionality reduction, and a
        # classifier in place of the clustering model.
        empty_dimensionality_model = BaseDimensionalityReduction()
        clf = LogisticRegression()
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

        topic_model = BERTopic(
            umap_model=empty_dimensionality_model,
            hdbscan_model=clf,
            ctfidf_model=ctfidf_model,
        )
        topic_model.fit(X_train, y=y_train)
        predicted_topics, _ = topic_model.transform(X_test)

        # BERTopic renumbers topics by frequency after fitting, and transform()
        # answers in that renumbered space. Translate the predictions back to
        # the label-encoder ids before comparing them with y_test; ids without
        # a mapping (e.g. -1) are passed through unchanged.
        topic_to_label = {
            topic: label
            for label, topic in topic_model.topic_mapper_.get_mappings().items()
        }
        y_pred = [topic_to_label.get(topic, topic) for topic in predicted_topics]
        results.append(performance_metrics(y_test, y_pred))

    return pd.DataFrame(results)

In [None]:
## BERTopic baseline: expert data, with concepts
cv_results_expert = bertopic_cv(dataset=expert_dataset, with_concepts=True)
# Average the per-fold metrics and log them together with the run's identifiers.
results_summary = cv_results_expert.mean().to_dict()
append(
    {
        **results_summary,
        "model__model_type": "BERTopic",
        "dataset": "expert",
        "concepts": True,
    }
)
results_summary

In [None]:
## BERTopic baseline: expert data, without concepts
cv_results_expert = bertopic_cv(dataset=expert_dataset, with_concepts=False)
# Average the per-fold metrics and log them together with the run's identifiers.
results_summary = cv_results_expert.mean().to_dict()
append(
    {
        **results_summary,
        "model__model_type": "BERTopic",
        "dataset": "expert",
        "concepts": False,
    }
)
results_summary

In [None]:
## BERTopic baseline: large generated data, with concepts
cv_results_large = bertopic_cv(dataset=large_generated_dataset, with_concepts=True)
# Average the per-fold metrics and log them together with the run's identifiers.
results_summary = cv_results_large.mean().to_dict()
append(
    {
        **results_summary,
        "model__model_type": "BERTopic",
        "dataset": "large",
        "concepts": True,
    }
)
results_summary

In [None]:
## BERTopic baseline: large generated data, without concepts
cv_results_large = bertopic_cv(dataset=large_generated_dataset, with_concepts=False)
# Average the per-fold metrics and log them together with the run's identifiers.
results_summary = cv_results_large.mean().to_dict()
append(
    {
        **results_summary,
        "model__model_type": "BERTopic",
        "dataset": "large",
        "concepts": False,
    }
)
results_summary

In [None]:
## BERTopic baseline: small generated data, with concepts
cv_results_small = bertopic_cv(dataset=small_generated_dataset, with_concepts=True)
# Average the per-fold metrics and log them together with the run's identifiers.
results_summary = cv_results_small.mean().to_dict()
append(
    {
        **results_summary,
        "model__model_type": "BERTopic",
        "dataset": "small",
        "concepts": True,
    }
)
results_summary

In [None]:
## Small generated data without concepts
cv_results_small_generated = bertopic_cv(small_generated_dataset, with_concepts=False)
# Summarise the frame produced by this cell, not `cv_results_small`, which
# holds the with-concepts run and would log the wrong metrics under
# "concepts": False.
results_summary = cv_results_small_generated.mean().to_dict()
append(
    results_summary | {"model__model_type": "BERTopic", "dataset": "small", "concepts": False}
)
results_summary