In [None]:
from pathlib import Path

import json

from classification.bert import run_bert
from classification.nn import preprocess_data
from textbooks.data import Textbook

# Fine-tune model using expert data

In [None]:
from computation.expert import expert_integration

# Build the dataset for this section: expert_integration is given the
# "Modern Mathematical Statistics" textbook as its base and the Walpole
# textbook as the only other textbook; only the result's `.dataset`
# attribute is kept.
expert_dataset = expert_integration(
    base_textbook=Textbook.from_json(
        Path("textbooks-parsed") / "2012_Book_ModernMathematicalStatisticsWi.json"
    ),
    other_textbooks=tuple(
        Textbook.from_json(Path("textbooks-parsed") / file_name)
        for file_name in ("Walpole_Probability_and_Statistics.json",)
    ),
).dataset

In [None]:
# Run the expert dataset through run_bert. Only the first two results (bound
# to X and y) are used in this section; the third is discarded.
X, y, _ = run_bert(expert_dataset)
# preprocess_data yields the number of classes plus four arrays consumed by
# the grid-search cell below.
# NOTE(review): presumably a train/test split of X and y — confirm in
# classification.nn.
num_classes, X_train, X_test, y_train, y_test = preprocess_data(X, y)

In [None]:
import numpy as np
from keras.layers import LSTM, SimpleRNN

# Same import locations as the fine-tuning cell at the end of this notebook,
# so both grid-search sections use one consistent API.
from classification.grid_search import grid_search_neural_networks
from classification.nn import performance_metrics, reshape

# Search space handed to grid_search_neural_networks: layer width, dropout
# rate, recurrent layer class and batch size.
param_grid = {
    "model__units": [100, 125, 150, 200],
    "model__dropout_rate": [0.4, 0.6, 0.8, 0.9],
    "model__model_type": [LSTM, SimpleRNN],
    "batch_size": [32, 64, 128],
}

# The training features go through reshape() before the search.
best_model = grid_search_neural_networks(
    num_classes=num_classes,
    X_train=reshape(X_train),
    y_train=y_train,
    param_grid=param_grid,
)
# Keep the selected hyperparameters around for inspection.
best_params = best_model.best_params_

# Evaluate on the held-out arrays exactly as the final cell does: reshape the
# test features the same way as the training features, take the index of the
# largest score in each row as the predicted class, and pass those
# predictions (not the model) to performance_metrics.
y_pred_probabilities = best_model.predict(reshape(X_test))
y_pred = np.argmax(y_pred_probabilities, axis=1)

performance_metrics(y_pred, y_test)

# Cross-validation using generated data

In [None]:
# File stem (name without ".json") of the textbook used as the integration
# base; the data-generation cell interpolates it into a "textbooks-parsed/"
# path.
BASE_TEXTBOOK = "2012_Book_ModernMathematicalStatisticsWi"

## Generate data

In [None]:
from computation.pipeline import pipeline_integration
from textbooks.utils import extract_content


# Load the base textbook selected by BASE_TEXTBOOK.
base_textbook_path = Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")
base_textbook = Textbook.from_json(base_textbook_path)

# Every other parsed textbook in the directory is integrated into the base.
# Only "*.json" entries are considered so that stray files or subdirectories
# are never handed to Textbook.from_json, and the paths are sorted because
# glob() yields them in arbitrary order, which would make the integration
# input differ from run to run.
other_textbooks = [
    Textbook.from_json(p)
    for p in sorted(Path("textbooks-parsed").glob("*.json"))
    if p != base_textbook_path
]

# Run the integration pipeline with extract_content as the text extractor
# for both the TF-IDF and the doc2vec settings. Evaluation is switched off;
# only the produced dataset is used afterwards.
# NOTE(review): the threshold / vector-size values are hand-picked constants
# whose exact meaning is defined in computation.pipeline — confirm there.
integrated_textbook = pipeline_integration(
    base_textbook,
    other_textbooks,
    tfidf_text_extraction_fns=[extract_content],
    tfidf_threshold=0.6,
    tfidf_uncertain_threshold=0.5,
    d2v_text_extraction_fn=extract_content,
    d2v_threshold=0.5,
    d2v_vector_size=50,
    d2v_min_count=1,
    evaluate=False,
)
dataset = integrated_textbook.dataset

In [None]:
# Quick sanity check of the generated dataset: how many distinct "topic"
# values it contains, followed by its total number of entries.
print("Number of topic labels:", len({sample["topic"] for sample in dataset}))
print("Number of data points:", len(dataset))

In [None]:
# Persist the generated dataset so the "Load data" cell can be re-run without
# regenerating it. The encoding is stated explicitly to match the reader,
# which opens the same file with encoding="utf-8", instead of depending on
# the platform's default encoding.
with open("datasets-new.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f)

## Load data

In [None]:
# Read the previously saved dataset back from disk (decoded as UTF-8) and
# parse it as JSON.
dataset = json.loads(Path("datasets-new.json").read_text(encoding="utf-8"))

# Pass the dataset through run_bert; all three results are kept this time.
bert_outputs = run_bert(dataset)
X, y, textbooks = bert_outputs

# Derive the class count and the train/test arrays used by the cells below.
prepared = preprocess_data(X, y)
num_classes, X_train, X_test, y_train, y_test = prepared

## Fine-tuning

In [None]:
import numpy as np
from keras.layers import LSTM, SimpleRNN

from classification.grid_search import grid_search_neural_networks
from classification.nn import performance_metrics, reshape

# Search space handed to grid_search_neural_networks: layer width, dropout
# rate, recurrent layer class and batch size.
param_grid = {
    "model__units": [100, 125, 150, 200],
    "model__dropout_rate": [0.4, 0.6, 0.8, 0.9],
    "model__model_type": [LSTM, SimpleRNN],
    "batch_size": [32, 64, 128],
}

# The training features go through reshape() before the search.
best_model = grid_search_neural_networks(
    num_classes=num_classes,
    X_train=reshape(X_train),
    y_train=y_train,
    param_grid=param_grid,
)
# Keep the selected hyperparameters around for inspection.
best_params = best_model.best_params_

# Evaluate on the test arrays. The test features are reshaped the same way as
# the training features, and the index of the largest score in each row is
# taken as the predicted class.
# The earlier call that passed the model and the un-reshaped X_test directly
# to performance_metrics was removed: it used a different argument list than
# the call below and its return value was discarded.
y_pred_probabilities = best_model.predict(reshape(X_test))
y_pred = np.argmax(y_pred_probabilities, axis=1)

results = performance_metrics(y_pred, y_test)