# Generate embeddings for each section

In [None]:
from pathlib import Path

from computations.doc2vec import doc2vec_integration
from computations.expert import expert_integration
from textbooks.data import Textbook
from textbooks.utils import extract_content

# Load the parsed textbooks from JSON: one base textbook plus a list of other
# textbooks that are passed alongside it to both integration routines below.
base_textbook = Textbook.from_json(
    Path("textbooks-parsed/2012_Book_ModernMathematicalStatisticsWi.json")
)
other_textbooks = [
    Textbook.from_json(Path("textbooks-parsed/Walpole_Probability_and_Statistics.json"))
]

# Computer-generated integration via the project's doc2vec helper.
# NOTE(review): vector_size / min_count / epochs presumably configure the
# underlying Doc2Vec model and threshold presumably is a similarity cut-off
# for matching sections -- confirm against computations.doc2vec.
integrated_textbook = doc2vec_integration(
    base_textbook,
    other_textbooks,
    text_extraction_fn=extract_content,
    threshold=0.4,
    vector_size=100,
    min_count=1,
    epochs=40,
    iterative=False,
    evaluate=False,
)

# Expert integration of the same textbooks; its result is later written to
# "bert-vectors/expert.json" and used for the expert-data model search.
expert_it = expert_integration(base_textbook, other_textbooks)

In [None]:
from classification.bert import run_bert

# Run the project's BERT step on each integrated textbook. The second argument
# is the JSON path that load_vectors() reads back later in this notebook
# (presumably run_bert writes its vectors there -- TODO confirm in
# classification.bert).
run_bert(integrated_textbook, "bert-vectors/doc2vec.json")
run_bert(expert_it, "bert-vectors/expert.json")

# Find best model using expert data

In [None]:
import json
import tensorflow as tf

from classification.preprocess import preprocess_data


def load_vectors(filename):
    """Load feature/label pairs from a JSON file as TensorFlow tensors.

    The file must contain a JSON array of objects, each providing an ``"x"``
    entry (features) and a ``"y"`` entry (label).

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A tuple ``(X, y)`` of tensors built from the ``"x"`` and ``"y"``
        entries respectively, in file order.

    Raises:
        KeyError: If a record lacks an ``"x"`` or ``"y"`` entry.
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend
    # on the platform's locale encoding.
    with open(filename, encoding="utf-8") as f:
        records = json.load(f)

    features = [record["x"] for record in records]
    labels = [record["y"] for record in records]

    return tf.convert_to_tensor(features), tf.convert_to_tensor(labels)


# Load the expert vectors and split them with the project's preprocess_data
# helper, which (per the unpacking below) yields the class count followed by
# the train/test splits of X and y. These names are notebook-level globals
# that reshape()/test() callers further down rely on.
X, y = load_vectors("bert-vectors/expert.json")
num_classes, X_train, X_test, y_train, y_test = preprocess_data(X, y)

In [None]:
from keras.layers import LSTM, SimpleRNN

from classification.neural_nets import grid_search_neural_networks


def reshape(array):
    """Insert a length-1 axis in front of the last axis of ``array``.

    All leading axes are collapsed into one, so an input of shape ``(n, d)``
    comes back as ``(n, 1, d)``. ``array`` must expose ``.shape`` and
    ``.reshape`` (e.g. a NumPy array).
    """
    feature_dim = array.shape[-1]
    target_shape = (-1, 1, feature_dim)
    return array.reshape(*target_shape)


# Hyper-parameter grid; reused unchanged for the computer-generated data
# search later in the notebook.
# NOTE(review): the "model__" prefix looks like scikit-learn / SciKeras
# parameter routing to the model-building function -- confirm in
# classification.neural_nets.
param_grid = {
    "model__units": [100, 125, 150, 200],
    "model__dropout_rate": [0.4, 0.6, 0.8, 0.9],
    "model__model_type": [LSTM, SimpleRNN],
    "batch_size": [32, 64, 128],
}

# Grid search on the expert training split. The features go through reshape()
# first, which adds a length-1 middle axis before the recurrent layers see them.
expert_model = grid_search_neural_networks(
    num_classes=num_classes,
    X_train=reshape(X_train),
    y_train=y_train,
    param_grid=param_grid,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def test(model):
    """Print accuracy and micro-averaged precision, recall and F1 for ``model``.

    Predictions are made on the notebook-level ``X_test`` (passed through
    ``reshape``) and scored against the notebook-level ``y_test``, so the
    report reflects whichever dataset was most recently loaded and split.

    Args:
        model: Fitted estimator exposing ``predict``.
    """
    predictions = model.predict(reshape(X_test))

    # Score everything first so a failing metric produces no partial report.
    micro = {"average": "micro"}
    acc = accuracy_score(y_test, predictions)
    prec = precision_score(y_test, predictions, **micro)
    rec = recall_score(y_test, predictions, **micro)
    f1_micro = f1_score(y_test, predictions, **micro)

    # Four-decimal summary, one metric per line.
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1_micro:.4f}")


# Report the expert-data model's metrics on the current X_test / y_test
# globals (the expert test split at this point in the notebook).
test(expert_model)

# Find best model using computer-generated data

In [None]:
# Repeat the pipeline on the doc2vec (computer-generated) vectors.
# NOTE: this rebinds the notebook-level X, y, num_classes and the train/test
# splits. test() reads X_test / y_test from those globals, so from here on it
# scores against the doc2vec test split -- re-running test(expert_model) after
# this cell would evaluate the expert model on the wrong data.
X, y = load_vectors("bert-vectors/doc2vec.json")
num_classes, X_train, X_test, y_train, y_test = preprocess_data(X, y)

# Same hyper-parameter grid as the expert-data search.
computer_model = grid_search_neural_networks(
    num_classes=num_classes,
    X_train=reshape(X_train),
    y_train=y_train,
    param_grid=param_grid,
)

In [None]:
# Report the computer-generated-data model's metrics on the doc2vec test split
# bound by the preceding cell.
test(computer_model)