In [1]:
import json
from pathlib import Path

from classification.bert import run_bert
from classification.nn import preprocess_data
from computation.expert import expert_integration
from computation.pipeline import pipeline_integration
from textbooks.data import Textbook
from textbooks.utils import extract_content

BASE_TEXTBOOK = "2012_Book_ModernMathematicalStatisticsWi"
EXPERT_OTHER_TEXTBOOK = "Walpole_Probability_and_Statistics"

# Load data

In [2]:
expert_dataset = expert_integration(
    base_textbook=Textbook.from_json(Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")),
    other_textbooks=(
        Textbook.from_json(Path(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json")),
    ),
).dataset

print("Number of topic labels:", len(set(d["topic"] for d in expert_dataset)))
print("Number of data points:", len(expert_dataset))

Number of topic labels: 14
Number of data points: 216


In [3]:
integrated_textbook = pipeline_integration(
    base_textbook=Textbook.from_json(Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")),
    other_textbooks=(
        Textbook.from_json(Path(f"textbooks-parsed/{EXPERT_OTHER_TEXTBOOK}.json")),
    ),
    tfidf_text_extraction_fns=[extract_content],
    tfidf_threshold=0.6,
    tfidf_uncertain_threshold=0.5,
    d2v_text_extraction_fn=extract_content,
    d2v_threshold=0.5,
    d2v_vector_size=50,
    d2v_min_count=1,
    evaluate=False,
)
small_generated_dataset = integrated_textbook.dataset

print("Number of topic labels:", len(set(d["topic"] for d in small_generated_dataset)))
print("Number of data points:", len(small_generated_dataset))

Number of topic labels: 32
Number of data points: 277


In [4]:
base_textbook_path = Path(f"textbooks-parsed/{BASE_TEXTBOOK}.json")
base_textbook = Textbook.from_json(base_textbook_path)
other_textbooks = [
    Textbook.from_json(p)
    for p in Path("textbooks-parsed").glob("*")
    if p != base_textbook_path
]

integrated_textbook = pipeline_integration(
    base_textbook,
    other_textbooks,
    tfidf_text_extraction_fns=[extract_content],
    tfidf_threshold=0.6,
    tfidf_uncertain_threshold=0.5,
    d2v_text_extraction_fn=extract_content,
    d2v_threshold=0.5,
    d2v_vector_size=50,
    d2v_min_count=1,
    evaluate=False,
)
large_generated_dataset = integrated_textbook.dataset

print("Number of topic labels:", len(set(d["topic"] for d in large_generated_dataset)))
print("Number of data points:", len(large_generated_dataset))
with open("dataset.json", "w") as f:
    json.dump(large_generated_dataset, f)

Number of topic labels: 330
Number of data points: 1734


# Fine-tuning & cross-validation

In [5]:
## Fine tuning
import numpy as np
from keras.layers import LSTM, SimpleRNN

from classification.grid_search import grid_search_neural_networks
from classification.nn import reshape
from utils import performance_metrics

param_grid = {
    "model__units": [100, 125, 150, 200],
    "model__dropout_rate": [0.4, 0.6, 0.8, 0.9],
    "model__model_type": [LSTM, SimpleRNN],
    "batch_size": [32, 64, 128],
}



In [6]:
def advanced_language_model_cv(dataset):
    X, y = run_bert(dataset)
    num_classes, X_train, X_test, y_train, y_test = preprocess_data(X, y)

    best_model = grid_search_neural_networks(
        num_classes=num_classes,
        X_train=reshape(X_train),
        y_train=y_train,
        param_grid=param_grid,
    )

    # y_pred_probabilities = best_model.predict(reshape(X_test))
    # y_pred = np.argmax(y_pred_probabilities, axis=1)

    y_pred = best_model.predict(reshape(X_test))

    results = best_model.best_params_ | performance_metrics(y_pred, y_test)
    return results

In [8]:
## Expert data
cv_results_expert = advanced_language_model_cv(expert_dataset)
cv_results_expert

2024-01-23 09:56:02.650810: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 09:56:02.650864: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-23 09:56:02.650873: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-23 09:56:02.650910: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-23 09:56:02.650924: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-01-23 09:56:06.293107: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 09:56:06.293138: I metal_plugin/src/device/metal_device.cc:29

Best: 0.57 using {'batch_size': 32, 'model__dropout_rate': 0.6, 'model__model_type': <class 'keras.src.layers.rnn.lstm.LSTM'>, 'model__units': 200}


{'batch_size': 32,
 'model__dropout_rate': 0.6,
 'model__model_type': keras.src.layers.rnn.lstm.LSTM,
 'model__units': 200,
 'accuracy': 0.5681818181818182,
 'precision': 0.48928571428571427,
 'recall': 0.6305860805860806,
 'f1': 0.6265219628855991}

AttributeError: 'dict' object has no attribute 'drop'

In [9]:
## Large generated data
cv_results_large_generated = advanced_language_model_cv(large_generated_dataset)
cv_results_large_generated

2024-01-23 10:07:19.670381: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 10:07:19.670442: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-23 10:07:19.670454: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-23 10:07:19.670826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-23 10:07:19.670852: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-01-23 10:07:19.684444: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 10:07:19.684473: I metal_plugin/src/device/metal_device.cc:29

KeyboardInterrupt: 

In [7]:
## Small generated data
cv_results_small_generated = advanced_language_model_cv(small_generated_dataset)
cv_results_small_generated

2024-01-23 11:03:32.080645: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 11:03:32.080703: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-23 11:03:32.080721: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-23 11:03:32.080953: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-23 11:03:32.080975: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-01-23 11:03:35.687073: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 11:03:35.687086: I metal_plugin/src/device/metal_device.cc:11

Best: 0.46 using {'batch_size': 64, 'model__dropout_rate': 0.9, 'model__model_type': <class 'keras.src.layers.rnn.lstm.LSTM'>, 'model__units': 150}


{'batch_size': 64,
 'model__dropout_rate': 0.9,
 'model__model_type': keras.src.layers.rnn.lstm.LSTM,
 'model__units': 150,
 'accuracy': 0.35714285714285715,
 'precision': 0.26904761904761904,
 'recall': 0.27941176470588236,
 'f1': 0.4926400759734093}

# BERTopic Baseline

In [None]:
import pandas as pd
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from utils import performance_metrics


def bertopic_cv(dataset, n_splits=5):
    topics = [d["topic"] for d in dataset]
    le = LabelEncoder()
    encoded_topics = le.fit_transform(topics)

    docs = [d["content"] for d in dataset]
    y = encoded_topics

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []

    for train_index, test_index in kf.split(docs):
        X_train, X_test = [docs[i] for i in train_index], [docs[i] for i in test_index]
        y_train, y_test = y[train_index], y[test_index]

        empty_dimensionality_model = BaseDimensionalityReduction()
        clf = LogisticRegression()
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

        topic_model = BERTopic(
            umap_model=empty_dimensionality_model,
            hdbscan_model=clf,
            ctfidf_model=ctfidf_model,
        )
        topic_model.fit(X_train, y=y_train)
        y_pred, _ = topic_model.transform(X_test)
        result = {"fold": len(results) + 1} | performance_metrics(y_test, y_pred)
        results.append(result)

    return pd.DataFrame(results)

In [None]:
## Expert data
cv_results_expert = bertopic_cv(expert_dataset)
display(cv_results_expert)
print(cv_results_expert.drop(columns="fold").mean())

Unnamed: 0,fold,accuracy,precision,recall,f1
0,1,0.045455,0.029221,0.035256,0.138095
1,2,0.162791,0.111111,0.085034,0.541667
2,3,0.162791,0.123377,0.107143,0.596491
3,4,0.069767,0.082621,0.134615,0.250794
4,5,0.186047,0.112857,0.225,0.40352


accuracy     0.125370
precision    0.091837
recall       0.117410
f1           0.386113
dtype: float64


In [None]:
## Large generated data
cv_results_large_generated = bertopic_cv(large_generated_dataset)
display(cv_results_large_generated)
print(cv_results_large_generated.drop(columns="fold").mean())

Unnamed: 0,fold,accuracy,precision,recall,f1
0,1,0.0,0.0,0.0,
1,2,0.0,0.0,0.0,
2,3,0.00289,0.000701,0.002703,0.041667
3,4,0.0,0.0,0.0,
4,5,0.0,0.0,0.0,


accuracy     0.000578
precision    0.000140
recall       0.000541
f1           0.041667
dtype: float64


In [None]:
## Small generated data
cv_results_small_generated = bertopic_cv(small_generated_dataset)
display(cv_results_small_generated)
print(cv_results_small_generated.drop(columns="fold").mean())

Unnamed: 0,fold,accuracy,precision,recall,f1
0,1,0.017857,0.083333,0.014493,0.5
1,2,0.053571,0.057692,0.057692,0.273016
2,3,0.0,0.0,0.0,
3,4,0.0,0.0,0.0,
4,5,0.072727,0.038213,0.049383,0.209524


accuracy     0.028831
precision    0.035848
recall       0.024314
f1           0.327513
dtype: float64
