In [None]:

import pandas as pd
import time
import numpy as np
import tensorflow as tf
import re
import matplotlib.pyplot as plt
import nltk
import string

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from tensorflow.python.ops.resource_variable_ops import var_handle_op
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
from transformers import AutoTokenizer, AutoModel, TFAutoModel
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold

# Download the NLTK stopwords corpus and the punkt tokenizer.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)




In [None]:

# Load the data: 'content' holds the input text, 'entropy' the regression target.
df = pd.read_csv('') # INPUT DATA FILE
X = df['content']
y = df['entropy']

# Split into train (70%), validation (15%) and test (15%) sets.
# A fixed random_state keeps the partition -- and therefore the reported test
# metrics -- identical across kernel restarts instead of changing on every run.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# The held-out 30% is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Plot the distribution of entropy scores, first for the train set and then
# for the test set; each split gets its own histogram figure.
for split_values, plot_title in (
    (y_train, 'Language model: Distribution of Entropy Scores in Train Set'),
    (y_test, 'Language model: Distribution of Entropy Scores in Test Set'),
):
    plt.figure(figsize=(8, 6))
    plt.hist(split_values, bins=20, edgecolor='black', color='grey', alpha=0.75)
    plt.xlabel('Entropy')
    plt.ylabel('Frequency')
    plt.title(plot_title)
    plt.show()


# BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Every text is truncated / padded to exactly this many tokens.
MAX_SEQ_LENGTH = 128


def _encode_texts(texts):
    """Tokenize and encode a pandas Series of texts with the shared tokenizer.

    Args:
        texts: pandas Series of raw text strings.

    Returns:
        The tokenizer's batch encoding as TensorFlow tensors, with every
        sequence truncated or padded to MAX_SEQ_LENGTH tokens.
    """
    return tokenizer.batch_encode_plus(
        texts.values.tolist(),
        truncation=True,
        padding='max_length',
        max_length=MAX_SEQ_LENGTH,
        return_tensors='tf'
    )


def _to_dataset(tokens, targets):
    """Pair encoded inputs with their targets as a tf.data.Dataset.

    Args:
        tokens: batch encoding returned by `_encode_texts`.
        targets: pandas Series of target values, in the same row order as
            the texts that produced `tokens`.

    Returns:
        A dataset yielding (dict of encoded inputs, target) elements.
    """
    return tf.data.Dataset.from_tensor_slices((dict(tokens), targets.values))


# Tokenize and encode each split with identical settings so the three
# encodings cannot drift apart.
train_tokens = _encode_texts(X_train)
val_tokens = _encode_texts(X_val)
test_tokens = _encode_texts(X_test)

# Convert the tokenized input to TensorFlow Datasets
train_dataset = _to_dataset(train_tokens, y_train)
val_dataset = _to_dataset(val_tokens, y_val)
test_dataset = _to_dataset(test_tokens, y_test)

# Prepare the model inputs: fixed-length (128) int32 sequences whose layer names
# match the 'input_ids' / 'attention_mask' keys of the encoded datasets.
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

# Load the pre-trained BERT encoder. It gets its own name so that it is not
# shadowed by the final regression model assigned to `model` further down.
bert_encoder = TFAutoModel.from_pretrained("GroNLP/bert-base-dutch-cased")  # Tensorflow

# Freeze BERT layers: only the regression head defined below is trained.
bert_encoder.trainable = False

# Retrieve the BERT embeddings (element 0 of the encoder output), which are
# pooled over the sequence axis below.
embeddings = bert_encoder(input_ids,attention_mask)[0]

# Perform average pooling over real tokens only. Inputs are padded to a fixed
# length, so without the mask the mean would also include the embeddings of
# padding positions and the representation would depend on the text length.
pooled_output = tf.keras.layers.GlobalAveragePooling1D()(
    embeddings, mask=tf.cast(attention_mask, tf.bool)
)
pooled_output = tf.keras.layers.Dropout(0.1)(pooled_output)
pooled_output = tf.keras.layers.Dense(32, activation='relu')(pooled_output)
pooled_output = tf.keras.layers.Dropout(0.1)(pooled_output)

# Dense layer for regression output
output = tf.keras.layers.Dense(1, activation='linear')(pooled_output)

# Build the model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model, learning rate: 5e-4
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4), loss='mean_squared_error')

model.summary()

# Train the regression head for 10 epochs, validating after each epoch.
# Keras ignores fit(shuffle=...) when the input is a tf.data.Dataset, so the
# shuffling is done on the dataset itself: the buffer spans the whole training
# set and, by default, the order is reshuffled at every epoch.
training = model.fit(
    train_dataset.shuffle(buffer_size=len(y_train), seed=42).batch(256),
    epochs=10,
    verbose=0,
    validation_data=val_dataset.batch(256),
)

# Evaluate the trained model on the held-out test set
predictions = model.predict(test_dataset.batch(32))
mse = mean_squared_error(y_test.values, predictions)
print('Best Model: Mean Squared Error:', mse)
#print('Best Hyperparameters:', best_params)

# Print the content, entropy, and predicted entropy for a few examples only;
# looping over the entire test set would flood the notebook output.
N_EXAMPLES = 10
for content, entropy, pred in zip(
    X_test.iloc[:N_EXAMPLES],
    y_test.values[:N_EXAMPLES],
    predictions.flatten()[:N_EXAMPLES],
):
    print(f"Content: {content}")
    print(f"Entropy: {entropy}")
    print(f"Predicted Entropy: {pred}")
    print()

# Residuals: true entropy minus predicted entropy, one value per test sample
errors = y_test.values - predictions.flatten()

# Spread of the residuals (variance and standard deviation)
var, std = np.var(errors), np.std(errors)
print('Language Model: Variance of Errors:', var)
print('Language Model: Standard Deviation of Errors:', std)


# Histogram of the residuals
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(errors, bins=20, edgecolor='black', alpha=0.75)
ax.set_xlabel('Error')
ax.set_ylabel('Frequency')
ax.set_title('Language Model: Distribution of Errors in Test Set')
plt.show()


 Resources:
 https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
