In [8]:
import pandas as pd
import time
import numpy as np
import tensorflow as tf
import re
import matplotlib.pyplot as plt
import nltk
import string

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from tensorflow.python.ops.resource_variable_ops import var_handle_op
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
from transformers import AutoTokenizer, AutoModel, TFAutoModel
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords') # download stopwords corpus
nltk.download('punkt') # download punkt tokenizer




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Hannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Hannah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
%cd /Users/Hannah/Documents/VU/Msc/Thesis/Coding/Pipeline

/Users/Hannah/Documents/VU/Msc/Thesis/Coding/Pipeline


In [11]:


# Load the data
df = pd.read_csv('Indicator-Desc_DataNewVanPipeline.csv')
X = df['content']
y = df['entropy']

# split train dataset into train, validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

#X_train, X_test, y_train, y_test = train_test_split(X , y, train_size=0.8)

# BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Tokenize and encode the text data
train_tokens = tokenizer.batch_encode_plus(
    X_train.values.tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='tf'
)

# Tokenize and encode the text data
val_tokens = tokenizer.batch_encode_plus(
    X_val.values.tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='tf'
)

test_tokens = tokenizer.batch_encode_plus(
    X_test.values.tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='tf'
)

# Convert the tokenized input to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_tokens),
    y_train.values
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_tokens),
    y_val.values
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_tokens),
    y_test.values
))

# Prepare the model input
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

# Load BERT model
model = TFAutoModel.from_pretrained("GroNLP/bert-base-dutch-cased")  # Tensorflow

# Freeze BERT layers
model.trainable = False

# Retrieve the BERT embeddings
embeddings = model(input_ids,attention_mask)[0]

# Perform pooling (average pooling in this case)
pooled_output = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
pooled_output = tf.keras.layers.Dropout(0.1)(pooled_output)
pooled_output = tf.keras.layers.Dense(64, activation='relu')(pooled_output)
pooled_output = tf.keras.layers.Dropout(0.1)(pooled_output)

# Dense layer for regression output
output = tf.keras.layers.Dense(1, activation='linear')(pooled_output)

# Build the model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model, learning rate: 1e-4
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4), loss='mean_squared_error')
model.summary()
training = model.fit(train_dataset.batch(256), epochs=10, shuffle=True, verbose=0, validation_data=val_dataset.batch(256))


param_grid = {
    'learning_rate': [1e-4, 5e-4],
    'dropout_rate': [0.1, 0.2],
    'epochs': [5, 10]
}

# Wrap the model with KerasRegressor for grid search
keras_regressor = tf.keras.wrappers.scikit_learn.KerasRegressor(model)

# Convert TensorFlow datasets to NumPy arrays
train_data = np.array(list(train_dataset.batch(256).as_numpy_iterator()))
y_train_data = np.array(list(y_train.values))

# Perform grid search using GridSearchCV
grid_search = GridSearchCV(estimator=keras_regressor, param_grid=param_grid, cv=3)

# Fit the grid search on the training data
grid_search.fit(train_data, y_train_data)

# Fit the grid search on the training data

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model on the test set
predictions = best_model.predict(test_dataset.batch(32))
mse = mean_squared_error(y_test.values, predictions)
print('Best Model: Mean Squared Error:', mse)
print('Best Hyperparameters:', best_params)


# Calculate the errors
errors = y_test.values - predictions.flatten()

# Calculate the variance and standard deviation of errors
var = np.var(errors)
std = np.std(errors)
print('Language Model: Variance of Errors:', var)
print('Language Model: Standard Deviation of Errors:', std)

# Plot the distribution of errors
plt.figure(figsize=(8, 6))
plt.hist(errors, bins=20, edgecolor='black', alpha=0.75)
plt.xlabel('Error')
plt.ylabel('Frequency')
plt.title('Language Model: Distribution of Errors in Test Set')
plt.show()


Some layers from the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_4 (TFBertModel)  TFBaseModelOutputWi  109137408   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

2023-06-11 20:19:17.527289: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [4629]
	 [[{{node Placeholder/_3}}]]
  inputs = self._flatten_to_reference_inputs(inputs)
2023-06-11 20:34:02.592711: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [992]
	 [[{{node Placeholder/_3}}]]
  keras_regressor = tf.keras.wrappers.scikit_learn.KerasRegressor(model)
2023-06-11 22:59:58.612829: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore thi

ValueError: Found input variables with inconsistent numbers of samples: [19, 4629]

 Resources:
 https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
