In [None]:
import h5py
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import pandas as pd

from keras.layers import Masking
# Import required layers and modules
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from keras.models import Model

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and output paths
# used by later cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### loading the full dataset ###
# NOTE(review): the cell after this one assigns the same variable from the
# "medium" file, so whichever loading cell is run last determines the data.

combined_df_wordlabelled = pd.read_json('/content/drive/MyDrive/thesis_files/combined_df_wordlabelled_1706.json', orient='index')

In [None]:
### loading the medium set ###
# NOTE(review): this overwrites combined_df_wordlabelled if the full-dataset
# cell was run before it; run only the loading cell you actually want.

combined_df_wordlabelled = pd.read_json('/content/drive/MyDrive/thesis_files/medium_second_wordlabelled_0407.json', orient='index')



In [None]:

def split_documents(dataset, n):
    """Split every document into consecutive pieces of at most ``n`` items.

    Args:
        dataset: Iterable of documents; each document must support ``len()``
            and slicing (e.g. a list of ``(word, tag)`` pairs).
        n: Maximum number of items per piece. Must be positive.

    Returns:
        A flat list containing the pieces of all documents, in their original
        order. The last piece of a document may be shorter than ``n``; empty
        documents contribute no pieces.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        # A negative step makes range() empty, which would silently drop
        # every document instead of reporting the bad argument.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    return [
        document[start:start + n]
        for document in dataset
        for start in range(0, len(document), n)
    ]

In [None]:
# Cut each document's 'tagged_text' into pieces of at most 512 items.
combined_df_seqlabelled_word_short = split_documents(combined_df_wordlabelled['tagged_text'], 512)

In [None]:
# Number of pieces produced by the split.
len(combined_df_seqlabelled_word_short)

99115

In [None]:
# Length of the longest piece (sanity check against the 512-item limit).
len(max(combined_df_seqlabelled_word_short, key=len))

512

In [None]:
# Working name for the split sequences used by the cells below.
tagged_data = combined_df_seqlabelled_word_short

In [None]:
def rename_tags(tagged_data):
    """Return a relabelled copy of ``tagged_data``.

    Each sequence is rebuilt as a list of ``(word, tag)`` tuples in which the
    tag 'non-highlight' is replaced by 'normaltext'; every other tag is kept
    unchanged. The input sequences are not modified.
    """
    def _relabel(tag):
        # Only the 'non-highlight' tag is renamed.
        return 'normaltext' if tag == 'non-highlight' else tag

    return [
        [(word, _relabel(tag)) for word, tag in sequence]
        for sequence in tagged_data
    ]

In [None]:
# Replace the 'non-highlight' tag by 'normaltext' in every sequence.
# NOTE(review): rebinds tagged_data to the renamed copy.
tagged_data = rename_tags(tagged_data)

In [None]:
# Longest sequence in tagged_data, in (word, tag) pairs.
max_length = max([len(seq) for seq in tagged_data])
print(f"Maximum sequence length: {max_length}")

Maximum sequence length: 512


In [None]:
# Keep a reference to the complete data so a truncated tagged_data can be restored.
full_data = tagged_data

In [None]:
# Keep only the first 500 sequences.
# NOTE(review): the following cell sets tagged_data back to full_data, so the
# data used downstream depends on which of the two cells was run last.
tagged_data = tagged_data[:500]

In [None]:
# Restore the complete data set (undoes the 500-sequence truncation).
tagged_data = full_data

In [None]:
# Number of sequences currently held in tagged_data.
len(tagged_data)

11551

In [None]:
# Show one sequence of (word, tag) pairs as a spot check.
print(tagged_data[10])

[('Top', 'normaltext'), ('highlight', 'normaltext'), ('Photo', 'normaltext'), ('by', 'normaltext'), ('Engin', 'normaltext'), ('Akyurt', 'normaltext'), ('from', 'normaltext'), ('Pexels', 'normaltext'), ('Facing', 'normaltext'), ('Three', 'normaltext'), ('Fundamental', 'normaltext'), ('Fears', 'normaltext'), ('About', 'normaltext'), ('the', 'normaltext'), ('Coronavirus', 'normaltext'), ('A', 'normaltext'), ('primer', 'normaltext'), ('on', 'normaltext'), ('the', 'normaltext'), ('most', 'normaltext'), ('essential', 'normaltext'), (',', 'normaltext'), ('but', 'normaltext'), ('often', 'normaltext'), ('misunderstood', 'normaltext'), (',', 'normaltext'), ('aspects', 'normaltext'), ('of', 'normaltext'), ('Covid-19', 'normaltext'), ('Bo', 'normaltext'), ('Stapler', 'normaltext'), (',', 'normaltext'), ('MD', 'normaltext'), ('·', 'normaltext'), ('Follow', 'normaltext'), ('Published', 'normaltext'), ('in', 'normaltext'), ('Microbial', 'normaltext'), ('Instincts', 'normaltext'), ('·', 'normaltext'),

In [None]:
### separate tokenization ###


def chunker(seq, size):
    """Return a generator over consecutive slices of ``seq`` of at most ``size`` items."""
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

# Split tagged_data positionally: the first 80% of the sequences become the
# training set and the remaining 20% the test set (no shuffling).
split_index = int(len(tagged_data) * 0.8)
train_data = tagged_data[:split_index]
test_data = tagged_data[split_index:]

# Unique words and labels seen in the training data only.
# sorted() makes the vocabulary order deterministic: iterating a set of
# strings depends on Python's per-process hash seed, so without sorting every
# kernel restart would assign different ids to the same words and previously
# saved arrays/models could no longer be decoded.
words_train = sorted({token for seq in train_data for token, label in seq})
tags_train = sorted({label for seq in train_data for token, label in seq})

# Word tokenizer fitted on the training vocabulary; unknown words map to 'OOV'.
word_tokenizer_train = Tokenizer(filters='', lower=False, oov_token='OOV')
word_tokenizer_train.fit_on_texts(words_train)

# The test split must be encoded with the vocabulary learned from the training
# words only. A second tokenizer fitted on the same list would produce exactly
# the same word_index, so the fitted tokenizer is reused instead of refitting.
word_tokenizer_test = word_tokenizer_train

# Label ids are fixed by hand; id 0 is left free for padding.
label_tokenizer = Tokenizer(filters='', lower=False)
label_tokenizer.word_index = {'normaltext': 1, 'highlight': 2}

def process_chunk(chunk, max_sequence_length, word_tokenizer):
    """Encode a chunk of tagged sequences as padded integer arrays.

    Args:
        chunk: Iterable of sequences, each an iterable of ``(word, label)``
            pairs.
        max_sequence_length: Length of every encoded sequence. Shorter
            sequences are padded at the end; longer ones are cut by
            ``pad_sequences`` using its default truncation rule.
        word_tokenizer: Tokenizer whose ``word_index`` contains an 'OOV'
            entry; words missing from ``word_index`` are mapped to that id.

    Returns:
        ``(X_chunk, y_chunk)``: two lists with one 1-D integer array of length
        ``max_sequence_length`` per input sequence (word ids and label ids).

    Raises:
        KeyError: If ``word_tokenizer.word_index`` has no 'OOV' entry, or a
            label is missing from the module-level
            ``label_tokenizer.word_index``.
    """
    # Hoist the lookups that do not change inside the loops.
    word_index = word_tokenizer.word_index
    oov_id = word_index['OOV']
    label_index = label_tokenizer.word_index

    encoded_words = []
    encoded_labels = []
    for seq in chunk:
        document_words = []
        document_labels = []
        for word, label in seq:
            document_words.append(word_index.get(word, oov_id))
            document_labels.append(label_index[label])
        encoded_words.append(document_words)
        encoded_labels.append(document_labels)

    # Pad the whole chunk in one call instead of once per document; with an
    # explicit maxlen every row is padded independently, so the rows are the
    # same as those produced by per-document padding.
    X_chunk = list(pad_sequences(encoded_words, maxlen=max_sequence_length, padding='post'))
    y_chunk = list(pad_sequences(encoded_labels, maxlen=max_sequence_length, padding='post'))

    return X_chunk, y_chunk

# Every encoded sequence is padded to this many positions.
max_sequence_length = 512

# Encode the training split 1000 sequences at a time (pick the chunk size to
# fit the available memory).
X_train, y_train = [], []
for chunk in chunker(train_data, 1000):
    X_chunk, y_chunk = process_chunk(chunk, max_sequence_length, word_tokenizer_train)
    X_train += X_chunk
    y_train += y_chunk

# Encode the test split the same way, using the test tokenizer.
X_test, y_test = [], []
for chunk in chunker(test_data, 1000):
    X_chunk, y_chunk = process_chunk(chunk, max_sequence_length, word_tokenizer_test)
    X_test += X_chunk
    y_test += y_chunk

# Stack the per-sequence rows into 2-D numpy arrays.
X_train, y_train = np.asarray(X_train), np.asarray(y_train)
X_test, y_test = np.asarray(X_test), np.asarray(y_test)


In [None]:
# Shape of the encoded training inputs.
print(X_train.shape)

(9240, 512)


In [None]:
# Drive folder used as the prefix for every file saved or loaded below
# (the trailing slash is required because paths are built by concatenation).
folder_path = "/content/drive/MyDrive/thesis_files/"

In [None]:
import h5py
import numpy as np



# Persist the four encoded arrays, one HDF5 file per array. Each file is named
# '<split>_data_wordlabel_26.h5' and holds a single dataset named '<split>_data'.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    with h5py.File(f'{folder_path}{split_name}_data_wordlabel_26.h5', 'w') as hf:
        # np.asarray passes an existing ndarray through unchanged, whereas
        # np.array would first make a full in-memory copy of it.
        hf.create_dataset(f"{split_name}_data", data=np.asarray(split_array))

NameError: ignored

In [None]:
import h5py
import numpy as np


def _read_h5_array(file_name, dataset_name):
    """Read one dataset from an HDF5 file under folder_path into a numpy array."""
    with h5py.File(f'{folder_path}{file_name}', 'r') as hf:
        return np.array(hf[dataset_name])


# Reload the four encoded arrays written by the saving cell.
X_train = _read_h5_array('X_train_data_wordlabel_26.h5', 'X_train_data')
y_train = _read_h5_array('y_train_data_wordlabel_26.h5', 'y_train_data')
X_test = _read_h5_array('X_test_data_wordlabel_26.h5', 'X_test_data')
y_test = _read_h5_array('y_test_data_wordlabel_26.h5', 'y_test_data')

KeyError: ignored

In [None]:
# Shape of the training inputs currently in memory.
X_train.shape

(79292, 512)

In [None]:
### save the sentence-labelled data ###

import h5py

# Specify the path to the folder where you want to save the files


# Save X data
# NOTE(review): X and y are not defined in any visible cell; they must come
# from kernel state created elsewhere — confirm before running this cell.
with h5py.File(f'{folder_path}X_data_wordlabel_20.h5', 'w') as hf:
    hf.create_dataset("X_data",  data=np.array(X))

# Save y data
with h5py.File(f'{folder_path}y_data_wordlabel_20.h5', 'w') as hf:
    hf.create_dataset("y_data",  data=np.array(y))


In [None]:
# Load X data
import h5py

# Read the full 'X_data' dataset into memory ([:] copies it out of the file
# before the file is closed).
with h5py.File(f'{folder_path}X_data_wordlabel_20.h5', 'r') as hf:
    X = hf['X_data'][:]

# Load y data
with h5py.File(f'{folder_path}y_data_wordlabel_20.h5', 'r') as hf:
    y = hf['y_data'][:]


In [None]:
# Shape of X; requires X to be an array (e.g. as loaded from the HDF5 file).
print(X.shape)

AttributeError: ignored

In [None]:
# Number of entries in the training tokenizer's word_index (includes 'OOV').
print(len(word_tokenizer_train.word_index))

902788


In [None]:
# Vocabulary size used to dimension the Embedding layer.
# NOTE(review): hard-coded; it has to be kept in sync by hand with
# len(word_tokenizer_train.word_index) for the data actually loaded. The
# trailing values look like sizes from earlier runs.
len_word_index = 902788 # 104385 #23031 # 26892 # 974442 #len(words)
print(len_word_index)

902788


In [None]:
# NOTE(review): `words` is not defined in any visible cell — presumably left
# over from an earlier version of the notebook; confirm or remove.
len(words)

26892

In [None]:
# Number of output classes of the model: the two label ids (1 and 2) plus
# id 0, which is used for padding.
len_tag_index = 3 # len(tags) +1
print(len_tag_index)

3


In [None]:
# Number of sequences currently held in tagged_data.
len(tagged_data)

500

In [None]:
import numpy as np

# View the training ids as one flat vector and take the largest id, to
# compare it against the vocabulary size used for the Embedding layer.
flattened = np.asarray(X_train).ravel()
max_index = flattened.max()

print("Maximum index in training data: ", max_index)
print("Length of word index: ", len_word_index)


Maximum index in training data:  902789
Length of word index:  902788


In [None]:
# Define the structure of the model
document_input = Input(shape=(None,), dtype='int32')  # shape = (num_documents, num_words)

# Word-level embedding. input_dim must be larger than the largest word id in
# the data (hence the +2 headroom over len_word_index).
# mask_zero=True makes the layer emit a mask for id 0, so padded positions are
# skipped by the LSTM. A Masking(mask_value=0) layer placed *after* the
# embedding does not achieve this: it compares the embedding vectors (not the
# ids) with 0, and the learned vector for id 0 is not all zeros.
embedded_sequences = Embedding(input_dim=len_word_index + 2, output_dim=50, mask_zero=True)(document_input)

# Bidirectional LSTM over the words, returning one output per word.
word_lstm = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedded_sequences)

# Per-word softmax over the len_tag_index classes.
word_output = Dense(len_tag_index, activation='softmax')(word_lstm)

# Create the model
model = Model(inputs=[document_input], outputs=[word_output])


In [None]:
# Compute per-token sample weights that balance the label classes.
labels = np.asarray(y_train)

# Class frequencies are taken over real tokens only; padding (label 0) is not
# a class to be balanced and always gets weight 0 below.
real_labels = labels[labels != 0]
if real_labels.size == 0:
    raise ValueError("y_train contains only padding labels; cannot compute class weights")

unique, counts = np.unique(real_labels, return_counts=True)

# weight(class) = number of real tokens / number of tokens of that class, so
# every class weight is >= 1 and rarer classes weigh more. (Dividing the
# number of *documents* by the token counts instead yields weights far below
# 1 and only rescales all classes by the same factor.)
weights = real_labels.size / counts

# Padding first so the dict prints in class-id order.
weights_dict = {0: 0}
weights_dict.update({int(class_id): float(weight) for class_id, weight in zip(unique, weights)})

# Increase weight for the highlight class (class 2)
if 2 in weights_dict:
    weights_dict[2] = weights_dict[2] * 1  # You can adjust this value based on your requirements

# Build the weight matrix with a lookup table indexed by label id; this
# replaces a Python-level loop over every token of every document.
weight_lookup = np.zeros(int(labels.max()) + 1)
for class_id, weight in weights_dict.items():
    weight_lookup[class_id] = weight

# Same shape as y_train: one weight per word of each document.
sample_weights = weight_lookup[labels]


In [None]:
# Class id -> weight mapping used to build sample_weights.
print(weights_dict)

{0: 0, 1: 0.0020669436685382694, 2: 0.18310717815608862}


In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Give labels and weights a trailing axis of size 1, i.e. shape
# (documents, words, 1). The ndim guards make this cell safe to re-run:
# without them every execution would append yet another axis.
if y_train.ndim == 2:
    y_train = np.expand_dims(y_train, -1)
if y_test.ndim == 2:
    y_test = np.expand_dims(y_test, -1)
if sample_weights.ndim == 2:
    sample_weights = np.expand_dims(sample_weights, -1)




In [None]:
# The first two dimensions of the three arrays should agree before training.
print(X_train.shape)
print(y_train.shape)
print(sample_weights.shape)


AttributeError: ignored

In [None]:
# Train the model
# sample_weight supplies the per-word weights (padding has weight 0);
# validation_split=0.2 holds out 20% of the training arrays for validation.
history = model.fit(X_train, y_train,
                    sample_weight=sample_weights,
                    epochs=3, batch_size=128,
                    validation_split=0.2)

Epoch 1/3



Epoch 2/3



Epoch 3/3





In [None]:
# Save the model
# The target path is folder_path + model_name (plain concatenation, so
# folder_path must end with a slash).
model_name = 'model_word_27_1'
full_path = folder_path + model_name
model.save(full_path)

In [None]:
from sklearn.metrics import classification_report

# Predict on test set
y_pred = model.predict(X_test)

# Since our outputs are softmax probabilities, we need to choose the class with the highest probability
y_pred_classes = np.argmax(y_pred, axis=-1)

# Flatten labels and predictions to 1-D vectors of equal length. reshape(-1)
# works whether or not y_test carries a trailing axis of size 1.
y_test_flatten = np.asarray(y_test).reshape(-1)
y_pred_classes_flatten = np.asarray(y_pred_classes).reshape(-1)

# Drop the positions whose true label is padding (class 0).
is_real_token = y_test_flatten != 0
y_test_no_padding = y_test_flatten[is_real_token]
y_pred_no_padding = y_pred_classes_flatten[is_real_token]

# Print classification report.
# labels=[1, 2] pins the reported classes to the two names below; without it
# a single real token predicted as class 0 would make the report fail because
# three classes would be found for two target names.
print(classification_report(
    y_test_no_padding,
    y_pred_no_padding,
    labels=[1, 2],
    target_names=['normaltext', 'highlight'],
))


              precision    recall  f1-score   support

  normaltext       0.99      0.42      0.59   9267984
   highlight       0.02      0.74      0.04    155322

    accuracy                           0.43   9423306
   macro avg       0.51      0.58      0.32   9423306
weighted avg       0.97      0.43      0.58   9423306



In [None]:
# check the maximum index in the X_test
max_index = np.max(X_test)
print(f'Max index in X_test: {max_index}')

# check the size of the word index
print(f'Size of word index: {len_word_index}')

# Keras Tokenizer ids run from 1 to len(word_index) inclusive (0 is reserved
# for padding), so an id equal to len_word_index is still valid. Only ids
# strictly greater than len_word_index exceed the word index.
if max_index > len_word_index:
    print("Problem detected: some word indices in X_test exceed the size of the word index.")


Max index in X_test: 974443
Size of word index: 974442
Problem detected: some word indices in X_test exceed the size of the word index.


In [None]:
# get the document and word where the error occurred
# (the position is named `index` so it does not shadow the word index mapping)
doc_index, index = 15, 145
doc = X_test[doc_index]
word = doc[index]
print(f'Problematic word index: {word}')

# see if this word id is present in the training tokenizer's word_index
for key, value in word_tokenizer_train.word_index.items():
    if value == word:
        print(f'Word: {key}')


Problematic word index: 967415


AttributeError: ignored

In [None]:
# get the document and word where the error occurred
doc_index, index = 15, 145
doc = X_test[doc_index]
word = doc[index]
print(f'Problematic word index: {word}')

# see if this word id is present in the training tokenizer's word_index
# (a bare `word_index` name is not a mapping in this notebook)
for key, value in word_tokenizer_train.word_index.items():
    if value == word:
        print(f'Word: {key}')


Problematic word index: 967415


AttributeError: ignored