In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pandas as pd
from sklearn.utils import shuffle

from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset files
# read in the following cells are reachable.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime —
# confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
### Loading Data
# Both splits are read from index-oriented JSON files on the mounted Drive.
df_data_train = pd.read_json(
    '/content/drive/MyDrive/thesis_files/df_data_train_31.05.json',
    orient='index',
)
df_data_test = pd.read_json(
    '/content/drive/MyDrive/thesis_files/df_data_test_31.05.json',
    orient='index',
)

In [None]:
### Shuffling the train and test frames

# sklearn's `shuffle` returns a shuffled copy instead of permuting its argument
# in place, so the result must be assigned back -- otherwise the call is a no-op.
# A fixed seed keeps the resulting row order reproducible across runs.
SHUFFLE_SEED = 42
df_data_train = shuffle(df_data_train, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_data_test = shuffle(df_data_test, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
def create_balanced_dataset(df, random_state=None):
    """Downsample every class to the size of the rarest class.

    Args:
        df: DataFrame with a 'label' column identifying the class of each row.
        random_state: Optional seed (or NumPy random state) forwarded to the
            sampler so the selection is reproducible. The default ``None``
            keeps the sampling non-deterministic.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the same number
        of rows for every label, with all columns of ``df`` (including
        'label') preserved. Rows are ordered group by group. An empty input
        yields an empty frame with the same columns.
    """
    # Group the DataFrame by the 'label' column
    grouped = df.groupby('label')
    group_sizes = grouped.size()

    # Nothing to balance; taking the minimum of an empty size table would fail.
    if group_sizes.empty:
        return df.iloc[0:0].reset_index(drop=True)

    # Find the smallest group size
    min_group_size = int(group_sizes.min())

    # Sample an equal number of rows from each group. GroupBy.sample is used
    # instead of GroupBy.apply because apply on a frame that still contains the
    # grouping column is deprecated and would eventually drop 'label'.
    balanced_df = grouped.sample(n=min_group_size, random_state=random_state)

    return balanced_df.reset_index(drop=True)


# Class-balanced variants of both splits (train first, then test).
df_data_train_balanced = create_balanced_dataset(df=df_data_train)
df_data_test_balanced = create_balanced_dataset(df=df_data_test)

In [None]:
# Preparing the training and testing datasets
# Inputs come from the 'snippet' column and targets from the 'label' column of
# the unbalanced frames.
X_train, y_train = df_data_train['snippet'], df_data_train['label']
X_test, y_test = df_data_test['snippet'], df_data_test['label']

In [None]:
# Alternative: prepare the training and testing datasets from the balanced frames instead (uncomment to use)
#X_train = df_data_train_balanced['snippet']
#y_train = df_data_train_balanced['label']
#X_test = df_data_test_balanced['snippet']
#y_test = df_data_test_balanced['label']

NameError: ignored

In [None]:
### Simple LSTM Classifier ###

max_len = 100        # every sequence is padded/truncated to this many tokens
embedding_dim = 100  # size of each learned token vector
vocab_size = 10000   # tokenizer vocabulary cap; also the Embedding input dimension

# The vocabulary is fitted on the training snippets only; words the tokenizer
# does not keep are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding (zeros are appended at the end; longer sequences are cut at the end)
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

# Building the model
model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as "no token" so the LSTM
    # skips padded timesteps. Without it, post-padding means the final LSTM state
    # is computed after reading the trailing zeros of every short snippet.
    Embedding(vocab_size, embedding_dim, input_length=max_len, mask_zero=True),
    LSTM(64),
    # Single sigmoid unit: one probability per snippet for the binary label.
    Dense(1, activation='sigmoid')
])

# Compiling (left disabled here; the model is compiled in a later cell with the focal loss)
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Inverse-ratio class weights: class 0 keeps weight 1, class 1 is weighted by
# the reciprocal of the mean label value.
mean_label = np.mean(y_train)
class_weights = {0: 1, 1: 1 / mean_label}

In [None]:
# Compute class weights based on the square root of the inverse class frequency

# Keep the label values returned by np.unique so every weight is keyed by the
# actual class id rather than by its position in the sorted array (the two only
# coincide when the labels are exactly 0..k-1).
classes, counts = np.unique(y_train, return_counts=True)
class_freq = counts / len(y_train)

# .item() / float() convert NumPy scalars to plain Python numbers, giving the
# dict plain keys/values for use as `class_weight` and for printing.
class_weights = {
    class_id.item(): float(np.sqrt(1 / freq))
    for class_id, freq in zip(classes, class_freq)
}

# Manual extra multiplier for class 1 (a factor of 1 leaves the weight unchanged)
class_weights[1] = class_weights[1] * 1

# Print the class weights
print(class_weights)



{0: 1.0087557179561957, 1: 7.606354545280391}


In [None]:
### focal loss
import tensorflow as tf
from tensorflow.keras import backend as K



def focal_loss(alpha=0.25, gamma=2.0):
    """Build a binary focal-loss function usable as a Keras ``loss``.

    Args:
        alpha: Weight of the label-1 term; the label-0 term is weighted by
            ``1 - alpha``.
        gamma: Exponent of the modulating factor ``(1 - p_t)^gamma``; larger
            values shrink the loss of examples that are already predicted well.

    Returns:
        A function ``focal_crossentropy(y_true, y_pred)`` that returns the
        focal loss averaged over the last axis only.
    """
    def focal_crossentropy(y_true, y_pred):
        """
        Focal Loss for binary classification
        FL(p_t) = -alpha_t * (1 - p_t)^{gamma} * log(p_t)
        where p is the predicted probability, p_t = p or 1 - p and
        alpha_t = alpha or 1 - alpha depending on if the label is 1 or 0, respectively.
        """
        # For label-1 entries keep the prediction; everywhere else substitute 1
        # so that the label-1 term contributes (almost) nothing there.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # For label-0 entries keep the prediction; everywhere else substitute 0
        # so that the label-0 term contributes (almost) nothing there.
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        epsilon = K.epsilon()
        # clip to prevent NaN's and Inf's
        pt_1 = K.clip(pt_1, epsilon, 1. - epsilon)
        pt_0 = K.clip(pt_0, epsilon, 1. - epsilon)

        fl1 = - alpha * tf.pow(1. - pt_1, gamma) * tf.math.log(pt_1)
        fl0 = -(1 - alpha) * tf.pow(pt_0, gamma) * tf.math.log(1. - pt_0)

        # Reduce over the last axis only. Collapsing the whole batch to a single
        # scalar here would make Keras broadcast that scalar against the
        # sample/class weights, so the weights would merely rescale the total
        # loss instead of re-weighting individual examples.
        return K.mean(fl1 + fl0, axis=-1)
    return focal_crossentropy



In [None]:

### Compiling and Training with focal loss
# alpha=0.75 puts the larger weight on the label-1 term of the focal loss.
focal_loss_fn = focal_loss(alpha=0.75, gamma=2.0)
model.compile(optimizer='adam', loss=focal_loss_fn, metrics=['accuracy'])


### can also add class weights here
# NOTE(review): the test split is passed as validation data in this call.
model.fit(
    train_padded,
    y_train,
    batch_size=32,
    epochs=10,
    class_weight=class_weights,
    validation_data=(test_padded, y_test),
    verbose=1,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fda40094eb0>

In [None]:
# Train the model with class weights
# NOTE(review): the model is not rebuilt or recompiled in this cell, so this call
# trains whatever `model` currently holds for another 10 epochs.
model.fit(
    train_padded,
    y_train,
    validation_data=(test_padded, y_test),
    class_weight=class_weights,
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f818a385870>

In [None]:
# Training the model
#model.fit(train_padded, y_train, epochs=10, validation_data=(test_padded, y_test))

In [None]:
# Sanity check: list the distinct label values present in the test targets.
unique_test_labels = np.unique(y_test)
print("Unique labels in test data:", unique_test_labels)


Unique labels in test data: [0 1]


In [None]:
# Display the shape of the test target vector.
y_test.shape

(1529440,)

In [None]:
from sklearn.metrics import classification_report

# Evaluate the model
# The model outputs are rounded to hard 0/1 predictions and flattened to 1-D
# so they can be compared against y_test.
probabilities = model.predict(test_padded)
predictions = np.round(probabilities).flatten()

loss, accuracy = model.evaluate(test_padded, y_test, verbose=0)

# Print the metrics
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Print classification report
report = classification_report(
    y_test,
    predictions,
    target_names=['Class 0', 'Class 1'],
)
print(report)


Test Loss: 0.012734010815620422
Test Accuracy: 0.9817001819610596
              precision    recall  f1-score   support

     Class 0       0.98      1.00      0.99    376911
     Class 1       0.41      0.11      0.18      6699

    accuracy                           0.98    383610
   macro avg       0.70      0.56      0.58    383610
weighted avg       0.97      0.98      0.98    383610

