In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras import Model


In [None]:
# Function to load data with different encodings
def load_data(file_path, encodings=('utf-8', 'latin1')):
    '''
    Read a CSV file into a DataFrame, trying each encoding in order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.
    encodings : iterable of str, optional
        Encodings to try, in order. The default tries UTF-8 first and then
        falls back to latin1. ('ISO-8859-1' is just another name for latin1,
        so listing both adds nothing.)

    Returns
    -------
    pandas.DataFrame
        The frame parsed with the first encoding that decodes the file.

    Raises
    ------
    ValueError
        If none of the encodings can decode the file. The last decoding
        error is attached as the cause.
    '''
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember why this attempt failed and move on to the next encoding
            last_error = err
    raise ValueError("Failed to decode the file with tried encodings.") from last_error

In [None]:
# Read the CSV file. The path is relative to the notebook's working directory;
# load_data tries several text encodings in turn until one decodes the file.
df = load_data('df_all_group3.csv')

In [None]:
# Rename the columns positionally to the names used by the rest of the notebook.
# NOTE(review): assumes the CSV has exactly these four columns in this order
# (id, label, text, group) — confirm against the data file.
df.columns = ["id", "label", "text", "group"]

In [None]:
# Preview the first rows only instead of rendering the whole frame
df.head()

In [None]:
# One-hot encode the labels as integer indicator columns. df_y shares df's row
# index, which is what later lets targets be looked up with df_y.loc[...].
df_y = pd.get_dummies(df.label, dtype=int)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Per-label 60/20/20 split: every label is split on its own so that train,
# test and eval all keep the same class proportions.
EVAL_CAP_PER_LABEL = 50  # keep at most this many eval rows for each label

train_parts = []
test_parts = []
eval_parts = []

for label in df['label'].unique():
    label_rows = df[df.label == label]

    # 60% of this label's rows go to training
    train_part, holdout_part = train_test_split(
        label_rows,
        train_size=0.6,
        random_state=42
    )

    # The remaining 40% is halved into 20% test and 20% eval
    test_part, eval_part = train_test_split(
        holdout_part,
        test_size=0.5,
        random_state=42
    )

    # Cap the eval rows for this label. Sampling is done WITHOUT replacement:
    # the requested size never exceeds the available rows, and sampling with
    # replacement would only duplicate some validation rows and drop others.
    eval_part = eval_part.sample(
        n=min(len(eval_part), EVAL_CAP_PER_LABEL),
        random_state=10
    )

    train_parts.append(train_part)
    test_parts.append(test_part)
    eval_parts.append(eval_part)

# Concatenate the per-label pieces; only the training data is shuffled
X_train = pd.concat(train_parts).sample(frac=1, random_state=10)
X_test = pd.concat(test_parts)
X_eval = pd.concat(eval_parts)

# Sanity check: no row may end up in more than one split
assert not set(X_train.index) & set(X_test.index)
assert not set(X_train.index) & set(X_eval.index)
assert not set(X_test.index) & set(X_eval.index)

# Look the one-hot targets up by the original row index. This must happen
# before the indices are reset below, while they still match df_y.
y_train = df_y.loc[X_train.index].reset_index(drop=True)
y_test = df_y.loc[X_test.index].reset_index(drop=True)
y_eval = df_y.loc[X_eval.index].reset_index(drop=True)

# Reset index for the final datasets
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
X_eval = X_eval.reset_index(drop=True)

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
assert len(X_eval) == len(y_eval)


In [None]:
max_words = 10_000
max_len = 128

def tokenize_pad_sequences(text, tokenizer=None):
    '''
    Turn a collection of texts into equal-length sequences of integers.

    If no tokenizer is given, a new one (limited to `max_words` words,
    lower-casing, splitting on spaces) is created and fitted on `text`.
    The texts are then mapped to integer sequences and padded at the end
    up to `max_len`.

    Returns a tuple (padded_sequences, tokenizer), where `tokenizer` is
    either the one passed in or the newly fitted one.
    '''
    if tokenizer is None:
        # No vocabulary supplied: learn one from the given texts
        tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
        tokenizer.fit_on_texts(text)

    # Words -> integer ids, then bring every sequence to the same length
    sequences = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded, tokenizer


# Fit the vocabulary on the training texts only, so that nothing from the
# eval/test texts influences the word index.
tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(X_train.text)

# Reuse that single fitted tokenizer for every split. Passing it in for the
# training data as well avoids fitting a second, identical tokenizer.
# NOTE: X_train / X_eval / X_test are rebound from DataFrames to padded integer
# arrays here, so re-running this cell requires re-running the split cell first.
X_train, _ = tokenize_pad_sequences(X_train.text, tokenizer)
X_eval, _ = tokenize_pad_sequences(X_eval.text, tokenizer)
X_test, _ = tokenize_pad_sequences(X_test.text, tokenizer)

In [None]:
# The embedding table must cover every id the tokenizer can emit, and the
# input length must equal the padded sequence length. Both are tied to the
# tokenization settings (max_words, max_len) instead of repeating the literals.
vocab_size = max_words
embedding_size = 64
# One output unit per one-hot target column
num_classes = y_train.shape[1]

inputs = Input(shape=(max_len,))
# The sequence length already comes from `inputs`; the deprecated
# `input_length` argument is therefore not passed.
embedding_layer = Embedding(vocab_size, embedding_size)(inputs)
conv1d_layer = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(embedding_layer)
maxpooling_layer = MaxPooling1D(pool_size=2)(conv1d_layer)
bidirectional_lstm_layer = Bidirectional(LSTM(32))(maxpooling_layer)
dropout_layer = Dropout(0.4)(bidirectional_lstm_layer)
# Softmax over the classes, matching the categorical cross-entropy loss below
outputs = Dense(num_classes, activation='softmax')(dropout_layer)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

batch_size = 32
epochs = 100

# Stop when validation accuracy has not improved for 10 epochs and restore
# the weights from the best epoch.
early_stop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    validation_data=(X_eval, y_eval),
                    batch_size=batch_size, epochs=epochs, verbose=1,
                    shuffle=True,
                    callbacks=[early_stop])

In [None]:
# Run the trained model on the held-out test sequences. The model's last layer
# is a softmax, so each row of y_pred holds one score per class.
y_pred = model.predict(X_test)

In [None]:
def evaluate(y_true, y_pred, labels=(0, 1, 2, 3)):
    '''
    Print evaluation metrics for a set of class predictions.

    Printed, in order: overall accuracy, accuracy restricted to each true
    label, the classification report, and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        True class ids, one per sample.
    y_pred : array-like
        Predicted class ids, aligned with `y_true`.
    labels : sequence, optional
        Row/column order of the confusion matrix. The default keeps the
        four-class layout this function has always used.

    Returns
    -------
    None
        All results are printed.
    '''
    # Work on plain arrays so the per-label selection below is positional,
    # whatever array-like type the caller passes in.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy; np.unique yields the labels in sorted order, so the
    # printout is deterministic.
    for label in np.unique(y_true):
        is_label = y_true == label
        label_accuracy = accuracy_score(y_true[is_label], y_pred[is_label])
        print(f'Accuracy for label {label}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=list(labels))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Collapse the one-hot targets and the softmax scores to class ids. y_test is
# converted to a plain array first so the argmax does not depend on how NumPy
# wraps results for DataFrame inputs.
evaluate(y_true=np.argmax(np.asarray(y_test), axis=1), y_pred=np.argmax(y_pred, axis=1))