In [1]:
import itertools
import warnings
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from keras import Input, Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout, Bidirectional, Layer, Reshape, TimeDistributed
from keras.optimizers import Adam, SGD, RMSprop
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def process_data(max_length, return_tokenizer=False):
    """Load the OLID training tweets and encode them as padded token-id sequences.

    The tab-separated training file inside ``OlidPreprcessed.zip`` is read
    without a header row. Only the ``tweet`` and ``subtask_a`` columns are
    used; ``subtask_a`` is mapped to 1 for ``'OFF'`` and 0 for ``'NOT'``.

    Args:
        max_length: Length every sequence is padded/truncated to
            (padding is appended at the end).
        return_tokenizer: When True, the fitted ``Tokenizer`` is returned as a
            third element so other text can be encoded with the same
            vocabulary. Defaults to False, which keeps the two-value return.

    Returns:
        ``(features, labels)`` or ``(features, labels, tokenizer)``.
        ``features`` is a 2-D integer array with ``max_length`` columns and
        ``labels`` is a 1-D integer array of 0/1 values of the same length.
    """
    column_names = ["id", "tweet", "subtask_a", "subtask_b", "subtask_c"]

    with zipfile.ZipFile('OlidPreprcessed.zip') as zip_f:
        with zip_f.open('OLID_Tain_ATUSER_URL_EmojiRemoved_Pedro.txt', 'r') as f:
            raw_df = pd.read_csv(f, sep='\t', header=None, names=column_names)

    # Build a new frame instead of mutating the one that was just read.
    labelled_df = raw_df[['tweet', 'subtask_a']].assign(
        subtask_a=raw_df['subtask_a'].map({'OFF': 1, 'NOT': 0})
    )

    # Any value other than 'OFF'/'NOT' (for example a header line that was read
    # as data because header=None) maps to NaN. Such rows are dropped rather
    # than being silently relabelled as class 0, which would inject
    # wrongly-labelled samples into the training set.
    labelled_df = labelled_df.dropna(subset=['subtask_a'])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(labelled_df['tweet'])
    sequences = tokenizer.texts_to_sequences(labelled_df['tweet'])

    features = np.asarray(pad_sequences(sequences, maxlen=max_length, padding='post'))
    labels = labelled_df['subtask_a'].astype(int).to_numpy()

    if return_tokenizer:
        return features, labels, tokenizer
    return features, labels

def process_test_data(max_length, tokenizer=None):
    """Load the OLID test-set-A tweets and encode them as padded token-id sequences.

    The test file is read with only the ``id`` and ``tweet`` columns, so it
    carries no ``subtask_a``/``subtask_b``/``subtask_c`` columns; the previous
    implementation tried to drop and map those columns and therefore always
    raised ``KeyError``.

    Args:
        max_length: Length every sequence is padded/truncated to
            (padding is appended at the end).
        tokenizer: A ``Tokenizer`` already fitted on the training tweets.
            Passing it keeps the word -> id mapping identical to the one the
            model was trained with. When omitted, a new tokenizer is fitted on
            the test tweets (with a warning), which yields ids that do not
            correspond to the training vocabulary.

    Returns:
        ``(features, labels)`` where ``features`` is a 2-D integer array with
        ``max_length`` columns and ``labels`` is ``None`` because the file, as
        read here, contains no label column.
    """
    column_names = ["id", "tweet"]

    with zipfile.ZipFile('OlidPreprcessed.zip') as zip_f:
        with zip_f.open('OLID_TEST_A_ATUSER_URL_EmojiRemoved_Pedro.txt', 'r') as f:
            test_df = pd.read_csv(f, sep='\t', header=None, names=column_names)

    tweets = test_df['tweet']

    if tokenizer is None:
        warnings.warn(
            "process_test_data: no tokenizer supplied; fitting a new one on the "
            "test tweets, so token ids will not match the training vocabulary.",
            stacklevel=2,
        )
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(tweets)

    sequences = tokenizer.texts_to_sequences(tweets)
    features = np.asarray(pad_sequences(sequences, maxlen=max_length, padding='post'))

    # No label column is present in the two-column test file.
    labels = None
    return features, labels

In [3]:
def create_model(input_shape, optimizer='adam', learning_rate=0.0001, dropout_rate=0.2, units=100):
    """Build and compile a stacked bidirectional-LSTM binary classifier.

    Architecture: three ``Bidirectional(LSTM(units))`` layers (the first two
    return full sequences, the last one carries an L2 kernel regulariser),
    followed by ``Dense(64, relu)``, ``Dropout(dropout_rate)`` and a single
    sigmoid output unit. The model is compiled with binary cross-entropy loss
    and the accuracy metric.

    Args:
        input_shape: Shape of one sample, excluding the batch dimension.
        optimizer: One of ``'sgd'``, ``'adam'`` or ``'rmsprop'``.
        learning_rate: Learning rate handed to the chosen optimizer.
        dropout_rate: Rate of the dropout layer before the output unit.
        units: Number of units in each LSTM layer.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names.
    """
    l2_coefficient = 0.001

    # Factories are used so that only the requested optimizer is instantiated.
    optimizer_factories = {
        'sgd': lambda: SGD(learning_rate=learning_rate),
        'adam': lambda: Adam(learning_rate=learning_rate, weight_decay=0.01),
        'rmsprop': lambda: RMSprop(learning_rate=learning_rate),
    }
    # Fail fast on an unknown name instead of compiling with optimizer=None.
    if optimizer not in optimizer_factories:
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; expected one of {sorted(optimizer_factories)}"
        )

    model = Sequential()

    # An explicit Input layer replaces the `input_shape=` keyword on the first
    # layer, which recent Keras versions warn about.
    model.add(Input(shape=input_shape))
    model.add(Bidirectional(LSTM(units, return_sequences=True)))
    model.add(Bidirectional(LSTM(units, return_sequences=True)))
    model.add(Bidirectional(LSTM(units, kernel_regularizer=l2(l2_coefficient))))

    model.add(Dense(64, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=optimizer_factories[optimizer](),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [4]:
def train_model(model, X_train, y_train, X_val, y_val, epochs, batch_size, verbosity, model_name=None):
    """Fit a Keras model with early stopping, plot the loss curves and save the weights.

    Args:
        model: A compiled Keras model.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels, monitored for early stopping.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        verbosity: ``verbose`` value passed to ``model.fit``.
        model_name: File the trained weights are written to. The
            ``.weights.h5`` suffix is appended when missing. When ``None``
            nothing is saved.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # restore_best_weights makes the saved model the one with the lowest
    # validation loss rather than whatever the final epoch produced.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=1,
                                   restore_best_weights=True)
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping], verbose=verbosity)

    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    print(min(train_loss))
    print(min(val_loss))

    # Plot training and validation loss. A separate name is used for the
    # x-axis so the `epochs` argument is not shadowed.
    epoch_axis = range(1, len(train_loss) + 1)
    plt.plot(epoch_axis, train_loss, 'b', label='Training Loss')
    plt.plot(epoch_axis, val_loss, 'r', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    if model_name is not None:
        # `model` is a Keras model: it has no PyTorch-style state_dict(), so
        # the weights are stored with Keras' own API. Recent Keras versions
        # only accept weight files whose name ends in '.weights.h5'.
        weights_path = str(model_name)
        if not weights_path.endswith('.weights.h5'):
            weights_path += '.weights.h5'
        model.save_weights(weights_path)

    return history

In [5]:
def test_model(model, X_test, y_test, model_name):
    """Load saved weights into a Keras model and report loss/accuracy on test data.

    Args:
        model: A compiled Keras model whose architecture matches the saved weights.
        X_test, y_test: Test features and labels.
        model_name: Weights file to load. The ``.weights.h5`` suffix is
            appended when missing.

    Returns:
        ``(loss, accuracy)`` as returned by ``model.evaluate``; this assumes
        the model was compiled with exactly one metric (accuracy).
    """
    # `model` is a Keras model, so PyTorch's load_state_dict/torch.load do not
    # apply; the weights are read back with Keras' own API. Recent Keras
    # versions only accept weight files whose name ends in '.weights.h5'.
    weights_path = str(model_name)
    if not weights_path.endswith('.weights.h5'):
        weights_path += '.weights.h5'
    model.load_weights(weights_path)

    loss, accuracy = model.evaluate(X_test, y_test)
    print('Test Loss:', loss)
    print('Test Accuracy:', accuracy)
    return loss, accuracy

In [6]:
def grid_search(input_shape,X_train,y_train,X_valid,y_valid,epochs,batch_size):
    """Train one model for every combination of a small hyper-parameter grid.

    For each (optimizer, units, dropout_rate, learning_rate) combination the
    configuration is printed, a fresh model is created with ``create_model``
    and trained silently with ``train_model``.

    Args:
        input_shape: Per-sample input shape forwarded to ``create_model``.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        epochs: Maximum number of epochs for each run.
        batch_size: Mini-batch size for each run.
    """
    param_grid = {
        'units': [50, 150],  # Number of LSTM units
        'dropout_rate': [0.2, 0.4],  # Dropout rate
        'optimizer': ['adam', 'sgd'],  # Optimizer
        'learning_rate': [0.001, 0.01],  # Learning rate
    }

    # product() varies the right-most factor fastest, which reproduces the
    # order of the equivalent nested loops: optimizer > units > dropout > lr.
    combinations = itertools.product(
        param_grid['optimizer'],
        param_grid['units'],
        param_grid['dropout_rate'],
        param_grid['learning_rate'],
    )

    for opt, unit, drop, lr in combinations:
        print(f"Model: {opt} {unit} {drop} {lr}")

        model = create_model(input_shape=input_shape, units=unit, dropout_rate=drop,
                             learning_rate=lr, optimizer=opt)

        # train_model takes a model_name argument; give every configuration
        # its own file so runs do not overwrite each other.
        train_model(model=model, epochs=epochs, batch_size=batch_size,
                    X_train=X_train, y_train=y_train,
                    X_val=X_valid, y_val=y_valid,
                    verbosity=0,
                    model_name=f"model_{opt}_{drop}_{unit}_{lr}.weights.h5")


In [7]:
# --- Run configuration -------------------------------------------------------
max_length = 100   # every tweet is padded/truncated to this many tokens
vocab_size = 1000  # NOTE(review): not referenced by the cells shown here
epochs = 300       # upper bound on training epochs
batch_size = 64

# --- Load and encode the training data ---------------------------------------
X, y = process_data(max_length)

# --- 70 % train, then the 30 % hold-out is halved into validation and test ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_val, y_val, test_size=0.5, random_state=42
)

# Add a trailing axis of size 1 so each sample has shape (timesteps, 1).
X_train_reshaped = np.expand_dims(X_train, axis=-1)

# Per-sample shape: everything after the batch dimension.
input_shape = tuple(X_train_reshaped.shape[1:])

In [None]:
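# The grid search is left disabled here; uncomment the next line to re-run it.
#grid_search(input_shape,X_train,y_train,X_valid,y_valid,epochs,batch_size)
# Best parameters found (optimizer, units, dropout_rate, learning_rate):
# adam 150 0.2 0.001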

In [8]:
# Train the configuration selected by the grid search: adam, 150 units,
# dropout 0.2, learning rate 0.001.
model = create_model(input_shape=input_shape, units=150, dropout_rate=0.2, learning_rate=0.001,
                     optimizer='adam')
# File name pattern: model_<optimizer>_<dropout>_<units>_<learning rate>.
# The learning rate in the name now matches the value used above (0.001), and
# the '.weights.h5' suffix is the one Keras expects for weight files.
model_name = 'model_adam_0.2_150_0.001.weights.h5'
train_model(model=model, epochs=epochs, batch_size=batch_size, X_train=X_train, y_train=y_train,
            X_val=X_valid,
            y_val=y_valid,verbosity=1,model_name=model_name)


  super().__init__(**kwargs)


Epoch 1/300
[1m138/145[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m7s[0m 1s/step - accuracy: 0.6525 - loss: 1.0210

KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on the held-out split created earlier.
test_model(
    model,
    X_test=X_test,
    y_test=y_test,
    model_name=model_name,
)