# RNN with first 200 words
By grace

## preprocess

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
import json

data2 = []

data1 = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data1.append(json.loads(line.strip()))



# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))

test = []

# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/test_set.json", 'r') as file:
    for line in file:
        test.append(json.loads(line.strip()))


In [3]:
len(data2)

14900

## RNN build

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

# Parameters
SEQUENCE_LENGTH = 200
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
OUTPUT_SIZE = 2  # binary classification
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001

# Assuming data2 is your dataset
texts = [item['text'] for item in data1]
labels = [item['label'] for item in data1]

# Pad sequences to a consistent length
def pad_sequences(sequences, maxlen=SEQUENCE_LENGTH):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += [5000] * (maxlen - len(seq))
        padded_sequences.append(seq[:maxlen])
    return padded_sequences

texts = pad_sequences(texts)

# Convert to tensors
X = torch.tensor(texts, dtype=torch.long)
y = torch.tensor(labels, dtype=torch.long)
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [5]:
X_train.shape

torch.Size([15600, 200])

In [6]:
max([max(seq) for seq in texts])

5000

In [8]:

# Splitting data (80% train and 20% test)

# Define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        out = self.fc(rnn_out[:, -1, :])
        return out

# Assuming you have a GPU, otherwise replace 'cuda' with 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_token_value = max([max(seq) for seq in texts])

# Create an instance of the RNN and move it to the device
model = RNNClassifier(max_token_value + 1, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training the model
for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train.to(device))
    loss = criterion(outputs, y_train.to(device))
    loss.backward()
    optimizer.step()

    # Calculate accuracy
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_train.to(device)).sum().item()
    accuracy = correct / len(y_train)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Accuracy: {accuracy*100:.2f}%')

# Testing the model
model.eval()
with torch.no_grad():
    outputs = model(X_test.to(device))
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_test.to(device)).sum().item()
    accuracy = correct / len(y_test)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')


: 

## optimisation

In [17]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Prepare data loaders
dataset = TensorDataset(X, y)
train_data, test_data = random_split(dataset, [train_size, len(X) - train_size])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

class ImprovedRNNClassifier(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, dropout=0.5):
        super(ImprovedRNNClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, batch_first=True, dropout=dropout)  # using LSTM
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        out = self.dropout(rnn_out[:, -1, :])
        out = self.fc(out)
        return out

# Training the model
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct_train = 0
    for batch_texts, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_texts.to(device))
        loss = criterion(outputs, batch_labels.to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # gradient clipping
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == batch_labels.to(device)).sum().item()
        
    train_accuracy = 100 * correct_train / len(train_loader.dataset)
    avg_loss = total_loss / len(train_loader)

    # Evaluate on the validation set
    model.eval()
    correct_val = 0
    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            outputs = model(batch_texts.to(device))
            _, predicted = torch.max(outputs, 1)
            correct_val += (predicted == batch_labels.to(device)).sum().item()
            
    val_accuracy = 100 * correct_val / len(test_loader.dataset)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Val Accuracy: {val_accuracy:.2f}%')



Epoch [1/10], Loss: 0.6984, Train Accuracy: 50.07%, Val Accuracy: 51.28%
Epoch [2/10], Loss: 0.7134, Train Accuracy: 50.44%, Val Accuracy: 51.28%
Epoch [3/10], Loss: 0.7040, Train Accuracy: 49.83%, Val Accuracy: 51.26%
Epoch [4/10], Loss: 0.6978, Train Accuracy: 50.29%, Val Accuracy: 48.72%
Epoch [5/10], Loss: 0.7052, Train Accuracy: 49.04%, Val Accuracy: 48.72%
Epoch [6/10], Loss: 0.6994, Train Accuracy: 49.90%, Val Accuracy: 51.31%
Epoch [7/10], Loss: 0.7054, Train Accuracy: 50.49%, Val Accuracy: 51.26%
Epoch [8/10], Loss: 0.7011, Train Accuracy: 50.38%, Val Accuracy: 51.31%
Epoch [9/10], Loss: 0.7019, Train Accuracy: 50.17%, Val Accuracy: 48.72%
Epoch [10/10], Loss: 0.7018, Train Accuracy: 49.42%, Val Accuracy: 51.31%


## process testing data

In [18]:
test

[{'id': 0,
  'text': [59,
   2,
   3434,
   1013,
   823,
   2,
   887,
   6,
   2375,
   0,
   34,
   43,
   584,
   18,
   0,
   0,
   6,
   686,
   1,
   56,
   43,
   2881,
   1107,
   0,
   287,
   1495,
   9,
   2,
   1013,
   71,
   447,
   2,
   3519,
   0,
   118,
   13,
   10,
   532,
   81,
   1,
   13,
   8,
   15,
   329,
   10,
   0,
   1,
   0,
   1,
   0,
   1,
   5,
   4083,
   1372,
   2938,
   3,
   7,
   13,
   41,
   0,
   120,
   156,
   4752,
   1013,
   1,
   30,
   4083,
   11,
   269,
   661,
   114,
   4,
   2,
   3217,
   6,
   713,
   1,
   944,
   57,
   4,
   0,
   551,
   1,
   39,
   329,
   17,
   0,
   1565,
   0,
   1,
   11,
   240,
   5,
   802,
   6,
   2,
   0,
   1683,
   0,
   6,
   0,
   7,
   6,
   2,
   0,
   6,
   0,
   3,
   74,
   0,
   422,
   22,
   2,
   0,
   0,
   1,
   11,
   10,
   186,
   497,
   842,
   3490,
   144,
   33,
   2,
   0,
   6,
   451,
   3,
   202,
   73,
   11,
   55,
   774,
   608,
   1,
   11,
   32,
   1122,
 

In [19]:
SEQUENCE_LENGTH = 200


# Assuming data2 is your dataset
texts = [item['text'] for item in test]
#labels = [item['label'] for item in test]

# Pad sequences to a consistent length
def pad_sequences(sequences, maxlen=SEQUENCE_LENGTH):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += [5000] * (maxlen - len(seq))
        padded_sequences.append(seq[:maxlen])
    return padded_sequences

texts = pad_sequences(texts)
len(texts[200])

200

In [20]:
# Convert to tensors
X_test = torch.tensor(texts, dtype=torch.long)



In [21]:
model.eval()

# Get predictions
with torch.no_grad():
    logits = model(X_test)

predictions = torch.argmax(logits, dim=1).tolist()

# Create the desired output format
output = []
for idx, pred_class in enumerate(predictions):
    output.append({'id': idx, 'class': pred_class})

# Save to CSV
import csv

with open('../data/results/RNN_dm2_predictions.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'class']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for row in output:
        writer.writerow(row)

## Optimise further

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import json
from sklearn.model_selection import train_test_split

# Load datasets
import json

data2 = []

data1 = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data1.append(json.loads(line.strip()))


# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))

# Combine both datasets for now
#data = data1 + data2

# Separate text and labels
texts = [item['text'] for item in data2]
labels = [item['label'] for item in data2]

# Oversampling (just duplicating the minority class for now)
human_texts = [item['text'] for item in data2 if item['label'] == 1]
human_labels = [1] * len(human_texts)

texts += human_texts * 5  # Adjust this value to balance the dataset
labels += human_labels * 5

SEQUENCE_LENGTH = 200  # Arbitrary. 
VOCAB_SIZE = 5000
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
OUTPUT_SIZE = 2
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001

# Pad sequences to a consistent length
def pad_sequences(sequences, maxlen=SEQUENCE_LENGTH):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += [5000] * (maxlen - len(seq))  # Using 5000 as padding token
        padded_sequences.append(seq[:maxlen])
    return padded_sequences

texts = pad_sequences(texts)

# Convert to tensors
X = torch.tensor(texts, dtype=torch.long)
y = torch.tensor(labels, dtype=torch.long)

# Split data (80% train and 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

# RNN Model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding( + 1, embedding_size) # +1 to account for the padding token
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        out = self.fc(rnn_out[:, -1, :])
        return out

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNClassifier(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    
    outputs = model(X_train.to(device))
    loss = criterion(outputs, y_train.to(device))
    loss.backward()
    optimizer.step()

    # Calculate accuracy
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_train.to(device)).sum().item()
    accuracy = correct / len(y_train)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Accuracy: {accuracy*100:.2f}%')

    # Validation
    model.eval()
    val_outputs = model(X_val.to(device))
    _, predicted = torch.max(val_outputs, 1)
    correct = (predicted == y_val.to(device)).sum().item()
    val_accuracy = correct / len(y_val)
    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Val Accuracy: {val_accuracy*100:.2f}%')



IndexError: index out of range in self