# RNN with first 200 words
By grace

## preprocess

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim


In [32]:
import json

data2 = []

# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))



# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))

test = []

# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/test_set.json", 'r') as file:
    for line in file:
        test.append(json.loads(line.strip()))


## RNN build

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim

# Parameters
SEQUENCE_LENGTH = 200
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
OUTPUT_SIZE = 2  # binary classification
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001

# Assuming data2 is your dataset
texts = [item['text'] for item in data2]
labels = [item['label'] for item in data2]

# Pad sequences to a consistent length
def pad_sequences(sequences, maxlen=SEQUENCE_LENGTH):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += [0] * (maxlen - len(seq))
        padded_sequences.append(seq[:maxlen])
    return padded_sequences

texts = pad_sequences(texts)

# Convert to tensors
X = torch.tensor(texts, dtype=torch.long)
y = torch.tensor(labels, dtype=torch.long)

# Splitting data (80% train and 20% test)
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        out = self.fc(rnn_out[:, -1, :])
        return out

# Assuming you have a GPU, otherwise replace 'cuda' with 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_token_value = max([max(seq) for seq in texts])

# Create an instance of the RNN and move it to the device
model = RNNClassifier(max_token_value + 1, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training the model
for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train.to(device))
    loss = criterion(outputs, y_train.to(device))
    loss.backward()
    optimizer.step()

    # Calculate accuracy
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_train.to(device)).sum().item()
    accuracy = correct / len(y_train)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Accuracy: {accuracy*100:.2f}%')

# Testing the model
model.eval()
with torch.no_grad():
    outputs = model(X_test.to(device))
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_test.to(device)).sum().item()
    accuracy = correct / len(y_test)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch [1/10], Loss: 0.6874, Accuracy: 55.96%
Epoch [2/10], Loss: 1.0889, Accuracy: 56.12%
Epoch [3/10], Loss: 0.7308, Accuracy: 56.08%
Epoch [4/10], Loss: 0.7617, Accuracy: 44.11%
Epoch [5/10], Loss: 0.8252, Accuracy: 44.04%
Epoch [6/10], Loss: 0.7520, Accuracy: 44.12%
Epoch [7/10], Loss: 0.6893, Accuracy: 56.03%
Epoch [8/10], Loss: 0.7012, Accuracy: 56.20%
Epoch [9/10], Loss: 0.7342, Accuracy: 56.24%


In [None]:
model

RNNClassifier(
  (embedding): Embedding(5000, 128)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

## process testing data

In [None]:
test

[{'id': 0,
  'text': [59,
   2,
   3434,
   1013,
   823,
   2,
   887,
   6,
   2375,
   0,
   34,
   43,
   584,
   18,
   0,
   0,
   6,
   686,
   1,
   56,
   43,
   2881,
   1107,
   0,
   287,
   1495,
   9,
   2,
   1013,
   71,
   447,
   2,
   3519,
   0,
   118,
   13,
   10,
   532,
   81,
   1,
   13,
   8,
   15,
   329,
   10,
   0,
   1,
   0,
   1,
   0,
   1,
   5,
   4083,
   1372,
   2938,
   3,
   7,
   13,
   41,
   0,
   120,
   156,
   4752,
   1013,
   1,
   30,
   4083,
   11,
   269,
   661,
   114,
   4,
   2,
   3217,
   6,
   713,
   1,
   944,
   57,
   4,
   0,
   551,
   1,
   39,
   329,
   17,
   0,
   1565,
   0,
   1,
   11,
   240,
   5,
   802,
   6,
   2,
   0,
   1683,
   0,
   6,
   0,
   7,
   6,
   2,
   0,
   6,
   0,
   3,
   74,
   0,
   422,
   22,
   2,
   0,
   0,
   1,
   11,
   10,
   186,
   497,
   842,
   3490,
   144,
   33,
   2,
   0,
   6,
   451,
   3,
   202,
   73,
   11,
   55,
   774,
   608,
   1,
   11,
   32,
   1122,
 

In [None]:
SEQUENCE_LENGTH = 200


# Assuming data2 is your dataset
texts = [item['text'] for item in test]
#labels = [item['label'] for item in test]

# Pad sequences to a consistent length
def pad_sequences(sequences, maxlen=SEQUENCE_LENGTH):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += [0] * (maxlen - len(seq))
        padded_sequences.append(seq[:maxlen])
    return padded_sequences

texts = pad_sequences(texts)

In [None]:
# Convert to tensors
X_test = torch.tensor(texts, dtype=torch.long)



In [None]:
texts

[[59,
  2,
  3434,
  1013,
  823,
  2,
  887,
  6,
  2375,
  0,
  34,
  43,
  584,
  18,
  0,
  0,
  6,
  686,
  1,
  56,
  43,
  2881,
  1107,
  0,
  287,
  1495,
  9,
  2,
  1013,
  71,
  447,
  2,
  3519,
  0,
  118,
  13,
  10,
  532,
  81,
  1,
  13,
  8,
  15,
  329,
  10,
  0,
  1,
  0,
  1,
  0,
  1,
  5,
  4083,
  1372,
  2938,
  3,
  7,
  13,
  41,
  0,
  120,
  156,
  4752,
  1013,
  1,
  30,
  4083,
  11,
  269,
  661,
  114,
  4,
  2,
  3217,
  6,
  713,
  1,
  944,
  57,
  4,
  0,
  551,
  1,
  39,
  329,
  17,
  0,
  1565,
  0,
  1,
  11,
  240,
  5,
  802,
  6,
  2,
  0,
  1683,
  0,
  6,
  0,
  7,
  6,
  2,
  0,
  6,
  0,
  3,
  74,
  0,
  422,
  22,
  2,
  0,
  0,
  1,
  11,
  10,
  186,
  497,
  842,
  3490,
  144,
  33,
  2,
  0,
  6,
  451,
  3,
  202,
  73,
  11,
  55,
  774,
  608,
  1,
  11,
  32,
  1122,
  22,
  2,
  4083,
  3,
  0,
  3,
  2671,
  4744,
  136,
  120,
  1013,
  3,
  270,
  4,
  945,
  157,
  2620,
  1,
  59,
  2,
  0,
  3782,
  2671,
  0,
  0,
 

In [None]:
model.eval()

# Get predictions
with torch.no_grad():
    logits = model(X_test)

predictions = torch.argmax(logits, dim=1).tolist()

# Create the desired output format
output = []
for idx, pred_class in enumerate(predictions):
    output.append({'id': idx, 'class': pred_class})

# Save to CSV
import csv

with open('predictions.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'class']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for row in output:
        writer.writerow(row)