In [15]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
pd.set_option('display.max_columns', None)

In [16]:
def getWordDict(paths=('data/train.csv', 'data/dev.csv', 'data/test.csv')):
    """Build a token -> index vocabulary from the 'text' column of CSV files.

    Tokens are obtained by whitespace splitting (``str.split()``) and receive
    consecutive integer ids, starting at 0, in order of first appearance. The
    files are scanned in the order given, so the default reproduces the
    train -> dev -> test ordering.

    Args:
        paths: Iterable of CSV file paths; every file must contain a 'text'
            column. Defaults to the three dataset splits under ``data/``.

    Returns:
        dict mapping each distinct token (str) to its integer index.
    """
    word_dict = {}
    for path in paths:
        # Only the 'text' column is needed. Iterating the column directly
        # avoids building a Series per row as DataFrame.iterrows() does.
        texts = pd.read_csv(path, usecols=['text'])['text']
        for text in texts:
            for token in text.split():
                # len(word_dict) is always the next unused id.
                word_dict.setdefault(token, len(word_dict))
    return word_dict

def validate(loader, model):
    """Return the accuracy of ``model`` over ``loader`` as a percentage.

    For every (inputs, labels) batch the model outputs are rounded with
    ``torch.round`` and compared element-wise against the labels. Gradient
    tracking is disabled for the whole pass.

    Args:
        loader: Iterable yielding (inputs, labels) batches.
        model: Callable mapping an input batch to an output tensor.

    Returns:
        float: 100 * (matching elements / number of samples).
    """
    hits, seen = 0, 0
    with torch.no_grad():
        for features, targets in loader:
            guesses = torch.round(model(features).data)
            seen += targets.size(0)
            hits += torch.eq(guesses, targets).sum().item()

    return 100.0 * (hits / seen)

def predict(loader, model):
    """Collect the rounded model outputs for every sample in ``loader``.

    The labels carried by each batch are ignored. Each output row is rounded
    with ``torch.round`` and converted to a Python int.

    Args:
        loader: Iterable yielding (inputs, labels) batches.
        model: Callable mapping an input batch to an output tensor with one
            value per row.

    Returns:
        list[int]: One prediction per sample, in loader order.
    """
    results = []
    with torch.no_grad():
        for features, _ in loader:
            rounded = torch.round(model(features).data)
            results.extend(int(row.item()) for row in rounded)
    return results

def saveAnswerToCsv(test_loader, model, path):
    """Write the model's predictions for the test set to ``path`` as CSV.

    'data/test.csv' is read, its 'Category' column is set to the predictions
    returned by ``predict``, the 'text' column is removed and the remaining
    columns are saved without the index.

    Args:
        test_loader: Loader passed on to ``predict``; it must yield exactly as
            many samples as 'data/test.csv' has rows.
        model: Model passed on to ``predict``.
        path: Destination CSV file.
    """
    submission = (
        pd.read_csv('data/test.csv')
        .assign(Category=predict(test_loader, model))
        .drop(columns=['text'])
    )
    submission.to_csv(path, index=False)

In [17]:
class CustomDataset(Dataset):
    """Dataset of (bag-of-words vector, label) pairs read from a CSV file.

    The CSV must provide a 'text' column (whitespace-separated tokens) and a
    numeric 'Category' column. Texts are stored raw and vectorised lazily in
    ``__getitem__``, so only one dense vector per requested sample is built.
    """

    def __init__(self, path, device, word_dict):
        """Load the CSV at ``path``.

        Args:
            path: CSV file with 'text' and 'Category' columns.
            device: torch device every returned tensor is moved to.
            word_dict: Mapping token -> column index of the bag-of-words
                vector; its length is the vector size.
        """
        self.device = device
        self.word_dict = word_dict

        raw = pd.read_csv(path)
        x_raw = raw['text']
        y_raw = raw['Category'].to_numpy()

        # Despite its name, `bows` holds the raw texts; they are converted to
        # bag-of-words vectors on access.
        self.bows = x_raw
        # Float labels shaped (N, 1).
        self.labels = self.toTorch(y_raw).view(-1, 1)

    def __getitem__(self, index):
        """Return (bag-of-words float tensor, label tensor) for one sample."""
        # Positional lookup: independent of whatever index the Series carries.
        bow = self.bowTransform(self.bows.iloc[index])
        bow = self.toTorch(bow)

        return bow, self.labels[index]

    def __len__(self):
        """Number of samples (rows of the CSV)."""
        return len(self.labels)

    def bowTransform(self, text):
        """Encode ``text`` as a binary presence vector over the vocabulary.

        Args:
            text: String of whitespace-separated tokens.

        Returns:
            numpy array of length ``len(self.word_dict)`` with 1 at the index
            of every token that occurs in ``text`` and 0 elsewhere.

        Raises:
            KeyError: If a token is not in ``self.word_dict``.
        """
        result = np.zeros(len(self.word_dict))
        for token in text.split():
            # Use the instance's own vocabulary, not a module-level global,
            # so the class works with any dictionary passed to __init__.
            result[self.word_dict[token]] = 1
        return result

    def toTorch(self, array):
        """Convert a numpy array to a float32 tensor on ``self.device``."""
        result = torch.from_numpy(array).type(torch.FloatTensor).to(self.device)
        return result

class NeuralNet(nn.Module):
    """Fully connected binary classifier.

    Architecture: Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, hidden_size) -> ReLU -> Linear(hidden_size, 1) ->
    Sigmoid. The output is one value in (0, 1) per input row.
    """

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: Number of features of each input row.
            hidden_size: Width of both hidden layers.
        """
        super().__init__()
        self.input_size = input_size
        self.relu = nn.ReLU()
        # Layers are created in l1 -> l2 -> l3 order; this fixes both the
        # state_dict key order and the order in which random initial weights
        # are drawn.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to a column of sigmoid outputs."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.sigmoid(self.l3(hidden))

In [18]:
# Parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is available
hidden_size = 500      # width of both hidden layers of NeuralNet
num_epochs = 5         # full passes over the training set
batch_size = 1000      # samples per DataLoader batch
learning_rate = 0.001  # step size handed to the Adam optimizer
model_name = 'model6'  # base name of the saved model ('models/<name>.pth') and answer file ('ans/<name>.csv')

# Seed PyTorch's RNG so weight initialisation and the shuffling of the training
# batches repeat on every Restart & Run All. Bit-exact repeatability on CUDA is
# not guaranteed by the seed alone.
seed = 42
torch.manual_seed(seed)

In [19]:
# Get word dictionary
# The vocabulary size is both the length of every bag-of-words vector and the
# width of the network's input layer, so it has to be known before the datasets
# and the model are built.
word_dict = getWordDict()
input_size = len(word_dict)

In [20]:
# Build the datasets and their batch loaders
# train_all = True  -> fit on 'data/all.csv' (= 'data/train.csv' + 'data/dev.csv')
# train_all = False -> fit on 'data/train.csv' only, keeping 'data/dev.csv' for validation
train_all = True
if train_all:
    train_data_path = 'data/all.csv'
else:
    train_data_path = 'data/train.csv'

train_dataset, val_dataset, test_dataset = (
    CustomDataset(csv_path, device, word_dict)
    for csv_path in (train_data_path, 'data/dev.csv', 'data/test.csv')
)

# Only the training batches are shuffled; the validation and test loaders keep
# file order, which saveAnswerToCsv relies on to align predictions with rows.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [21]:
# Define the model
model = NeuralNet(input_size, hidden_size).to(device)

# Loss and optimizer
# BCELoss expects probabilities in [0, 1]; the network's final sigmoid provides them.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

# Train the model
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    # Running accuracy counters, reset at the start of every epoch
    n_correct = 0
    n_samples = 0
    for i, (bows, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(bows)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Calculate accuracy (on the outputs computed before this step's update)
        predicted = outputs.detach().round()
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        if (i+1) % 10 == 0:
            # Accuracy over all batches seen so far in this epoch
            acc = 100.0 * (n_correct / n_samples)
            print (f'Epoch [{epoch + 1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}, Accuracy: {acc:.2f}')

# Save the model
# Create the target directory first so a finished training run is not lost to a
# missing 'models/' folder. The whole module is pickled (not just a state_dict),
# which is the format the loading cell reads back.
os.makedirs('models', exist_ok=True)
torch.save(model, 'models/' + model_name +'.pth')

Epoch [1/5], Step [10/90], Loss: 0.5171, Accuracy: 70.37
Epoch [1/5], Step [20/90], Loss: 0.3809, Accuracy: 78.25
Epoch [1/5], Step [30/90], Loss: 0.3570, Accuracy: 81.61
Epoch [1/5], Step [40/90], Loss: 0.3187, Accuracy: 83.44
Epoch [1/5], Step [50/90], Loss: 0.2950, Accuracy: 84.52
Epoch [1/5], Step [60/90], Loss: 0.2839, Accuracy: 85.28
Epoch [1/5], Step [70/90], Loss: 0.2532, Accuracy: 85.97
Epoch [1/5], Step [80/90], Loss: 0.2162, Accuracy: 86.52
Epoch [1/5], Step [90/90], Loss: 0.2306, Accuracy: 86.95
Epoch [2/5], Step [10/90], Loss: 0.1644, Accuracy: 93.75
Epoch [2/5], Step [20/90], Loss: 0.1446, Accuracy: 93.75
Epoch [2/5], Step [30/90], Loss: 0.1459, Accuracy: 93.77
Epoch [2/5], Step [40/90], Loss: 0.1487, Accuracy: 93.92
Epoch [2/5], Step [50/90], Loss: 0.1553, Accuracy: 93.91
Epoch [2/5], Step [60/90], Loss: 0.1660, Accuracy: 93.94
Epoch [2/5], Step [70/90], Loss: 0.1890, Accuracy: 93.91
Epoch [2/5], Step [80/90], Loss: 0.1524, Accuracy: 93.86
Epoch [2/5], Step [90/90], Loss

In [22]:
# Load the model
# The checkpoint is a fully pickled module, so torch.load unpickles arbitrary
# objects: only load files produced by this notebook.
# map_location puts the weights on the same device as the dataset tensors, so a
# checkpoint saved on a GPU also loads in a CPU-only session.
# NOTE(review): recent PyTorch releases (2.6+) default torch.load to
# weights_only=True, which rejects fully pickled modules; pass
# weights_only=False there — confirm against the installed version.
model = torch.load('models/' + model_name +'.pth', map_location = device)
model.eval()  # inference mode; no effect on this architecture's outputs, but the safe default

# E_in
# Despite the label, validate() returns the accuracy in percent, not an error rate.
print(f'E_in: {validate(train_loader, model):.4f}')

# E_val
# Only meaningful when 'data/dev.csv' was held out of training.
if not train_all:
    print(f'E_val: {validate(val_loader, model):.4f}')

# Save the answer
saveAnswerToCsv(test_loader, model, 'ans/'+model_name+'.csv')

E_in: 98.8822
