# Model Training
這個程式會利用之前建立的dataset來訓練RNN(GRU)模型，目前Kaggle準確率40%

#### package version
torchtext 0.11.2
torch 1.10.2

In [1]:
from torchtext.legacy import data
from torchtext import datasets
import torch

device = 'cuda'
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

fields = [('u', TEXT), ('e',LABEL)]

#load training data
train_data, valid_data = data.TabularDataset.splits(
    path = '',
    train = 'top2000train.csv',
    validation = 'top2000valid.csv',
    format = 'csv',
    fields = fields,
    skip_header = True
)

#build iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = 16,
    device = device,
    sort_key = lambda x: len(x.u),
    sort_within_batch = True
)

print(vars(train_data.examples[0]))

  from .autonotebook import tqdm as notebook_tqdm


{'u': ['going', 'people', 'biden', 'health', 'would', 'president', 'new', 'one', 'said', 'want', 'get', 'like', 'state', 'think', 'make', 'federal', 'know', 'trump', 'states', 'first', 'great', 'china', 'also', 'public', 'care', 'got', 'even', 'act', 'us', 'year', 'see', 'go', 'climate', 'united', 'years', 'many', 'two', 'american', 'could', 'last', 'take', 'number', 'general', 'never', 'lot', 'country', 'million', 'say', 'much', 'national', 'every', 'need', 'joe', 'made', 'come', 'crime', 'right', 'back', 'look', 'use', 'data', 'attorney', 'ever', 'obama', 'time', 'coronavirus', 'law', 'local', 'since', 'may', 'energy', 'ban', 'march', 'done', 'effective', 'vice', 'give', 'imposed', 'good', 'income', 'provide', 'plan', 'criminal', 'including', 'way', 'put', 'drug', 'three', 'increase', 'rate', 'cases', 'tax', 'entry', 'help', 'making', 'money', 'information', 'amends', 'came', 'tell', 'countries', 'court', 'order', 'change', 'went', 'next', 'world', 'work', 'really', 'north', 'thank',

In [2]:
#word embedding with GloVe
from torchtext.vocab import GloVe
TEXT.build_vocab(train_data, max_size = 25000,vectors = GloVe(name='6B', dim=300))
LABEL.build_vocab(train_data)

In [3]:
#GRU model, can use attention

import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.attention = Attention(embedding_dim, embedding_dim)#change next to attention_dim
        self.rnn = nn.GRU(embedding_dim, hidden_dim,num_layers = 2,bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)
        
    def forward(self, x):  
        x2 = self.embedding(x)
        #x3 = self.attention(x2)
        output, hidden = self.rnn(x2)
        return self.fc(output[-1,:,:])
    
    
class Attention(nn.Module):
    #https://youtu.be/lqCAfu6GI2c
    def __init__(self,embedding_dim,attention_dim):
        super(Attention,self).__init__()
        self.embedding_dim = embedding_dim
        self.attention_dim = attention_dim
        self.query = nn.Linear(embedding_dim,attention_dim,bias = False)
        self.key = nn.Linear(embedding_dim,attention_dim,bias = False)
        self.value = nn.Linear(embedding_dim,attention_dim,bias = False)
    def forward(self, inputs):
        q = self.query(inputs)
        k = self.query(inputs)
        v = self.value(inputs)
        
        attn_score = torch.matmul(q,k.transpose(-1,-2))
        softmax_score = F.softmax(attn_score,dim=-1)
        output = torch.matmul(softmax_score,v)

        return output
    



In [4]:
#training
from tqdm import trange
import torch.optim as optim
import time

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 200
OUTPUT_DIM = 3
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
print(model.embedding.weight.requires_grad)

model = model.to(device)

model.embedding.weight.data = TEXT.vocab.vectors.cuda()

optimizer = optim.SGD(model.parameters(), lr=1e-1)
criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

epoch = 20

maxacc = 0

for i in range (epoch):
    print('epoch '+str(i+1))
    ta = time.time()
    epoch_loss = 0
    train_accuracy=0.0
    valid_accuracy=0.0
    model.train()
    for batch in train_iterator:
        optimizer.zero_grad()
        predictions = model(batch.u)#.squeeze(1)
        loss = criterion(predictions, batch.e.long())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        _,prediction=torch.max(predictions.data,1)
        train_accuracy+=int(torch.sum(prediction==batch.e.data))
    print('loss:')
    print(epoch_loss / len(train_data))
    print('acc:')
    print(train_accuracy/len(train_data))
    model.eval()
    with torch.no_grad():
        for batch in valid_iterator:
            predictions = model(batch.u)#.squeeze(1)
            _,prediction=torch.max(predictions.data,1)
            valid_accuracy+=int(torch.sum(prediction==batch.e.data))
    print('val acc:')
    print(valid_accuracy/len(valid_data))
    print(time.time()-ta)
    if valid_accuracy>maxacc:
        maxacc = valid_accuracy
        torch.save(model,'nobid.pt')
    torch.save(model,'final.pt')
    print()

True
epoch 1
loss:
0.06320909263538241
acc:
0.47652873971467474
val acc:
0.519711742263671
87.64653587341309

epoch 2
loss:
0.061552377560913106
acc:
0.5225833185343042
val acc:
0.5239508266214498
99.76498222351074

epoch 3
loss:
0.06106876713735754
acc:
0.5275557923400225
val acc:
0.532852903772785
162.7108678817749

epoch 4
loss:
0.060528519479192355
acc:
0.5351920913988042
val acc:
0.5353963543874523
203.00691080093384

epoch 5
loss:
0.060263926243475784
acc:
0.5419996448232995
val acc:
0.5442984315387877
208.07456493377686

epoch 6
loss:
0.059255483130299935
acc:
0.5526549458355532
val acc:
0.5485375158965663
211.31003046035767

epoch 7
loss:
0.05889199681980095
acc:
0.5588705380927012
val acc:
0.5591352267910131
216.82288265228271

epoch 8
loss:
0.05790541410615528
acc:
0.5706505653229148
val acc:
0.5506570580754557
215.98697710037231

epoch 9
loss:
0.05727112627732269
acc:
0.5768661575800628
val acc:
0.5612547689699025
216.3584372997284

epoch 10
loss:
0.056636040939542644
acc:
0

In [6]:
#predict test set
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_class(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()




In [5]:
#label sometimes will be incorrect
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1, '2': 2})


In [None]:
#generate test csv
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

model = torch.load('final.pt').to('cuda:0')
predict = []

data = "test.csv"
df = pd.read_csv(data,sep=',')
idd = []
for index,row in tqdm(df.iterrows()):
    sentence = row['sent']
    predict.append(predict_class(model,str(sentence)))
    idd.append(row['id'])
    
print(predict)

predict2 = []

for i in predict:
    if i == 1:
        predict2.append(0)
    elif i == 0:
        predict2.append(1)
    elif i == 2:
        predict2.append(2)


import csv

with open('please2.csv','a',newline = '') as fd:
    writer = csv.writer(fd)
    writer.writerow(['id','rating'])
    


for index,pred in enumerate(predict2):
    with open('please2.csv','a',newline = '') as fd:
        writer = csv.writer(fd)
        writer.writerow([idd[index],pred])