# CNN model
### CNN model cited from 585 tutorial

In [5]:
data_path = "../data/Vaccine pages.csv"

In [56]:
import pandas as pd
import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
import spacy
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
import csv 
from torchtext.data import Iterator, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

### Data Preprocessing

In [14]:
df = pd.read_csv(data_path)
df = df[df['Tags confirmed']=='checked'] # only read the ones are checkd
train_df, dev_df = train_test_split(df, test_size=0.2, random_state=11)

In [33]:
train_df.head()

Unnamed: 0,Unique ID,Date,Vaccines page pairs EN/FR,Comment,Tags,Refining details,Tags confirmed,Status,Issue
2266,6089e6208ae21611fc74db07,2021年4月28日,Authorized vaccines EN/FR,Where do I get the right vaccine for my age gr...,Getting vaccinated - When / Where / What,Where to get vaccinated near me,checked,New,
4943,6094192e82338d0ecce56cb9,2021年5月6日,Authorized vaccines EN/FR,I asked for FDA approved vaccines but the FDA ...,Vaccine strategy: Authorization / Eligibility ...,Vaccine agreements / authorizations / other va...,checked,New,Approval of vaccines
4672,6091458c82338d0ecce564a0,2021年5月4日,"Vaccine safety, concerns and possible side eff...",i wanted to find out about SERIOUS side effect...,Vaccine safety (health issues / ingredients / ...,Side effects: possible,checked,New,
5268,6094718a82338d0ecce56e1a,2021年5月6日,AstraZeneca: What you should know EN/FR,On me dit que les compagnies de vaccins ne son...,Vaccines - Other,Other - Vaccines,checked,New,
2366,609135d882338d0ecce56476,2021年5月4日,How to get vaccinated EN/FR,Where can I get vaccinted,Getting vaccinated - When / Where / What,Where to get vaccinated near me,checked,New,


In [40]:
def write_csv(df, path):
    """
    write csv file for each split given path
        df: dataframe
        path: path to the dataframe
    """
    tag_id = 0
    rows = []
    with open(path, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['comment','tag'])
        for idex, row in df.iterrows():
            rows.append([row["Comment"], row["Tags"]])
            tag_id += 1
        csvwriter.writerows(rows)

write_csv(train_df, "../data/vaccine_train.csv")
write_csv(dev_df, "../data/vaccine_dev.csv")

In [41]:
def whitespace_tokenizer(text):
    """
    Split the text by white space
    text: str
    return: list fo tokens
    """
    return text.split()

In [42]:
TEXT = Field(sequential=True, tokenize=whitespace_tokenizer, lower=True)
LABEL = Field(sequential=False)

In [43]:
# get train and val
train, val = TabularDataset.splits(
               path="../data/", # the root directory where the data lies
               train="vaccine_train.csv", validation="vaccine_dev.csv", # file names
               format='csv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=[('comment', TEXT), ('tag', LABEL)])

In [45]:
# take a look at one instance
print(train[0].__dict__.keys())
print(train[0].comment)
print(train[0].tag)

dict_keys(['comment', 'tag'])
['where', 'do', 'i', 'get', 'the', 'right', 'vaccine', 'for', 'my', 'age', 'group?']
Getting vaccinated - When / Where / What


In [46]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [78]:
output_size = len(LABEL.vocab.stoi)

In [49]:
token_1 = TEXT.preprocess('where do i get the right vaccine for my age group?')
print("token 1:", token_1)
token_2 = TEXT.preprocess('where do i get vaccine?')
print("token 2:", token_2)
# convert tokens to tensor
tensor = TEXT.process([token_1,token_2])
print(tensor)
print(tensor.shape)

token 1: ['where', 'do', 'i', 'get', 'the', 'right', 'vaccine', 'for', 'my', 'age', 'group?']
token 2: ['where', 'do', 'i', 'get', 'vaccine?']
tensor([[  28,   28],
        [  23,   23],
        [   2,    2],
        [  11,   11],
        [   3,   76],
        [ 382,    1],
        [   5,    1],
        [   6,    1],
        [  13,    1],
        [  97,    1],
        [1664,    1]])
torch.Size([11, 2])


In [51]:
train_iter, val_iter = BucketIterator.splits(
 (train, val), 
 batch_sizes=(64,64), # batch size for train and val
 sort_key=lambda x: len(x.comment), 
 sort=True,
 sort_within_batch=False
)



In [54]:

for batch in train_iter: # seq len, batch size 
    comments = batch.comment
    tags = batch.tag
    print(comments.shape)
    print(tags.shape)
    break

torch.Size([2, 64])
torch.Size([64])


In [57]:
class CNN_Text(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim, output_size, kernel_num, region_sizes, dropout):
        '''
        vocabulary_size: vocabulary size
        embedding_dim: word embedding size
        output_size: number of classes in prediction
        kernel_num: number of kernels (number of output channels of convolutional layers)
        region_sizes: height of kernels of convolutional layers
        dropout: dropout rate
        '''
        super(CNN_Text, self).__init__()
        # the size of input channel is 1.
        Ci = 1
        
        # word embedding layer
        self.embeddings = nn.Embedding(num_embeddings = vocabulary_size, embedding_dim = embedding_dim )
        
        # convolution with kernels
        self.convolution_layers = nn.ModuleList([nn.Conv2d(in_channels = Ci, out_channels = kernel_num, kernel_size = (K, embedding_dim)) for K in region_sizes])
        
        # a dropout layer
        self.dropout = nn.Dropout(dropout) 
        
        # fully connected layer
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, output_size)

    def forward(self, x):
        # input x  [sequence length, batch size]
        
        input_embeddings = self.embeddings(x)  
        # (batch size, word_sequence, embedding_dim) word embedding

        input_embeddings = input_embeddings.permute(1,0,2)
        input_embeddings = input_embeddings.unsqueeze(1)
        #  [batch size, number of channel is one, sequence length, embeeding size]

        # convolutional layers
        convolute_outputs = [F.relu(conv(input_embeddings)).squeeze(3) for conv in self.convolution_layers]  
        
        # to get the maximum value of filtered tensor
        max_pooling_outputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in convolute_outputs] 
        
        concat_list = torch.cat(max_pooling_outputs, 1) # concatenate representations
        
        drop_output = self.dropout(concat_list)  # add drop layer
        
        fc1_output = self.fc(drop_output)  # get the fc1 using a fully connected layer
        
        final_output = F.softmax(fc1_output,dim=1)
        
        return final_output

In [79]:
# Hyper Parameters

# the vocabulary size
vocabulary_size = len(TEXT.vocab.stoi) 

# Dimension of word embedding is 300. Namely, each word is expressed by a vector that has 300 dimensions.
embedding_dim = 300 

# region size as 2, 3, and 4
kernel_sizes = [1,2]#[2,3,4] 

# the number of kernel in each region size
kernels_num = 32  

# The dropout rate is set to be 0.5.
dropout = 0.5

# The output size of labels.
output_size = output_size

# learning rate is set to be 0.01.
lr = 0.01        

# The number of iteration is set to be 5.
num_epoch = 5  

# employ class CNN_Text and assign to cnn
model = CNN_Text(vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout)
#.to(device)

In [60]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.1)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

CNN_Text(
  (embeddings): Embedding(8063, 300)
  (convolution_layers): ModuleList(
    (0): Conv2d(1, 32, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 32, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 32, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=96, out_features=2, bias=True)
)

In [61]:
# Loss and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)   # define a optimizer for backpropagation
loss_func = nn.CrossEntropyLoss()   # define loss funtion

In [83]:
def train(model, iterator, optimizer, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        batch_input, labels = batch.comment, batch.tag
        batch_input = batch_input#.to(device)
        labels = labels#.to(device)

        optimizer.zero_grad()
        
        outputs = model(batch_input)
        #print(outputs.shape)
        #print(outputs)
        loss = criterion(outputs, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.cpu().item()

    return epoch_loss / len(iterator)

In [86]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            batch_input, labels = batch.comment, batch.tag
            batch_input = batch_input#.to(device)
            labels = labels#.to(device)

            optimizer.zero_grad()

            outputs = model(batch_input)

            loss = criterion(outputs, labels)

            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(labels.cpu())
    
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return epoch_loss / len(iterator), accuracy, f1score

In [89]:
MAX_EPOCH = 15
total_step = len(train_iter)
loss_list = []
acc_list = []

for epoch in trange(MAX_EPOCH, desc="Epoch"):
    train_loss = train(model, train_iter, optimizer, loss_func)  
    val_loss, val_acc, val_f1 = evaluate(model, val_iter, loss_func)

    # Create checkpoint at end of each epoch
    state_dict_model = model.state_dict() 
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }

    torch.save(state, "../data/ckpt_cnn/CNN_TEXT_"+str(epoch+1)+".pt")

    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, MAX_EPOCH, train_loss, val_loss, val_acc, val_f1))


Epoch:   7%|▋         | 1/15 [00:01<00:21,  1.50s/it]


 Epoch [1/15], Train Loss: 2.1714, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  13%|█▎        | 2/15 [00:02<00:18,  1.45s/it]


 Epoch [2/15], Train Loss: 2.1722, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  20%|██        | 3/15 [00:04<00:17,  1.42s/it]


 Epoch [3/15], Train Loss: 2.1735, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  27%|██▋       | 4/15 [00:05<00:15,  1.39s/it]


 Epoch [4/15], Train Loss: 2.1709, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  33%|███▎      | 5/15 [00:06<00:13,  1.37s/it]


 Epoch [5/15], Train Loss: 2.1731, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  40%|████      | 6/15 [00:08<00:12,  1.42s/it]


 Epoch [6/15], Train Loss: 2.1725, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  47%|████▋     | 7/15 [00:09<00:11,  1.40s/it]


 Epoch [7/15], Train Loss: 2.1736, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  53%|█████▎    | 8/15 [00:11<00:10,  1.44s/it]


 Epoch [8/15], Train Loss: 2.1708, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  60%|██████    | 9/15 [00:12<00:09,  1.52s/it]


 Epoch [9/15], Train Loss: 2.1739, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  67%|██████▋   | 10/15 [00:14<00:07,  1.49s/it]


 Epoch [10/15], Train Loss: 2.1729, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  73%|███████▎  | 11/15 [00:15<00:05,  1.49s/it]


 Epoch [11/15], Train Loss: 2.1718, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  80%|████████  | 12/15 [00:17<00:04,  1.51s/it]


 Epoch [12/15], Train Loss: 2.1727, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  87%|████████▋ | 13/15 [00:18<00:02,  1.49s/it]


 Epoch [13/15], Train Loss: 2.1722, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch:  93%|█████████▎| 14/15 [00:20<00:01,  1.44s/it]


 Epoch [14/15], Train Loss: 2.1723, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749


Epoch: 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


 Epoch [15/15], Train Loss: 2.1727, Validation Loss: 2.1681, Validation Accuracy: 0.1976, Validation F1: 0.0749





In [1]:
# test push