# CNN Pipeline Testing
Code to put embedded data through pytorch

In [13]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
def zero_padding(list_to_pad, max_length, pad_dimension):
    """
    This function takes a list and add list of zeros until max_length is reached.
    The number of zeroes in added list is determined by pad_dimension, which is the 
    same as the dimension of the word2vec model.
    This function is intended to handle one list only so it can be passed 
    into a dataframe as a lambda function.
    """
    # find number of padding vector needed
    num_pad = max_length - len(list_to_pad)

    # vector_pad = np.zeros(pad_dimension)
    vector_pad = np.asarray([0] * pad_dimension, dtype=np.float32)
    vector_pad = [vector_pad]    # convert to list of np.ndarray so we can append together 

    iteration = 0
    while iteration < num_pad:
        list_to_pad = np.append(list_to_pad, vector_pad, axis=0)
        iteration += 1
    
    return list_to_pad

In [3]:
%%time
## 1. load dataset
df = pd.read_csv('../data/ag_news/train.csv')

## 2. apply tokenization and embedding
df['text_token'] = df['Description'].apply(lambda x: word_tokenize(x))

w2v = Word2Vec.load('../model/w2v/ag_news.model')
df['embedding'] = df['text_token'].apply(lambda x: w2v[x])
temp = df['embedding'][0]
print(f'After embedding type: {type(temp)}')

## 3. zero pad to max length
df['text_length'] = df['text_token'].apply(lambda x: len(x))
max_length = max(df['text_length'])

print(f'max length: {max_length}')

emb_dim = 50
df['embedding'] = df['embedding'].apply(lambda x: zero_padding(x, max_length, emb_dim))

  


After embedding type: <class 'numpy.ndarray'>
max length: 245


In [20]:
df['embedding'] = df['embedding'].apply(lambda x: zero_padding(x, max_length, emb_dim))

test = df['embedding']

print(type(test))
print(test[:2])
print(len(test[0]))
print(torch.tensor(test[0]))

<class 'pandas.core.series.Series'>
0    [[1.7402203, -0.44056985, 4.514008, 13.054911,...
1    [[1.7402203, -0.44056985, 4.514008, 13.054911,...
Name: embedding, dtype: object
245
tensor([[ 1.7402, -0.4406,  4.5140,  ..., -6.5562,  1.4705,  0.0400],
        [ 1.4584,  1.9664,  0.2544,  ..., -6.6432, -1.0077, -3.2450],
        [-0.0336,  0.1047, -0.0686,  ...,  0.0194, -0.0861,  0.0348],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [32]:
list_to_append = []

for array in df['embedding']:
    list_to_append.append(torch.tensor(array))

In [33]:
for tensor in list_to_append[:5]:
    print(tensor)

tensor([[ 1.7402, -0.4406,  4.5140,  ..., -6.5562,  1.4705,  0.0400],
        [ 1.4584,  1.9664,  0.2544,  ..., -6.6432, -1.0077, -3.2450],
        [-0.0336,  0.1047, -0.0686,  ...,  0.0194, -0.0861,  0.0348],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
tensor([[ 1.7402, -0.4406,  4.5140,  ..., -6.5562,  1.4705,  0.0400],
        [ 1.4584,  1.9664,  0.2544,  ..., -6.6432, -1.0077, -3.2450],
        [-0.2932,  0.5858,  0.4607,  ..., -0.2427, -0.5295,  0.5203],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
tensor([[ 1.7402, -0.4406,  4.5140,  ..., -6.5562,  1.4705,  0.0400],
        [ 1.4584,  1.9664,  0.2544,  ..., -6.6432, -1.0077, -3

## Convert to Dataloader
Need to convert list of tensor to dataloader to feed through pytorch NN  
Reference: https://stackoverflow.com/questions/44429199/how-to-load-a-list-of-numpy-arrays-to-pytorch-dataset-loader

In [34]:
df['Class Index'].value_counts()

4    30000
3    30000
2    30000
1    30000
Name: Class Index, dtype: int64

In [67]:
train_x = df['embedding'].tolist()[:1000]
tensor_x = torch.tensor(train_x)

In [68]:
train_y = df['Class Index'].tolist()[:1000]
tensor_y = torch.tensor(train_y, dtype=torch.long)
set(train_y)

{1, 2, 3, 4}

In [69]:
my_dataset = TensorDataset(tensor_x, tensor_y) # create your datset
my_dataloader = DataLoader(my_dataset, batch_size=32) # create your dataloader

## Run Through NN

In [70]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(245 * 50 * 1, 120) # 120 chosen randomly (< 245*50*1)
        self.fc2 = nn.Linear(120, 50)           # 50 chosen randomly (< 50)
        self.fc3 = nn.Linear(50, 4)             # 4 = number of classes
    
    def forward(self, x):
        x = x.view(-1, 245 * 50 * 1)   # token length, w2v embedding dimension, channel
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
    
# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# train on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
print(f'\ndevice: {device}')

net.to(device)


device: cuda:0


Net(
  (fc1): Linear(in_features=12250, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=4, bias=True)
)

In [74]:
for epoch in range(5):
    running_loss = 0.0
    print(f'\nepoch {epoch + 1}')
    for i, data in enumerate(my_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        print(f'\tbatch {i+1} loss:    {loss.item()}')
        running_loss = 0.0


epoch 1
	batch 1 loss:    0.06903758645057678
	batch 2 loss:    0.12783314287662506
	batch 3 loss:    0.06776856631040573
	batch 4 loss:    0.017403386533260345
	batch 5 loss:    -0.036544062197208405
	batch 6 loss:    -0.014093950390815735
	batch 7 loss:    0.03697685897350311
	batch 8 loss:    0.01483616977930069
	batch 9 loss:    0.017612140625715256
	batch 10 loss:    0.029870377853512764
	batch 11 loss:    0.005718499422073364
	batch 12 loss:    0.02241034060716629
	batch 13 loss:    -0.005740150809288025
	batch 14 loss:    -0.03078792244195938
	batch 15 loss:    0.03170553594827652
	batch 16 loss:    0.07191524654626846
	batch 17 loss:    0.03496114909648895
	batch 18 loss:    0.06774330139160156
	batch 19 loss:    0.0821763277053833
	batch 20 loss:    0.1176464706659317
	batch 21 loss:    0.06503792852163315
	batch 22 loss:    0.13735079765319824
	batch 23 loss:    0.13366611301898956
	batch 24 loss:    0.12285207957029343
	batch 25 loss:    0.4025611877441406
	batch 26 loss:  