In [2]:
import torch
import pandas as pd
#import numpy as np
# nlp library of Pytorch
from torchtext import data
#from torchtext.legacy import data
#from torchtext.legacy import data
#from torchtext.legacy import datasets

import warnings as wrn

from PIL import Image
from torchvision.transforms import ToTensor, ToPILImage
from torchvision import transforms, utils
import numpy as np
import random
import torchvision

import tarfile
import io
import os

from torch.utils.data import Dataset

wrn.filterwarnings('ignore')
SEED = 2021

torch.manual_seed(SEED)
torch.backends.cuda.deterministic = True

In [2]:
data_ = pd.read_csv('./csv/train.csv')
data_.head()
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23982 entries, 0 to 23981
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   clean_title  23982 non-null  object
 1   id           23982 non-null  object
 2   label        23982 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 562.2+ KB


In [3]:
# Field is a normal column 
# LabelField is the label column.

import spacy
nlp = spacy.load("en_core_web_lg")
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in nlp.tokenizer(text)]

TEXT = data.Field(tokenize=tokenizer,batch_first=True,include_lengths=True)
ID = data.Field(dtype = torch.float,batch_first=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [4]:
fields = [('clean_title',TEXT), ('id', ID), ('label',LABEL)]

In [5]:
training_data = data.TabularDataset(path="./csv/train.csv",
                                    format="csv",
                                    fields=fields,
                                    skip_header=True
                                   )

test_data = data.TabularDataset(path="./csv/validate.csv",
                                    format="csv",
                                    fields=fields,
                                    skip_header=True
                                   )


In [6]:
import random
# train and validation splitting
train_data,valid_data = training_data.split(split_ratio=0.5,
                                            random_state=random.seed(SEED))

In [7]:
TEXT.build_vocab(training_data,test_data)
ID.build_vocab(training_data,test_data)
LABEL.build_vocab(training_data,test_data)


In [8]:
print("Size of text vocab:",len(TEXT.vocab))
print("Size of label vocab:",len(LABEL.vocab))
TEXT.vocab.freqs.most_common(10)

Size of text vocab: 26847
Size of label vocab: 6


[('the', 9337),
 ('a', 6954),
 ('of', 4911),
 ('in', 4622),
 ('to', 4528),
 ('this', 4335),
 ('my', 2527),
 ('i', 2514),
 ('on', 2422),
 ('and', 2291)]

In [9]:
device = torch.device("cuda")

BATCH_SIZE = 1

train_iterator,validation_iterator = data.BucketIterator.splits(
    (train_data,valid_data),
    batch_size = BATCH_SIZE,
    # Sort key is how to sort the samples
    sort_key = lambda x:len(x.clean_title),
    sort_within_batch = True,
    device = device
)

test_iterator = data.BucketIterator(
    dataset = test_data,
    batch_size = BATCH_SIZE,
    train = False,
    sort_key = False,
    sort = False,
    sort_within_batch = False,
    device = device
)

In [10]:
print(vars(test_iterator.data()[1]))

{'clean_title': ['new', 'image', 'from', 'the', 'mandalorian'], 'id': ['d0bzlq'], 'label': '0'}


In [11]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self,vocab_size):
        super(Net,self).__init__()
        #lstm part
        self.vocab_size = vocab_size
        self.word_embed = nn.Embedding(vocab_size, 100)
        self.layer = nn.LSTM(input_size=100, hidden_size=64, num_layers = 2, batch_first=True, bidirectional=False)
        
        #combine part
        self.drop = nn.Dropout(p=0.3)
        self.label1 = nn.Linear(64,64)
        self.acti1 = nn.ReLU()
        self.label2 = nn.Linear(64,64)
        self.acti2 = nn.ReLU()
        self.label3 = nn.Linear(64,6)
        self.acti3 = nn.Sigmoid()
        #self.label6 = nn.Linear(6,6)
        
    def forward(self, text, text_length):
        #lstm part
        embed = self.word_embed(text)
        #print(embed.shape)
        out, _ = self.layer(embed)
        outs = out[range(len(out)), text_length - 1, :64]
        #print(outs.shape)
        
        #cnn part
        #x = self.layer1(image)
        #x = self.acti1(x)
        #x = self.pool1(x)
        #x = self.layer2(x)
        #x = self.acti1(x)
        #x = self.flat(x)
        #print(x.size())
        #x = self.layer3(x)
        #output_cnn = self.acti3(x)
        
        #combined = torch.cat((outs,output_cnn), 1)
        
        #print(outs.shape)
        tag = self.label1(outs)
        tag_score = self.acti1(tag)
        #tag_score = self.drop(tag_score)
        #tag_score = self.label2(tag_score)
        #tag_score = self.acti2(tag_score)
        #tag_score = self.drop(tag_score)
        tag_score = self.label3(tag_score)
        tag_score = self.acti3(tag_score)
        #print(tag_score.shape)
        #result = torch.argmax(tag_score,dim=1,keepdim=True)
        #result = self.label6(tag_score)
        return tag_score
    
    

In [12]:
model = Net(len(TEXT.vocab))

In [13]:
import torch.optim as optim
model = model.to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-3)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [14]:
def active_label(tensor):
    vec = []
    for i in range(1):
        if tensor[i] == 0:
            vec = [1,0,0,0,0,0]
        elif tensor[i] == 1:
            vec = [0,1,0,0,0,0]
        elif tensor[i] == 2:
            vec = [0,0,1,0,0,0]
        elif tensor[i] == 3:
            vec = [0,0,0,1,0,0]
        elif tensor[i] == 4:
            vec = [0,0,0,0,1,0]
        elif tensor[i] == 5:
            vec = [0,0,0,0,0,1]
    return  torch.tensor(vec, dtype=torch.float, device=device)

In [15]:
def accuracy(predict, target):
    correct = 0
    #print(predict)
    actual = torch.argmax(predict,dim=0,keepdim=True).squeeze()
    #print(actual.item())
    for i in range(1):
        if actual.item() == target.item():
            correct += 1
    
    return correct

In [16]:
from torch.autograd import Variable
import torchtext 
from PIL import Image
import torchvision.transforms.functional as TF

def train(model,iterator,optimizer,criterion):
    directory = "./data/" 
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    idlist = ID.vocab.itos
    model.train()
    i = 1
    #print("gets there")
    for batch in iterator:
        #print("gets here")
        # cleaning the cache of optimizer
        optimizer.zero_grad()
        
        #keep track of process
        if i % 5000 == 0:
            print(i)
        i += 1
        
        text,text_lengths = batch.clean_title
        
        #modified

        if(text.shape[0] == 1):
            # forward propagation and squeezing
            predictions = model(text,text_lengths).squeeze()
            #print(predictions)
            #print(batch.country)
            target = active_label(batch.label)
            #print(target)
            #print(torch.unsqueeze(batch.country,1).type())
            # computing loss / backward propagation
            loss = criterion(predictions,target)
            #loss = Variable(loss, requires_grad = True)
            loss.backward()

            # accuracy
            acc = accuracy(predictions,batch.label)
            # updating params
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc
    # It'll return the means of loss and accuracy
    return epoch_loss / (len(iterator)), epoch_acc / (len(iterator))

In [17]:
def evaluate(model,iterator,criterion):
    directory = "./data/" 
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    idlist = ID.vocab.itos
    model.eval()
    
    pred = []
    
    with torch.no_grad():
        i = 1
        #print("gets there")
        for batch in iterator:
            #keep track of process
            if i % 5000 == 0:
                print(i)
            i += 1

            text,text_lengths = batch.clean_title
           
            #print(x.shape)
            #print(ids.shape)
            #modified

            if(text.shape[0] == 1):
                # forward propagation and squeezing
                predictions = model(text,text_lengths).squeeze()
                #print(predictions)
                #print(batch.country)
                target = active_label(batch.label)
                #print(target)
                #print(torch.unsqueeze(batch.country,1).type())
                # computing loss / backward propagation
                loss = criterion(predictions,target)
                # accuracy
                acc = accuracy(predictions,batch.label)
                
                pred += [torch.argmax(predictions,dim=0,keepdim=True).squeeze()]
                
                epoch_loss += loss.item()
                epoch_acc += acc
    # It'll return the means of loss and accuracy
    return epoch_loss / (len(iterator)), epoch_acc / (len(iterator)), pred

In [18]:
EPOCH_NUMBER = 3
for epoch in range(1,EPOCH_NUMBER+1):
    print(epoch)
    
    train_loss,train_acc = train(model,train_iterator,optimizer,criterion)
    
    valid_loss,valid_acc,_ = evaluate(model,validation_iterator,criterion)
    
    # Showing statistics
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print()

1
5000
10000
5000
10000
	Train Loss: 1.448 | Train Acc: 56.35%
	 Val. Loss: 1.446 |  Val. Acc: 56.00%

2
5000
10000
5000
10000
	Train Loss: 1.406 | Train Acc: 59.06%
	 Val. Loss: 1.410 |  Val. Acc: 58.96%

3
5000
10000
5000
10000
	Train Loss: 1.372 | Train Acc: 62.20%
	 Val. Loss: 1.414 |  Val. Acc: 58.68%



In [19]:
test_loss,test_acc,pred = evaluate(model,test_iterator,criterion)
    
# Showing statistics
print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')
print()

5000
	 Val. Loss: 1.414 |  Val. Acc: 59.11%



In [20]:
len(pred)

7995

In [21]:
predics = []
for i in range(len(pred)):
    predics += [pred[i].item()]

In [22]:
output = pd.read_csv('./csv/test_clean.csv')

In [23]:
output["Text_Basline"] = predics

In [24]:
output.to_csv('./csv/test_clean_n.csv')