In [26]:
import os
import torch
from torch import nn
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset , DataLoader

In [27]:
def load_data(path):
    """Return the object parsed from the UTF-8 encoded JSON file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())

In [28]:
# Build the vocabulary from every token that occurs in the training tweets.
tokenizer = get_tokenizer('spacy')
train_data = load_data("train.json")
vocab_tokens = set()
for record in train_data:
    # In-place update avoids building a brand-new set for every tweet.
    vocab_tokens.update(tokenizer(record['tweet']))
# Index 0 is reserved for padding ("") and index 1 for unknown words ("UNK").
# Remove them from the corpus tokens so neither can appear twice in `words`.
vocab_tokens -= {"", "UNK"}
# Sort so the word -> index assignment is the same on every run; the iteration
# order of a set of strings is not stable across interpreter sessions.
words = ["", "UNK"] + sorted(vocab_tokens)
print(len(words))



26345


In [29]:
# Label names; a label's position in this list is its class index.
tag_list = [
    "ineffective",
    "unnecessary",
    "pharma",
    "rushed",
    "side-effect",
    "mandatory",
    "country",
    "ingredients",
    "political",
    "none",
    "conspiracy",
    "religious",
]
# Reverse lookup: label name -> position in tag_list.
tag_to_idx = {tag: position for position, tag in enumerate(tag_list)}

In [30]:
# Inspect the label annotation attached to the first training record.
first_labels = train_data[0]['labels']
print(type(first_labels))
print(first_labels)
print(list(first_labels))
# Show the class index of every label attached to record 77.
for label_name in train_data[77]['labels']:
    print(tag_to_idx[label_name], label_name)

<class 'dict'>
{'ineffective': [{'index': 0, 'start': 6, 'end': 10, 'terms': 'cant control the Flu'}, {'index': 1, 'start': 37, 'end': 39, 'terms': 'infectious diseases'}]}
['ineffective']
4 side-effect


In [31]:
# Token -> position in `words`.
vocab_size = len(words)
vocab2idx = dict(zip(words, range(vocab_size)))

In [32]:
def load_glove_vectors(glove_file="glove.6B.300d.txt"):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the word,
    the remaining fields are its float components.

    Args:
        glove_file: Path of the GloVe text file to read.

    Returns:
        dict mapping each word to a 1-D numpy array of its components.

    Raises:
        ValueError: If a component cannot be converted to ``float``.
    """
    word_vectors = {}
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) cannot decode the non-ASCII words in the file.
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            s = line.split()
            if not s:
                # Skip blank lines (typically a trailing one) instead of
                # failing with IndexError on s[0].
                continue
            word_vectors[s[0]] = np.array([float(x) for x in s[1:]])
    return word_vectors

In [33]:
def get_emb_matrix(pretained,words,emb_size=300):
    """Build the embedding matrix whose row ``i`` belongs to ``words[i]``.

    Row 0 is left all-zero and row 1 is drawn uniformly from [-0.25, 0.25);
    the caller places the padding and unknown-word entries at those positions.
    Every later word takes its pretrained vector when one exists, trying the
    exact spelling first and then the lowercased spelling (uncased GloVe
    vocabularies such as glove.6B only contain lowercase entries). Words
    without any pretrained vector get a uniform random row.

    Args:
        pretained: Mapping from word to a vector of length ``emb_size``.
        words: Sequence of vocabulary entries; positions 0 and 1 are reserved.
        emb_size: Dimensionality of every embedding row.

    Returns:
        ``float32`` numpy array of shape ``(len(words), emb_size)``.
    """
    W = np.zeros((len(words), emb_size), dtype="float32")
    # Row 0 keeps its zero initialisation; only fill row 1 when it exists.
    if len(words) > 1:
        W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    for i, word in enumerate(words[2:], start=2):
        if word in pretained:
            W[i] = pretained[word]
        elif word.lower() in pretained:
            # Capitalised corpus tokens would otherwise never match an uncased
            # pretrained vocabulary and end up with random vectors.
            W[i] = pretained[word.lower()]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
    return W

In [34]:
# Read the pretrained vectors from disk, then assemble the embedding matrix
# whose rows follow the order of `words`.
word_vecs = load_glove_vectors()
pretrain_weights = get_emb_matrix(pretained=word_vecs, words=words)

In [35]:


class CustomDataset(Dataset):
    """Dataset of labelled tweets encoded as fixed-length index sequences.

    Each item is a tuple ``(token_ids, label_vector, seq_len)``:

    * ``token_ids``: int64 array of length ``max_length``; tokens missing from
      the vocabulary map to the ``"UNK"`` index and unused tail positions are 0.
    * ``label_vector``: int64 multi-hot array with a 1 for every key present
      in the record's ``"labels"`` mapping.
    * ``seq_len``: number of tokens kept after truncation, never less than 1.

    Args:
        data: Sequence of records with ``"tweet"`` and ``"labels"`` entries.
        tokenizer: Callable turning a string into a list of tokens.
        max_length: Length every encoded tweet is padded or truncated to.
        words: Mapping from token to integer index; must contain ``"UNK"``.
        label_to_idx: Optional mapping from label name to column index.
            Defaults to the notebook-level ``tag_to_idx``.
    """

    def __init__(self, data, tokenizer, max_length, words, label_to_idx=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.words = words
        # Only touch the global mapping when the caller did not supply one.
        self.label_to_idx = tag_to_idx if label_to_idx is None else label_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data[index]
        encode_text, seq_len = self._encode_with_length(record["tweet"], self.words)
        encode_label = self.label_encoded(record["labels"])
        return encode_text, encode_label, seq_len

    def _encode_with_length(self, text, vocab2idx):
        """Return ``(padded_ids, length)`` for ``text``."""
        unk_index = vocab2idx["UNK"]
        token_ids = [vocab2idx.get(word, unk_index) for word in self.tokenizer(text)]
        token_ids = token_ids[:self.max_length]
        # Explicit int64: plain `int` is 32-bit on some platforms/numpy versions.
        encoded = np.zeros(self.max_length, dtype=np.int64)
        encoded[:len(token_ids)] = np.asarray(token_ids, dtype=np.int64)
        # The length comes from the token count rather than from counting
        # non-zero ids, and is clamped to 1 because pack_padded_sequence rejects
        # zero-length sequences (a tweet may tokenize to nothing).
        return encoded, max(1, len(token_ids))

    def encode(self, text, vocab2idx):
        """Return the padded/truncated index array for ``text``."""
        return self._encode_with_length(text, vocab2idx)[0]

    def label_encoded(self, labels):
        """Return the multi-hot vector for the keys of ``labels``."""
        encode_label = np.zeros(len(self.label_to_idx), dtype=np.int64)
        for label_name in labels:
            encode_label[self.label_to_idx[label_name]] = 1
        return encode_label

In [36]:
# Encode every tweet as a sequence of 30 token indices.
train_data = CustomDataset(train_data,tokenizer,30,vocab2idx)
# shuffle=True draws the samples in a new random order each epoch; without it
# every epoch presents exactly the same batches in the same file order.
train_loader = DataLoader(train_data,batch_size=32,shuffle=True)

In [37]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Now is using {device} device")

Now is using cuda device


In [38]:
from torch.nn.utils.rnn import pack_padded_sequence

In [39]:
def train(dataloader,model,loss_fn,optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Each batch yields ``(X, y, seq)``. ``X`` and ``y`` are moved to the global
    ``device`` (``y`` as float); ``seq`` is handed to the model unchanged.
    The current loss is printed on every 200th batch.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y, seq) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device, dtype=torch.float)

        loss = loss_fn(model(X, seq), y)

        # Clear stale gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 200 != 0:
            continue
        loss, curr = loss.item(), batch * len(X)
        print(f"loss: {loss:>7f} [{curr:>5d}/{size:>5d}]")

In [40]:
class pretrain_weight_LSTM(nn.Module):
    """LSTM sentence classifier on top of pretrained, fine-tunable embeddings.

    The final hidden state of the LSTM (both directions concatenated when
    ``bidirectional`` is true) goes through dropout, a hidden linear layer
    with ReLU, dropout again, and a final linear layer that emits one logit
    per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Size of the LSTM hidden state and of the hidden linear layer.
        pretrain_weights: Array of shape ``(vocab_size, embedding_dim)`` copied
            into the embedding table.
        bidirectional: Whether the LSTM also reads the sequence right-to-left.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrain_weights, bidirectional, num_classes=12):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Copy under no_grad instead of writing through `.data`.
        with torch.no_grad():
            self.embeddings.weight.copy_(torch.as_tensor(pretrain_weights))
        self.embeddings.weight.requires_grad = True
        # No `dropout` argument: nn.LSTM only applies it between stacked
        # layers, so on this single-layer LSTM it did nothing but emit a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=bidirectional, batch_first=True)
        self.relu = nn.ReLU()
        # A bidirectional LSTM contributes one final state per direction.
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc1 = nn.Linear(in_features=lstm_out_dim, out_features=hidden_dim)
        self.droupt = nn.Dropout(0.3)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, s):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.
            s: Tensor holding the true (unpadded) length of every sequence.
        """
        embeed = self.embeddings(x)
        # pack_padded_sequence wants the lengths on the CPU.
        x_pack = pack_padded_sequence(embeed, s.to('cpu'), batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(x_pack)
        if self.bidirectional:
            # ht[-2] is the forward and ht[-1] the backward final state; using
            # ht[-1] alone would discard the forward direction.
            out = torch.cat((ht[-2], ht[-1]), dim=1)
        else:
            out = ht[-1]
        out = self.droupt(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.droupt(out)
        out = self.linear(out)
        return out

In [41]:
# Unidirectional LSTM classifier: 300-d embeddings, 128 hidden units.
bidirectional = False
model_2 = pretrain_weight_LSTM(
    vocab_size=vocab_size,
    embedding_dim=300,
    hidden_dim=128,
    pretrain_weights=pretrain_weights,
    bidirectional=bidirectional,
)
model_2 = model_2.to(device)



In [42]:
# Loss applied to the raw logits, and an Adam optimizer over all parameters
# of model_2.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model_2.parameters(), lr=0.001)

In [43]:
# Train for a fixed number of passes over the training set.
epochs = 200
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-----------------------")
    train(train_loader, model_2, loss_fn, optimizer)
    # Validation step is currently disabled:
    # test(vaild_loader,model_2,loss_fn)

Epoch 1
-----------------------
loss: 0.692355 [    0/ 6956]


loss: 0.275161 [ 6400/ 6956]
Epoch 2
-----------------------
loss: 0.306999 [    0/ 6956]
loss: 0.247247 [ 6400/ 6956]
Epoch 3
-----------------------
loss: 0.269714 [    0/ 6956]
loss: 0.209759 [ 6400/ 6956]
Epoch 4
-----------------------
loss: 0.239864 [    0/ 6956]
loss: 0.199926 [ 6400/ 6956]
Epoch 5
-----------------------
loss: 0.228741 [    0/ 6956]
loss: 0.182685 [ 6400/ 6956]
Epoch 6
-----------------------
loss: 0.196699 [    0/ 6956]
loss: 0.151587 [ 6400/ 6956]
Epoch 7
-----------------------
loss: 0.171403 [    0/ 6956]
loss: 0.130859 [ 6400/ 6956]
Epoch 8
-----------------------
loss: 0.130768 [    0/ 6956]
loss: 0.112002 [ 6400/ 6956]
Epoch 9
-----------------------
loss: 0.124236 [    0/ 6956]
loss: 0.092499 [ 6400/ 6956]
Epoch 10
-----------------------
loss: 0.113724 [    0/ 6956]
loss: 0.065717 [ 6400/ 6956]
Epoch 11
-----------------------
loss: 0.067655 [    0/ 6956]
loss: 0.047528 [ 6400/ 6956]
Epoch 12
-----------------------
loss: 0.059654 [    0/ 6956]
loss: 0

In [None]:
class testDataset(Dataset):
    """Dataset of unlabelled tweets encoded as fixed-length index sequences.

    Each item is a tuple ``(token_ids, seq_len)``:

    * ``token_ids``: int64 array of length ``max_length``; tokens missing from
      the vocabulary map to the ``"UNK"`` index and unused tail positions are 0.
    * ``seq_len``: number of tokens kept after truncation, never less than 1.

    Args:
        data: Sequence of records with a ``"tweet"`` entry.
        tokenizer: Callable turning a string into a list of tokens.
        max_length: Length every encoded tweet is padded or truncated to.
        words: Mapping from token to integer index; must contain ``"UNK"``.
    """

    def __init__(self, data, tokenizer, max_length, words):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.words = words

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        tweet = self.data[index]["tweet"]
        encode_text, seq_len = self._encode_with_length(tweet, self.words)
        return encode_text, seq_len

    def _encode_with_length(self, text, vocab2idx):
        """Return ``(padded_ids, length)`` for ``text``."""
        unk_index = vocab2idx["UNK"]
        token_ids = [vocab2idx.get(word, unk_index) for word in self.tokenizer(text)]
        token_ids = token_ids[:self.max_length]
        # Explicit int64: plain `int` is 32-bit on some platforms/numpy versions.
        encoded = np.zeros(self.max_length, dtype=np.int64)
        encoded[:len(token_ids)] = np.asarray(token_ids, dtype=np.int64)
        # Clamp to 1: pack_padded_sequence rejects zero-length sequences, and
        # a tweet may tokenize to nothing.
        return encoded, max(1, len(token_ids))

    def encode(self, text, vocab2idx):
        """Return the padded/truncated index array for ``text``."""
        return self._encode_with_length(text, vocab2idx)[0]

In [None]:
test_data = load_data("test.json")
# Encode every tweet as a sequence of 30 token indices.
test_data = testDataset(test_data,tokenizer,30,vocab2idx)
# Batch the inference instead of running one tweet per forward pass.
# shuffle=False keeps the predictions in the same order as the records.
test_loader = DataLoader(test_data,batch_size=32,shuffle=False)

In [None]:
# Inference: evaluation mode, gradient tracking switched off.
model_2.eval()

y_pred = []
with torch.no_grad():
    for x, seq_len in test_loader:
        logits = model_2(x.to(device), seq_len)
        # Turn the logits into per-label probabilities and collect them as
        # plain Python lists.
        probabilities = torch.sigmoid(logits)
        y_pred.extend(probabilities.cpu().tolist())

In [None]:
# A label counts as predicted when its probability is strictly above 0.5.
y_preds = np.greater(np.array(y_pred), 0.5).astype(int)
y_preds

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:

# One column per label; reset_index adds the row number as an "index" column.
df = pd.DataFrame(data=y_preds, columns=tag_list)
df_reset = df.reset_index(drop=False)
print(df_reset.head(n=5))

   index  ineffective  unnecessary  pharma  rushed  side-effect  mandatory  \
0      0            0            0       0       0            0          0   
1      1            0            0       0       0            1          0   
2      2            0            0       1       1            0          0   
3      3            0            0       0       0            1          0   
4      4            0            0       0       0            0          0   

   country  ingredients  political  none  conspiracy  religious  
0        0            0          1     0           0          0  
1        0            1          0     0           0          0  
2        0            0          0     0           0          0  
3        0            0          0     0           0          0  
4        1            0          0     0           0          0  


In [None]:
# One dict per prediction row, keyed by column name.
data_rows = df_reset.to_dict("records")

In [None]:
import csv

# Derive the header from tag_list rather than repeating the label names, so
# the CSV columns cannot drift from the DataFrame columns.
fieldnames = ["index"] + list(tag_list)
# newline='' lets the csv module control line endings; the encoding is set
# explicitly so the file does not depend on the platform default.
with open("submission.csv", 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data_rows)