In [1]:
import os
import torch
from torch import nn
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset , DataLoader
from torch.nn.utils.rnn import pack_padded_sequence ,pad_packed_sequence
import nltk 
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.metrics import f1_score
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wujh1123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/wujh1123/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"(?:\@)\S+", "", text)
    text = re.sub(r"(?:\#)\S+", "", text)
    pun = re.compile("[" u"\u2014" u"\u2019" u"\u00a3" u"\u20ac" "\u00b4" "]+",re.UNICODE)
    text = re.sub(pun, ' ', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [3]:
def remove_emoji(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats

        u"\u3030"
        u"\u201c"
        u"\u201d"
        u"\u2049"
        u"\ufe0f"
        u"\u2026" 
        u"\u203c"
        u"\u2018"
        u"\u2066"
        u"\u201e"
        u"\u200b"
        u"\u00d7"
        u"\u2022"
        u"\u00b7"
        u"\u00b5"

                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)



In [4]:
stop_words = stopwords.words('english')
def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

In [5]:
lemmatizer = WordNetLemmatizer()
def lem_text(text):
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split(' '))
    return text

In [6]:
def preprocess(text):
    text = clean_text(text)
    text = remove_emoji(text)
    text = remove_stopwords(text)
    text = lem_text(text)
    text = text.split()
    return text

In [7]:
def load_data(path):
    with open(path,'r',encoding='utf-8') as file:
        data = json.load(file)
    # print(len(train_data))
    return data

In [8]:
s = "@4mishee @ScottMorrisonMP can keep his vaccine \ud83d\udc89 I\u2019ll take covid with its \u201c non existent \u201c symptoms over some big pharma no liability or indemnity lab \ud83e\uddea cooked vaccine thank you."
t = preprocess(s)
print(t)

['keep', 'vaccine', 'take', 'covid', 'non', 'existent', 'symptom', 'big', 'pharma', 'liability', 'indemnity', 'lab', 'cooked', 'vaccine', 'thank']


In [9]:
# train_data =load_data("train.json")
# words = set()
# for i in train_data:
#     for k,v in i['labels'].items():
#         sent_arr = set(preprocess(i['tweet']))
#         if len(sent_arr) == 0:
#             print(i['tweet'])
#             train_data.remove(i)
#             continue        
#         for j in v:
#             sent_arr = set(preprocess(j['terms']))
#             words = words | sent_arr




# words = ["","UNK"]+list(words)
# print(len(words))

In [10]:
# import spacy
# nlp = spacy.load('en_core_web_sm')
# tokenizer = get_tokenizer('spacy')
train_data =load_data("train.json")
words = set()
for i in train_data:
    sent_arr = set(preprocess(i['tweet']))
    if len(sent_arr) == 0:
        print(i['tweet'])
        train_data.remove(i)
        continue
    words = words | sent_arr



words = ["","UNK"]+list(words)
print(len(words))

[The Oxford-AstraZeneca vaccine also seems to produce relatively high rates of adverse events. If you want to dig further into this vaccine’s story and issues, I’ve laid out a more detailed rundown of the Oxford-AstraZeneca trials and sources here.] https://t.co/youZFn8KUr
10716


In [11]:
# from collections import Counter
# train_data =load_data("train.json")
# counts = Counter()
# for i in train_data:
#     sent_arr = (preprocess(i['tweet']))
#     if len(sent_arr) == 0:
#         print(i['tweet'])
#         train_data.remove(i)
#     counts.update(sent_arr)


# print("num_words before:",len(counts.keys()))
# for word in list(counts):
#     if counts[word] < 2:
#         del counts[word]
# print("num_words after:",len(counts.keys()))
# words = ["", "UNK"]
# for word in counts:
#     words.append(word)

In [12]:
from collections import Counter
# train_data =load_data("train.json")

counts = Counter()
for i in train_data:
    for k,v in i['labels'].items():
            a = []
            a.append(k)
            counts.update(a)
sum = 0 
for k,v in counts.items():
    print(f"{k},{v}")
    value = np.clip(v,a_min=300,a_max=None)
    # value = v
    counts[k] =value
    sum+= value
weigh_prob= []
# for k,v in counts.items():
#     print(k,v)
for k,v in counts.items():
    p =sum/v
    weigh_prob.append(p)
weight =torch.tensor(weigh_prob)
sum_value = torch.sum(weight)

# Normalize the tensor so that the sum of its elements is equal to one
weight = weight / sum_value
weight

ineffective,1171
unnecessary,503
pharma,889
rushed,1031
side-effect,2662
mandatory,548
country,140
ingredients,304
political,437
none,440
conspiracy,341
religious,45


tensor([0.0347, 0.0809, 0.0458, 0.0394, 0.0153, 0.0742, 0.1356, 0.1338, 0.0931,
        0.0924, 0.1193, 0.1356], dtype=torch.float64)

In [13]:
tag_list=["ineffective","unnecessary","pharma","rushed","side-effect","mandatory","country","ingredients","political","none","conspiracy","religious"]
tag_to_idx={}
for i in range(len(tag_list)):
    tag_to_idx[tag_list[i]]=i

In [14]:
vocab2idx = {k:v for v,k in enumerate(words)}
vocab_size = len(words)
vocab_size

10716

In [15]:
def load_glove_vectors(glove_file="glove.42B.300d.txt"):
    word_vectors = {}
    with open(glove_file) as f:
        for line in f :
            s = line.split(' ')
            word_vectors[s[0]] = np.array([float(x) for x in s[1:]])
    return word_vectors

In [16]:
def get_emb_matrix(pretained,words,emb_size=300):
    W = np.zeros((len(words),emb_size),dtype="float32")
    W[0] = np.zeros(emb_size,dtype="float32")
    W[1] = np.random.uniform(-0.25,0.25,emb_size)
    i = 2
    count = 0 
    no_use = []
    for word in words[2:]:
        if word in pretained:
            W[i] = pretained[word]
            count +=1
        else:
            W[i] =  np.random.uniform(-0.25,0.25,emb_size)
            no_use.append(word)
        i+=1
    rate = count/len(words)
    print("coverage rate: " ,round(rate, 4))
    print(no_use)
    print(len(no_use))
    return W

In [17]:
# test for some special unicode
punctuation = "·"

for char in punctuation:
    unicode_code = ord(char)
    print(f"Character: {char}, Unicode Code: {hex(unicode_code)}")

Character: ·, Unicode Code: 0xb7


In [18]:
import joblib
word_vecs = load_glove_vectors()
pretrain_weights = get_emb_matrix(word_vecs,words,300)
# joblib.dump(pretrain_weights, 'pretrain.pkl')

coverage rate:  0.9619
['countiune', 'bourlanot', 'livehood', 'illaria', 'vaxxers', 'hesitants', 'covidhoax', 'weirddddd', 'adiuvants', 'ginuea', 'childtem', 'clynick', 'bullthis', 'adment', 'extg', 'relibility', 'trumpwill', 'waxxed', 'funvax', 'umsuzwane', 'cominarty', 'drumps', 'drumph', 'covfefe', 'vanhi', 'surgisphere', 'unvaxxed', 'zenaca', 'vaxine', 'inru', 'whadduh', 'noorse', 'itlikehe', 'legalðical', 'sincehow', 'prention', 'sunak', 'sokolka', 'brexiteers', 'dyatt', 'thsts', 'fdacber', 'covishield', 'nanoparticules', 'vaxies', 'lemaooo', 'hudroxychloroquine', 'superspreaders', 'danielauhlig', 'elens', 'vqccine', 'vaxxx', 'sollisch', 'phillipot', 'attentuated', 'bionrech', 'youtrumpsters', 'casedemic', 'covac', 'luciferease', 'coronavac', 'intereferes', 'nutsnuts', 'safetyðical', 'cgtn', 'digong', 'authoritans', 'scamdemic', 'covice', 'pisazz', 'triggernometry', 'asymtomatic', 'reinfects', 'oxford–astrazeneca', 'kunyiwe', 'compromisseddemorats', 'andmuskrat', 'toxicvaccines', 

In [19]:


class CustomDataset(Dataset):
    def __init__(self,data, tokenizer,max_length,words):
        self.data = data 
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.words = words

    def __len__(self):
        return len(self.data)
    def __getitem__(self, index) :
        tweet = self.data[index]["tweet"]
        labels = self.data[index]["labels"]
        encode_text = self.encode(tweet,self.words)
        encode_label = self.label_encoded(labels)
        seq_len = np.count_nonzero(encode_text)
        # seq_len = len(encode_text)
        return encode_text , encode_label , seq_len
    def encode(self,text,vocab2idx):
        token = self.tokenizer(text)
        encoded = np.zeros(self.max_length,dtype=int)
        enc1 = np.array([vocab2idx.get(word,vocab2idx["UNK"]) for word in token] )
        # print(enc1.shape)
        length = min(self.max_length,len(enc1))
        encoded[:length] = enc1[:length]
        
        return encoded 
    def label_encoded(self,labels):
        encode_label = np.zeros(12,dtype=int)
        list_label = list(labels.keys())
        for i in list_label:
            idx = tag_to_idx[i]
            encode_label[idx] = 1
        # encode_label = torch.LongTensor(encode_label)
        return encode_label 

In [20]:
max_length = 300
train_data = CustomDataset(train_data,preprocess,max_length,vocab2idx)
train_loader = DataLoader(train_data,batch_size=32)

In [21]:
val_data =load_data('val.json')
for i in val_data:
    sent_arr = set(preprocess(i['tweet']))
    if len(sent_arr) == 0:
        print(i)
        val_data.remove(i)
        continue
val_data = CustomDataset(val_data,preprocess,max_length,vocab2idx)
val_dataloader = DataLoader(val_data,batch_size=1)

{'ID': '1397651228679675907', 'tweet': '      ', 'labels': {'none': [{'index': 0, 'start': 0, 'end': 1, 'terms': '@HHSGov'}]}}


In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Now is using {device} device")

Now is using cuda device


In [23]:
import torch.nn.functional as F
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

class pretrain_weight_LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim,hidden_dim,pretrain_weights,bidirectional,dropout_rate=0.6) :
        super().__init__()
        self.hidden_dim = hidden_dim
        # Embedding
        self.embeddings = nn.Embedding(vocab_size,embedding_dim,padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(pretrain_weights))
        self.embeddings.weight.requires_grad = True 
        
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,num_layers=1,bidirectional=bidirectional,batch_first=True)
        self.embedding_dropout = SpatialDropout(dropout_rate)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        h = int(hidden_dim*3)
        self.fc1 = nn.Linear(in_features=hidden_dim*2,out_features=h)
        self.droupt1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(h,12)
        self.droupt2 = nn.Dropout(dropout_rate)


        # Attention layer weight
        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim*2,hidden_dim*2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim*2,1))
        self.word_bias = nn.Parameter(torch.Tensor(1, 2 * hidden_dim))

        # Uniform attention layer parameter weight by xavier
        nn.init.xavier_uniform_(self.w_omega)
        nn.init.xavier_uniform_(self.u_omega)
    def forward(self,x,s):
        # Word embeeding
        embeed = self.embeddings(x)
        embeed = self.embedding_dropout(embeed)
        # lstm
        x_pack = pack_padded_sequence(embeed,s.to('cpu'),batch_first=True, enforce_sorted=False)
        pack_out , (ht,ct) = self.lstm(x_pack)
        x,length =pad_packed_sequence(pack_out, batch_first=True)

        # attention Layer
        u = torch.tanh(torch.matmul(x,self.w_omega)+self.word_bias)
        att = torch.matmul(u,self.u_omega)
        att_score = F.softmax(att,dim=1)
        score_x = x*att_score
        ht = torch.sum(score_x,dim=1)
        ht = self.tanh(ht)
        # FC layer
        # ht = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        out = self.droupt1(ht)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.droupt2(out)
        out = self.fc2(out)
        return out ,att_score

In [24]:
def train(dataloader,model,loss_fn,optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch , (X,y,seq) in enumerate(dataloader):
        X,y =X.to(device),y.to(device,dtype=torch.float)
        # print(seq)
        pred ,attention= model(X,seq)
        # print(pred.dtype)
        loss = loss_fn(pred,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch%200 ==0:
            loss,curr = loss.item() , batch * len(X)
            print(f"loss: {loss:>7f} [{curr:>5d}/{size:>5d}]")
    return attention

In [25]:
def evalute(dataloader,model,loss_fn):
    model.eval()
    y_pred = []
    y_target = []
    with torch.no_grad():
        for X,y,seq in dataloader:
            X,y = X.to(device) , y.to(device,dtype=torch.float)
            pred ,attention= model(X,seq)
            y_pred.extend(torch.sigmoid(pred).cpu().detach().numpy().tolist())
            y_target.extend(y.cpu().detach().numpy().tolist())
    y_preds = (np.array(y_pred)>0.5).astype(int)
    marco_f1= f1_score(y_target,y_preds,average='macro')
    print("marco f1 score : ",marco_f1)
    return marco_f1

In [26]:
def model_function(bidirectional,embedding_dim,hidden_dim,epochs,weight,weight_path):
    model_2 = pretrain_weight_LSTM(vocab_size,embedding_dim,hidden_dim,pretrain_weights,bidirectional).to(device)
    if weight == None:
        loss_fn = nn.BCEWithLogitsLoss()
    else:
        loss_fn = nn.BCEWithLogitsLoss(weight=weight)
    optimizer = torch.optim.Adam(model_2.parameters(),lr=8e-4)
    max_score = 0 
    for t in range(epochs):
        print(f"Epoch {t+1}\n-----------------------")
        attention = train(train_loader,model_2,loss_fn,optimizer)
        score = evalute(val_dataloader,model_2,loss_fn)
        # print(f"max score in {t+1} epoch : ",score)
        if score > max_score:
            max_score =score
            path = weight_path
            torch.save(model_2.state_dict(),path)
            print('save model')
    print("best score: " ,max_score)
    return model_2,attention

In [27]:
# Hyper parameter
bidirectional = True
embedding_dim = 300
hidden_dim = 64*5
epochs = 30
# weight = weight.to(device)
weight =None

In [28]:
weight_path = "./w_weight2.pth"
model_2,attention=model_function(bidirectional,embedding_dim,hidden_dim,epochs,weight,weight_path)

Epoch 1
-----------------------


loss: 0.691091 [    0/ 6955]
loss: 0.224145 [ 6400/ 6955]
marco f1 score :  0.06445115810674723
save model
Epoch 2
-----------------------
loss: 0.286297 [    0/ 6955]
loss: 0.198416 [ 6400/ 6955]
marco f1 score :  0.2274333108095453
save model
Epoch 3
-----------------------
loss: 0.246973 [    0/ 6955]
loss: 0.217835 [ 6400/ 6955]
marco f1 score :  0.3558912436052402
save model
Epoch 4
-----------------------
loss: 0.190965 [    0/ 6955]
loss: 0.180663 [ 6400/ 6955]
marco f1 score :  0.42232914795293947
save model
Epoch 5
-----------------------
loss: 0.195355 [    0/ 6955]
loss: 0.201576 [ 6400/ 6955]
marco f1 score :  0.4771359888090924
save model
Epoch 6
-----------------------
loss: 0.153664 [    0/ 6955]
loss: 0.159435 [ 6400/ 6955]
marco f1 score :  0.5149387028280817
save model
Epoch 7
-----------------------
loss: 0.128814 [    0/ 6955]
loss: 0.136494 [ 6400/ 6955]
marco f1 score :  0.5150551433635039
save model
Epoch 8
-----------------------
loss: 0.122493 [    0/ 6955]
los

In [29]:
class testDataset(Dataset):
    def __init__(self,data, tokenizer,max_length,words):
        self.data = data 
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.words = words

    def __len__(self):
        return len(self.data)
    def __getitem__(self, index) :
        tweet = self.data[index]["tweet"]
        encode_text = self.encode(tweet,self.words)
        seq_len = np.count_nonzero(encode_text)
        return encode_text  , seq_len
    def encode(self,text,vocab2idx):
        token = self.tokenizer(text)
        encoded = np.zeros(self.max_length,dtype=int)
        enc1 = np.array([vocab2idx.get(word,vocab2idx["UNK"]) for word in token] )
        # print(enc1.shape)
        length = min(self.max_length,len(enc1))
        encoded[:length] = enc1[:length]
        return encoded 

In [30]:
test_data = load_data("test.json")
test_data = testDataset(test_data,preprocess,max_length,vocab2idx)
test_loader = DataLoader(test_data,batch_size=1)

In [31]:
path = './w_weight2.pth'
model_2 = pretrain_weight_LSTM(vocab_size,embedding_dim,hidden_dim,pretrain_weights,bidirectional).to(device)
model_2.load_state_dict(torch.load(path))
model_2.eval()
y_pred = []
with torch.no_grad():
    for x,seq_len in test_loader:
        x = x.to(device)
        pred ,attention= model_2(x,seq_len)
        y_pred.extend(torch.sigmoid(pred).cpu().detach().numpy().tolist())

In [32]:
y_preds = (np.array(y_pred) > 0.5).astype(int)
y_preds

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
df = pd.DataFrame(y_preds, columns=tag_list)
df_reset = df.reset_index()
# print(df_reset.head())

In [34]:
data_rows = df_reset.to_dict(orient='records')

In [35]:
import csv
with open("submission.csv", 'w', newline='') as csvfile:
    fieldnames=["index","ineffective","unnecessary","pharma","rushed","side-effect","mandatory","country","ingredients","political","none","conspiracy","religious"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data_rows)