In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import torchtext.data as data
import pandas as pd

import string
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
device = torch.device("cuda: 0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
torch.set_printoptions(precision=20)
SEED = 2020
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
DATA_DIR = "data/train.csv"
PREPROCESS_DATA_DIR = "data/preprocessed.csv"
TEST_DIR = "data/test.csv"
PREPROCESS_TEST_DIR = "data/preprocessed_test.csv"

In [5]:
traindata = pd.read_csv(DATA_DIR)
print(traindata.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [6]:
MAKE_PREPROCESSED_DATA = 0
TRAIN_DATA = 0

In [7]:
punctuations = string.punctuation
stopwords_list = stopwords.words("english")
spacy_tokenizer = torchtext.data.utils.get_tokenizer('spacy')
lemmatizer = WordNetLemmatizer()
def preprocessing(text):
  
    def tokenizer(text):
        text = str.split(text)
        return text
    
    def remove_punctuations(sentence):
        result = "".join([w if w not in punctuations and not w.isdigit() else " " for w in sentence])
        return result
    
    def word_lemmatizer(sentence):
        result = lemmatizer.lemmatize(sentence)
        return result
    
    def word_lowercase(sentence):
        return sentence.lower()
    
    def remove_URL(text):
        url = re.compile(r'https?://\S+|www\.\S+')
        html=re.compile(r'<.*?>')
        text = html.sub(r'',text)
        text = url.sub(r'',str(text))
        return text
  
    def remove_newline(text):
        return text.rstrip("\n")
    
    def clean(sentence):
        result = []
        sentence = remove_newline(sentence)
        sentence = remove_URL(sentence)
        sentence = word_lowercase(sentence)
        sentence = word_lemmatizer(sentence)
        sentence = remove_punctuations(sentence)
        sentence = tokenizer(sentence)

        result = " ".join(sentence)
        return result
    
    #result = generate_bigrams(result)   
    text = clean(text)
    if text == "":
        text = "None"
    return text

In [8]:
import csv
from tqdm import tqdm

if MAKE_PREPROCESSED_DATA:
    with open(TEST_DIR, "r", encoding="utf8") as in_csv, open(PREPROCESS_TEST_DIR, "w", newline="", encoding="utf8") as out_csv:
        reader = csv.reader(in_csv)
        writer = csv.writer(out_csv)
        next(reader, None) # Skip header
        for row in tqdm(reader):
            row[1] = preprocessing(row[1])
            try:
                writer.writerow(row)
            except Exception as e:
                print(e)

In [9]:
def mytokenizer(sentence):
    tokens = str.split(sentence)
    return tokens

In [10]:
TEXT = data.Field(batch_first = True,
                  tokenize = mytokenizer,
                  stop_words=stopwords_list)
LABEL = data.LabelField(dtype = torch.float)
ID = data.LabelField()
ID2 = data.Field(sequential=False)

In [11]:
FIELDS = [["id",ID], ["text", TEXT], ["toxic",LABEL],["s_toxic",LABEL],
          ["obscene",LABEL],["threat",LABEL],["insult",LABEL],["id_hate",LABEL]]
TEST_FIELDS = [["id",ID2], ["text", TEXT]]

dataset = data.TabularDataset(PREPROCESS_DATA_DIR,
                              format = "csv",
                              fields=FIELDS,
                              skip_header=True)

testset = data.TabularDataset(PREPROCESS_TEST_DIR,
                              format= "csv",
                              fields=TEST_FIELDS,
                             skip_header=False)

In [12]:
print(vars(dataset.examples[1]))

{'id': '000103f0d9cfb60f', 'text': ['aww', 'matches', 'background', 'colour', 'seemingly', 'stuck', 'thanks', 'talk', 'january', 'utc'], 'toxic': '0', 's_toxic': '0', 'obscene': '0', 'threat': '0', 'insult': '0', 'id_hate': '0'}


In [13]:
import random

trainset, valset = dataset.split(split_ratio=0.8, random_state=random.seed(SEED))
print(f"Number of training samples: {len(trainset)}")
print(f"Number of validating samples: {len(valset)}")

Number of training samples: 127657
Number of validating samples: 31914


In [14]:
print(vars(ID2))

{'sequential': False, 'use_vocab': True, 'init_token': None, 'eos_token': None, 'unk_token': '<unk>', 'fix_length': None, 'dtype': torch.int64, 'preprocessing': None, 'postprocessing': None, 'lower': False, 'tokenizer_args': (None, 'en'), 'tokenize': <function _split_tokenizer at 0x000001F0F56733A8>, 'include_lengths': False, 'batch_first': False, 'pad_token': None, 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False}


In [17]:
MAX_VOCAB_SIZE = 40000
TEXT.build_vocab(trainset,
                 min_freq = 3,
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(trainset)
ID.build_vocab(trainset)
ID2.build_vocab(testset)
print(f"Unique words in training set: {len(TEXT.vocab)}")
print(f"Unique labels in training set: {len(LABEL.vocab)}")

Unique words in training set: 40002
Unique labels in training set: 2


In [23]:
print(ID2.vocab.freqs.most_common(10))

[('00001cee341fdb12', 1), ('0000247867823ef7', 1), ('00013b17ad220c46', 1), ('00017563c3f7919a', 1), ('00017695ad8997eb', 1), ('0001ea8717f6de06', 1), ('00024115d4cbde0f', 1), ('000247e83dcc1211', 1), ('00025358d4737918', 1), ('00026d1092fe71cc', 1)]


In [24]:
BATCH_SIZE = 64

train_iter, val_iter = data.BucketIterator.splits((trainset, valset),
                                                  batch_size=BATCH_SIZE,
                                                  device = device)

test_iter = data.BucketIterator(testset,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                device = device)

In [25]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [26]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 6
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-4,weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss().to(device)
print(model)

CNN(
  (embedding): Embedding(40002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=6, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,122,306 trainable parameters


In [28]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [29]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

In [30]:
def get_labels(batch):
    toxic = batch.toxic.unsqueeze(1)
    s_toxic = batch.s_toxic.unsqueeze(1)
    obscene = batch.obscene.unsqueeze(1)
    threat = batch.threat.unsqueeze(1)
    insult = batch.insult.unsqueeze(1)
    id_hate = batch.id_hate.unsqueeze(1)
    labels = torch.cat((toxic,s_toxic,obscene,
                        threat,insult,id_hate),dim=1)
    return labels

In [31]:
from tqdm import tqdm
def train_step(model, optimizer, criterion, batch):
    batch_size = len(batch)
    model.train()
    
    optimizer.zero_grad()
    text = batch.text.view(batch_size, -1)
    labels = get_labels(batch)

    outputs = model(text)
    loss = criterion(outputs,labels)
    loss.backward()
    optimizer.step()

    return loss.item()

In [32]:
if TRAIN_DATA:
    EPOCHS = 3
    loss_list = []
    print("Start training...")
    for epoch in range(EPOCHS):
        for i, batch in enumerate(train_iter):
            train_loss = train_step(model,optimizer, criterion, batch)

            if i%400 == 0:
                print(f"Epoch: [{epoch+1}/{EPOCHS}] | Iterations: [{i+1}/{len(train_iter)}] | Training loss: {train_loss:.3f}")
                torch.save(model.state_dict(), "model/modelCNN.pt")
        #loss_list.append(train_loss)
    print("Training Completed!")

In [33]:
if not TRAIN_DATA:
    model.load_state_dict(torch.load("model/modelCNN.pt"))

In [34]:

def predict_test(model, test_iter):
    result = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_iter):
            batch_size = len(batch)
            text = batch.text.view(batch_size,-1).long()
            ids = batch.id.squeeze().cpu()
            output = model(text)
            output = torch.sigmoid(output).cpu()
            for i,j in zip(ids,output):
                result.append([ID2.vocab.itos[i.numpy()],j.numpy()])
    return result

In [35]:
result = predict_test(model, test_iter)

100%|█████████████████████████████████████████████████████████████████████████████| 2394/2394 [00:20<00:00, 114.41it/s]


In [36]:
print(result[0][1])

[0.9878593  0.04721861 0.90824836 0.00333032 0.73442763 0.04421253]


In [37]:
import csv
with open("submission.csv", "w", newline="", encoding="utf8") as f:
    writer = csv.writer(f)
    writer.writerow(["id"]+["toxic"]+["severe_toxic"]+["obscene"]+["threat"]+["insult"]+["identity_hate"])
    for line in tqdm(result):
        writer.writerow([line[0]] + [line[1][i] for i in range(6)])

100%|██████████████████████████████████████████████████████████████████████| 153164/153164 [00:00<00:00, 158323.45it/s]


In [42]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction

In [43]:
text = preprocessing("yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother too fuckin white boys get things right next time")
print(text)

yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother too fuckin white boys get things right next time


In [44]:
print(predict_sentiment(model,text))

tensor([[0.99372923374176025391, 0.11364216357469558716, 0.95650070905685424805,
         0.00829244125634431839, 0.85854005813598632812, 0.08795237541198730469]],
       device='cuda:0', grad_fn=<SigmoidBackward>)
