## Data Loading and Processing

In [142]:
from __future__ import print_function, division
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [11]:
class ToxicDataset(Dataset):
    """Map-style dataset that serves one value per row of a CSV file.

    Each item is the value found in the second column (position 1) of the
    loaded frame, optionally passed through ``transform``.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """Load ``root_dir/csv_file`` into memory.

        Args:
            csv_file: File name of the CSV, relative to ``root_dir``.
            root_dir: Directory that contains the CSV file.
            transform: Optional callable applied to every returned sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.dataframe = pd.read_csv(os.path.join(root_dir, csv_file))

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the column-1 value of row ``idx``, transformed if requested."""
        raw_value = self.dataframe.iloc[idx, 1]
        return self.transform(raw_value) if self.transform else raw_value
        

In [12]:
# Load data/train.csv without any per-sample transform
toxic_dataset = ToxicDataset(csv_file="train.csv", root_dir="data")

In [14]:
# Peek at the sample stored for row 1 (the raw value from column position 1)
toxic_dataset[1]

'"\n\n Please do not vandalize pages, as you did with this edit to W. S. Merwin. If you continue to do so, you will be blocked from editing.    "'

## Deep Learning for NLP with PyTorch
### Deep learning with PyTorch

In [110]:
import string

import pandas as pd
import numpy as np

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that randomly initialised values are repeatable
torch.manual_seed(1)

<torch._C.Generator at 0x104ec83d0>

In [111]:
train = pd.read_csv("data/train.csv")

In [113]:
# Helper used to flag comments that carry no toxicity label at all
def none_marker(x):
    """Return 0 when the label count ``x`` is positive, otherwise 1."""
    return 0 if x > 0 else 1
# Columns from position 2 onwards are treated as the 0/1 label flags
# (assumption based on how they are summed here -- confirm against the CSV schema).
# A pre-existing 'none' column is excluded so that re-running this cell does not
# count the synthetic label as if it were one of the original ones.
toxic_cols = [col for col in train.columns[2:] if col != 'none']
toxic_counts = train[toxic_cols].sum(axis=1)

# 'none' is 1 exactly when no original label is set on the row
train['none'] = toxic_counts.apply(none_marker)

# Remove multi-labels for now...
# Despite its name, this mask is True for the rows we KEEP: rows with at most one
# original label, which (together with 'none') have exactly one positive column.
multilabels_ix = toxic_counts < 2
train_one_label = train.loc[multilabels_ix, :]

# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
labels = train_one_label.iloc[:, 2:].values
labels_ix = train_one_label.columns[2:]

# (row positions, column positions) of every positive label
ix_1 = np.where(labels == 1)

In [119]:
# Translate each positive column position (second array of ix_1) into its column name
labels_str = np.array(labels_ix[ix_1[1]].tolist())
def transform_comment(comment):
    """Tokenise a comment: drop ASCII punctuation, lower-case, split on whitespace.

    Args:
        comment: The raw comment text.

    Returns:
        A list of lower-cased word tokens (empty for an empty comment).
    """
    # The two-argument form str.translate(None, chars) only exists for Python 2
    # byte strings and raises TypeError on Python 3; filtering characters
    # explicitly behaves the same on both.
    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in comment if ch not in punctuation)
    return without_punctuation.lower().split()
# Tokenise every remaining comment; the result is a Series of token lists
comment_text = train_one_label['comment_text']
comment_split = comment_text.apply(transform_comment)

In [122]:
# Pair each token list with its label name.
# Materialise the pairs as a list: on Python 3 `zip` returns a one-shot iterator,
# which supports neither `len()` nor the slicing used for the train/test split.
data = list(zip(comment_split, labels_str))

In [125]:
# Split between train and test: the first 80% of the pairs (in their existing order)
# become the training set and the remainder the test set.
# NOTE: this rebinds `train`, which previously referred to the raw DataFrame.
split = int(0.8 * len(data))
train, test = data[:split], data[split:]

In [126]:
# Sanity check: the two partitions together must account for every pair
len(data) == (len(train) + len(test))

True

In [134]:
# Assign every distinct word and every distinct label a unique integer,
# numbered in order of first appearance across train followed by test.
all_pairs = train + test

word_to_ix = {}
label_to_ix = {}
for sent, label in all_pairs:
    for word in sent:
        # setdefault only inserts when the key is new, so existing ids are stable
        word_to_ix.setdefault(word, len(word_to_ix))
    label_to_ix.setdefault(label, len(label_to_ix))

In [136]:
# Input and output dimensions of the classifier
VOCAB_SIZE, NUM_LABELS = len(word_to_ix), len(label_to_ix)

In [137]:
# Report the number of distinct words and the number of distinct labels
print(VOCAB_SIZE)
print(NUM_LABELS)

178059
6


In [145]:
class BoWClassifier(nn.Module):
    """Multinomial logistic regression over bag-of-words count vectors."""

    def __init__(self, num_labels, vocab_size):
        """Create the single affine layer.

        Args:
            num_labels: Number of output classes.
            vocab_size: Length of the input count vector.
        """
        super(BoWClassifier, self).__init__()
        # One linear map from "vocab_size" inputs to "num_labels" outputs
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return per-row log-probabilities over the labels for ``bow_vec``."""
        logits = self.linear(bow_vec)
        # Normalise across the label dimension (dim 1)
        return F.log_softmax(logits, dim=1)

def make_bow_vector(sentence, word_to_ix):
    """Build a bag-of-words count vector for one tokenised sentence.

    Args:
        sentence: Iterable of word tokens.
        word_to_ix: Dict mapping each known word to its column index.

    Returns:
        A float tensor of shape ``(1, len(word_to_ix))`` holding the number of
        times each known word occurs. Tokens that are not in ``word_to_ix``
        are skipped instead of raising ``KeyError``, so text containing unseen
        words can still be vectorised.
    """
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        ix = word_to_ix.get(word)
        if ix is None:
            # Out-of-vocabulary token: it has no column, so it contributes nothing
            continue
        vec[ix] += 1
    return vec.view(1, -1)  # single row with many columns

def make_target(label, label_to_ix):
    """Return the integer id of ``label`` wrapped in a one-element LongTensor."""
    label_id = label_to_ix[label]
    return torch.LongTensor([label_id])

In [146]:
# One output per label, one input per vocabulary word
model = BoWClassifier(num_labels=NUM_LABELS, vocab_size=VOCAB_SIZE)

In [150]:
# The model has two parameters:
#   weight: NUM_LABELS x VOCAB_SIZE matrix (targets x inputs)
#   bias:   vector of length NUM_LABELS
for parameter in model.parameters():
    print(parameter)

Parameter containing:
 1.2211e-03 -1.0460e-03 -4.5942e-04  ...   1.9309e-03 -5.4459e-04  6.8789e-04
 1.1627e-03 -2.2230e-03 -1.7261e-03  ...  -1.4355e-03  1.0937e-03  1.4988e-03
-9.8975e-04 -2.5441e-04 -4.3393e-04  ...   1.9290e-03  1.2395e-03 -2.2232e-03
-1.7802e-04  1.4664e-03 -1.1496e-03  ...   1.4417e-04 -1.0338e-03 -6.8515e-04
 3.2670e-04 -8.0285e-04 -1.8626e-03  ...   6.6961e-04 -3.2703e-04 -7.3321e-06
-3.8971e-04 -1.8819e-03  7.4758e-04  ...  -1.7646e-03 -1.1262e-03 -9.0539e-04
[torch.FloatTensor of size 6x178059]

Parameter containing:
1.00000e-03 *
  1.7748
 -0.6637
 -1.6992
  0.1864
 -1.0139
 -1.2176
[torch.FloatTensor of size 6]



In [156]:
# Run one forward pass on the first training pair as a smoke test
sample = train[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs) # the model is untrained, so these scores are near-uniform and carry no signal

Variable containing:
-1.7879 -1.7888 -1.7971 -1.7907 -1.7954 -1.7907
[torch.FloatTensor of size 1x6]



In [158]:
# Prediction before training: show the log-probabilities for the first ten test pairs
for tokens, _label in test[:10]:
    untrained_scores = model(autograd.Variable(make_bow_vector(tokens, word_to_ix)))
    print(untrained_scores)

Variable containing:
-1.7876 -1.7915 -1.7875 -1.7955 -1.7976 -1.7910
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7813 -1.8028 -1.7982 -1.7842 -1.7923 -1.7919
[torch.FloatTensor of size 1x6]

Variable containing:
-1.8065 -1.8157 -1.8029 -1.7431 -1.7858 -1.7982
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7961 -1.7895 -1.7924 -1.7953 -1.7908 -1.7865
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7863 -1.7934 -1.7953 -1.7930 -1.7963 -1.7864
[torch.FloatTensor of size 1x6]

Variable containing:
-1.8049 -1.7848 -1.8054 -1.7878 -1.7835 -1.7845
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7972 -1.7883 -1.7780 -1.7963 -1.7956 -1.7952
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7951 -1.7970 -1.7866 -1.7858 -1.8200 -1.7667
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7844 -1.7810 -1.7867 -1.8195 -1.7989 -1.7807
[torch.FloatTensor of size 1x6]

Variable containing:
-1.7808 -1.7922 -1.8100 -1.7866 -1.7966 -1.7847
[torch.FloatTensor of size 1x6]

In [160]:
# Print parameters for 'warmongering'
# The first model parameter is the linear layer's weight matrix; the column at a
# word's index holds that word's weight for each label.
warmongering_ix = word_to_ix['warmongering']
print(model.linear.weight[:, warmongering_ix])

Variable containing:
1.00000e-03 *
 -0.2748
 -1.2245
  0.5559
 -0.6667
  0.3622
 -1.7945
[torch.FloatTensor of size 6]



In [162]:
# Define loss function and optimizing method
# Negative log-likelihood loss is applied to the log_softmax output of the model;
# plain stochastic gradient descent updates the weights.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)
loss_function = nn.NLLLoss()

In [164]:
# Train
# Only the first 100 training pairs are visited, for 100 epochs, one example per
# update. Iterate over all of `train` instead of `train_subset` for a full run.
train_subset = train[:100]
for epoch in range(100):
    for sentence, label in train_subset:
        # Gradients accumulate across backward() calls, so reset them per example
        model.zero_grad()

        # Encode the tokens as a count vector and the label as a class index
        bow_vec = autograd.Variable(make_bow_vector(sentence, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Forward pass
        log_probs = model(bow_vec)

        # Loss, backward pass, then one optimizer update
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

In [181]:
# Predict: for the first ten test pairs, print whether the highest-scoring label
# matches the true one
for sentence, label in test[:10]:
    bow_vec = autograd.Variable(make_bow_vector(sentence, word_to_ix))
    log_probs = model(bow_vec)
    predicted_ix = log_probs.data.numpy().argmax()
    print(predicted_ix == label_to_ix[label])
    

True
True
True
True
True
True
True
True
True
True
