In [1]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [2]:
import os
# Move the working directory one level up. The `mlexplain` package imported
# further down is project-local, so presumably the notebook lives in a
# sub-folder of the repository root (TODO confirm).
# NOTE(review): a relative chdir is not idempotent — every re-run of this cell
# climbs one more directory level.
os.chdir("..")

### Read data and train the model

In [3]:
# The three newsgroups the classifier has to tell apart.
categories = [
    "comp.graphics",
    "sci.space",
    "rec.sport.baseball",
]

# Fetch the train split first, then the test split, both restricted to the
# categories above.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

# Report how many documents each split contains.
print('total texts in train:', len(newsgroups_train.data))
print('total texts in test:', len(newsgroups_test.data))

total texts in train: 1774
total texts in test: 1180


In [4]:
# Other fields of the dataset objects that can be inspected the same way:
#   newsgroups_train.target
#   newsgroups_train.target_names
# Display the class names of the test split. Per the recorded output the order
# is comp.graphics, rec.sport.baseball, sci.space — not the order in which
# `categories` was written.
newsgroups_test.target_names

['comp.graphics', 'rec.sport.baseball', 'sci.space']

In [5]:
# Raw documents of the two splits; each of them is a list of strings.
data_train, data_test = newsgroups_train.data, newsgroups_test.data

In [56]:
# NOTE(review): execution count 56 — this cell was run out of order.
# `total_words` is only assigned in the vocabulary cell further down, so on a
# fresh kernel (Restart & Run All) this line raises NameError; the cell belongs
# after the vocabulary is built.
print(total_words)

119930


In [6]:
# Count every lower-cased, single-space-delimited token. Both splits are
# counted (train first, then test), so tokens that occur only in the test
# split also get a vocabulary entry.
vocab = Counter()

for dataset in (newsgroups_train, newsgroups_test):
    for text in dataset.data:
        for word in text.split(' '):
            vocab[word.lower()] += 1

# Vocabulary size = number of distinct tokens.
total_words = len(vocab)

def get_word_2_index(vocab):
    """Build the two lookup tables between tokens and integer indices.

    Args:
        vocab: iterable of string tokens. The position of a token in the
            iteration order becomes its index.

    Returns:
        A ``(word2index, index2word)`` tuple of dicts: lower-cased token to
        index, and index to lower-cased token. If two tokens differ only in
        case, ``word2index`` keeps the index of the later one.
    """
    lowered = [word.lower() for word in vocab]
    word2index = {word: i for i, word in enumerate(lowered)}
    index2word = dict(enumerate(lowered))
    return word2index, index2word

# Lookup tables for the whole vocabulary: word2index is read by the
# bag-of-words encoders below, index2word is handed to the explainer later on.
word2index, index2word = get_word_2_index(vocab)

In [7]:
def convert2bow(texts):
    """Encode texts as a dense bag-of-words count matrix.

    Every text is split on single spaces and each token is lower-cased before
    it is looked up in the module-level ``word2index``.

    Args:
        texts: sequence of strings.

    Returns:
        Array of shape ``(len(texts), total_words)``; entry ``[r, c]`` is the
        number of times the token with index ``c`` occurs in text ``r``.

    Raises:
        KeyError: if a token is not a key of ``word2index``.
    """
    counts = np.zeros((len(texts), total_words))
    # Iterating the matrix yields row views, so the in-place increments below
    # write straight into `counts`.
    for row, document in zip(counts, texts):
        for token in document.split(' '):
            row[word2index[token.lower()]] += 1
    return counts

# Dense float32 bag-of-words tensors for both splits.
X_train = torch.tensor(convert2bow(data_train), dtype=torch.float32)
X_test = torch.tensor(convert2bow(data_test), dtype=torch.float32)

# Labels of both splits ...
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# ... and the class names that belong to them.
y_train_names = newsgroups_train.target_names
y_test_names = newsgroups_test.target_names

In [8]:
def get_batch(df,i,batch_size):
    """Return the ``i``-th mini-batch of ``df`` as bag-of-words rows and labels.

    Args:
        df: dataset object exposing ``data`` (sequence of texts) and ``target``
            (sequence of integer class labels); both are sliced in parallel.
        i: zero-based batch index.
        batch_size: number of consecutive samples per batch.

    Returns:
        ``(batches, results)``: a float array of shape ``(n, total_words)``
        holding token counts and an int64 array of shape ``(n,)`` holding the
        labels, where ``n`` is the number of samples in the slice (fewer than
        ``batch_size`` when the slice runs past the end of the data).

    Raises:
        KeyError: if a token of a text is not a key of ``word2index``.
    """
    start = i * batch_size
    stop = start + batch_size

    # Reuse the shared encoder instead of keeping a second copy of the counting
    # loop; it also keeps the (0, total_words) shape for an empty slice.
    batches = convert2bow(df.data[start:stop])

    # Labels are passed through unchanged (copied, as int64). The former
    # if/elif chain returned 0 and 1 as-is and turned every other value into 2:
    # the identity for the three classes used here, but it would silently merge
    # classes as soon as more categories were added.
    results = np.array(df.target[start:stop], dtype=np.int64)

    return batches, results

In [9]:
# Training parameters
# Step size handed to the optimizer.
learning_rate = 0.01
# Number of passes over the training split.
num_epochs = 10
# Number of documents per mini-batch.
batch_size = 150
# NOTE(review): not referenced by any visible cell.
display_step = 1

# Network Parameters
hidden_size = 100      # width of both hidden layers
input_size = total_words # one input feature per vocabulary entry
num_classes = 3         # label order per target_names: graphics (0), baseball (1), sci.space (2)

In [10]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [11]:
class OurNet(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output layer.

    Args:
        input_size: number of input features.
        hidden_size: width of each of the two hidden layers.
        num_classes: number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Layers are created in the order in which forward() applies them.
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x`` (no softmax is applied)."""
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.output_layer(hidden)

In [12]:
# Small CrossEntropyLoss demo.
#   input : raw scores, shape [batch_size, n_classes]
#   target: class index of every item, shape [batch_size]
loss = nn.CrossEntropyLoss()
# Tensors carry autograd state themselves, so the deprecated Variable wrapper
# is not needed. (The name `input` shadows the builtin; it is kept so the
# notebook namespace stays as before.)
input = torch.randn(2, 5, requires_grad=True)
print(">>> batch of size 2 and 5 possible classes")
print(input)
# Two random class indices drawn from {0, ..., 4}.
target = torch.empty(2, dtype=torch.long).random_(5)
print(">>> array of size 'batch_size' with the index of the maxium label for each item")
print(target)
output = loss(input, target)
# Back-propagate through the scalar loss to populate input.grad.
output.backward()

>>> batch of size 2 and 5 possible classes
tensor([[-0.1876,  0.3852, -0.1482, -0.7646,  1.9844],
        [-2.2591, -0.4216,  1.6578, -0.9174,  1.1429]], requires_grad=True)
>>> array of size 'batch_size' with the index of the maxium label for each item
tensor([1, 2])


In [13]:
net = OurNet(input_size, hidden_size, num_classes)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Number of full mini-batches per epoch. It does not change between epochs, so
# it is computed once; a trailing partial batch is skipped, as before.
total_batch = len(newsgroups_train.data) // batch_size

# Train the Model
for epoch in range(num_epochs):
    # Loop over all batches
    for i in range(total_batch):
        batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)
        # Plain tensors are enough for autograd; the deprecated Variable
        # wrapper is dropped.
        articles = torch.tensor(batch_x, dtype=torch.float32)
        labels = torch.tensor(batch_y, dtype=torch.long)

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the loss of every 4th step.
        if (i+1) % 4 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, total_batch, loss.item()))

Epoch [1/10], Step [4/11], Loss: 1.2230
Epoch [1/10], Step [8/11], Loss: 0.4193
Epoch [2/10], Step [4/11], Loss: 0.0165
Epoch [2/10], Step [8/11], Loss: 0.0070
Epoch [3/10], Step [4/11], Loss: 0.2926
Epoch [3/10], Step [8/11], Loss: 0.0042
Epoch [4/10], Step [4/11], Loss: 0.0003
Epoch [4/10], Step [8/11], Loss: 0.0001
Epoch [5/10], Step [4/11], Loss: 0.0000
Epoch [5/10], Step [8/11], Loss: 0.0000
Epoch [6/10], Step [4/11], Loss: 0.0000
Epoch [6/10], Step [8/11], Loss: 0.0001
Epoch [7/10], Step [4/11], Loss: 0.0000
Epoch [7/10], Step [8/11], Loss: 0.0000
Epoch [8/10], Step [4/11], Loss: 0.0000
Epoch [8/10], Step [8/11], Loss: 0.0000
Epoch [9/10], Step [4/11], Loss: 0.0000
Epoch [9/10], Step [8/11], Loss: 0.0000
Epoch [10/10], Step [4/11], Loss: 0.0000
Epoch [10/10], Step [8/11], Loss: 0.0000


In [14]:
# Test the Model
correct = 0
total = 0
total_test_data = len(newsgroups_test.target)
# One batch that spans the whole test split.
batch_x_test, batch_y_test = get_batch(newsgroups_test, 0, total_test_data)
articles = torch.tensor(batch_x_test, dtype=torch.float32)
labels = torch.tensor(batch_y_test, dtype=torch.long)
# Inference only: do not record an autograd graph for this forward pass.
with torch.no_grad():
    outputs = net(articles)
# Predicted class = index of the largest score in each row.
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
# .item() turns the 0-dim count tensor into a plain int, so the percentage
# below is ordinary Python arithmetic.
correct += (predicted == labels).sum().item()

# The number of articles is taken from `total` instead of being hard-coded.
print('Accuracy of the network on the %d test articles: %d %%' % (total, 100 * correct / total))

Accuracy of the network on the 1180 test articles: 95 %


### Generate set of counterfactuals

In [15]:
from mlexplain.sce.util import choose_k_top_elements_softmax

In [16]:
# Select K=10 rows of X_train for target class 0 with the project helper
# (presumably the rows with the highest softmax score for that class — confirm
# in mlexplain.sce.util). Nothing is displayed by this cell.
X_class_0 = choose_k_top_elements_softmax(net, X_train, K=10, target_class=0)

In [17]:
from mlexplain.sce.text_sce import TextSCE

### Experiment 1. LR=0.1, lambda_coef=0, mu_coef=0

In [18]:
# Explainer for the trained network and target class 0. index2word is passed
# along, presumably so results can be reported as words rather than column
# indices (confirm in TextSCE).
sce = TextSCE(net, target_class=0, index2word=index2word)

In [19]:
# Both penalty coefficients are zero here, which matches the recorded log: the
# second and third cost components stay at 0.0 and the total equals the first.
sce.fit(X_class_0, lambda_coef=0.0, mu_coef=0.0, n_iter=30, verbose_every_iterations=5, lr=0.1, force_masks_init=False)

[0/30] Cost: 248.14566040039062 [248.14566040039062, 0.0, 0.0]
[5/30] Cost: 161.62733459472656 [161.62733459472656, 0.0, 0.0]
[10/30] Cost: 89.24542999267578 [89.24542999267578, 0.0, 0.0]
[15/30] Cost: 33.3050651550293 [33.3050651550293, 0.0, 0.0]
[20/30] Cost: -17.490846633911133 [-17.490846633911133, 0.0, 0.0]
[25/30] Cost: -69.3678207397461 [-69.3678207397461, 0.0, 0.0]


<mlexplain.sce.text_sce.TextSCE at 0x7f2191c5ea20>

In [22]:
# Five top words per mask; the recorded output holds 10 lists, presumably one
# per row of X_class_0 (confirm in TextSCE).
# NOTE(review): execution count 22 precedes the In [21] cell that follows —
# these cells were run out of order.
sce.top_k_words_all_masks(k=5)

[['video', 'graphics', 'algorithm', 'rgb', '256'],
 ['video', 'algorithm', 'graphics', 'rgb', '256'],
 ['video', 'rgb', 'algorithm', '256', 'graphics'],
 ['video', 'rgb', 'algorithm', '256', 'graphics'],
 ['graphics', 'vga', 'algorithm', 'package', 'earth\norganization:'],
 ['graphics', 'image', 'package', 'earth\norganization:', 'vga'],
 ['graphics', 'video', 'algorithm', 'output', 'vga'],
 ['graphics', 'image', 'package', 'algorithm', 'rumours'],
 ['graphics', 'image', 'rumours', 'earth\norganization:', 'package'],
 ['video', 'rgb', '256', 'algorithm', 'output']]

In [21]:
# Overall top words (presumably aggregated over all masks — confirm in
# TextSCE). Called without k, the recorded output lists 10 words.
sce.top_k_words()

['graphics',
 'video',
 'algorithm',
 'image',
 'rgb',
 'vga',
 'output',
 '256',
 'package',
 'rumours']

### Experiment 2. LR=0.1, lambda_coef=0.05, mu_coef=1.0

In [25]:
# Separate explainer object for Experiment 2; same network and target class as
# in Experiment 1.
sce2 = TextSCE(net, target_class=0, index2word=index2word)

In [32]:
# Same data as Experiment 1, now with both penalty terms switched on.
# NOTE(review): the recorded log starts at iteration 0 with a negative first
# component and non-zero penalty components, whereas the fits in Experiment 3
# (same coefficients) start at [x, 0.0, 0.0]. Together with the execution
# counts (sce2 created at 25, fitted at 32) this suggests the log comes from a
# re-run on an already fitted object — confirm and re-run on a fresh sce2.
sce2.fit(X_class_0, lambda_coef=0.05, mu_coef=1.0, n_iter=30, verbose_every_iterations=5, lr=0.1, force_masks_init=False)

[0/30] Cost: 21.486618041992188 [-51.99699020385742, 68.87285614013672, 4.610751152038574]
[5/30] Cost: -1.4232783317565918 [-83.805908203125, 77.12258911132812, 5.260040760040283]
[10/30] Cost: -23.11009979248047 [-114.20014953613281, 85.22518157958984, 5.864867687225342]
[15/30] Cost: -42.88528060913086 [-142.19064331054688, 92.87776947021484, 6.427592754364014]
[20/30] Cost: -61.957679748535156 [-169.232666015625, 100.3377914428711, 6.937195301055908]
[25/30] Cost: -81.04022979736328 [-196.06307983398438, 107.60064697265625, 7.4222025871276855]


<mlexplain.sce.text_sce.TextSCE at 0x7f218fd11438>

In [33]:
# Five top words per mask for Experiment 2 (10 lists in the recorded output).
sce2.top_k_words_all_masks(k=5)

[['video', 'graphics', 'algorithm', 'rgb', '256'],
 ['video', 'graphics', 'algorithm', 'rgb', '256'],
 ['video', 'rgb', 'algorithm', 'graphics', '256'],
 ['video', 'rgb', 'graphics', 'algorithm', '256'],
 ['graphics', 'video', 'algorithm', 'vga', 'package'],
 ['graphics', 'video', 'algorithm', 'vga', 'image'],
 ['graphics', 'video', 'algorithm', 'rgb', 'vga'],
 ['graphics', 'video', 'algorithm', 'image', 'vga'],
 ['graphics', 'video', 'algorithm', 'image', 'rgb'],
 ['video', 'rgb', 'algorithm', 'graphics', '256']]

In [35]:
# Overall ten top words for Experiment 2, for comparison with Experiment 1.
sce2.top_k_words(k=10)

['graphics',
 'video',
 'algorithm',
 'rgb',
 '256',
 'vga',
 'output',
 'image',
 'color',
 'rumours']

### Experiment 3. Top-10 words to remove. LR=0.1, lambda_coef=0.05, mu_coef=1.0

In [41]:
# Value passed as K= to the selection helper for every class.
K = 10
# One fitted explainer per class, stored in class-index order.
sces = []
# Iterate over all classes of the network instead of a hard-coded 3.
for target_class in range(num_classes):
    print(f"Processing class {target_class}")
    # K was defined above but the literal 10 was repeated here; use the name.
    X_class_target = choose_k_top_elements_softmax(net, X_train, K=K, target_class=target_class)
    sce = TextSCE(net, target_class=target_class, index2word=index2word)
    sce.fit(X_class_target, lambda_coef=0.05, mu_coef=1.0, n_iter=30, verbose_every_iterations=10, lr=0.1, force_masks_init=False)
    sces.append(sce)

Processing class 0
[0/30] Cost: 162.37269592285156 [162.37269592285156, 0.0, 0.0]
[10/30] Cost: 102.63162231445312 [86.31446838378906, 15.746478080749512, 0.5706777572631836]
[20/30] Cost: 62.11653518676758 [33.674591064453125, 27.293502807617188, 1.148440957069397]
Processing class 1
[0/30] Cost: 117.89676666259766 [117.89676666259766, 0.0, 0.0]
[10/30] Cost: 49.15977478027344 [32.2730712890625, 16.23055076599121, 0.6561494469642639]
[20/30] Cost: 0.20409274101257324 [-29.9215145111084, 29.08087730407715, 1.0447299480438232]
Processing class 2
[0/30] Cost: 131.72915649414062 [131.72915649414062, 0.0, 0.0]
[10/30] Cost: 120.14290618896484 [112.7553939819336, 6.968369483947754, 0.41914474964141846]
[20/30] Cost: 98.84241485595703 [83.41912841796875, 14.365397453308105, 1.057888388633728]


In [53]:
# For every fitted explainer (in class order) print a header line followed by
# its ten top words on the next line.
for target_class, sce in enumerate(sces):
    print(
        f"Target class: {target_class}. Top words:",
        sce.top_k_words(k=10),
        sep="\n",
    )

Target class: 0. Top words:
['graphics', 'video', 'algorithm', 'vga', 'rgb', 'image', 'output', '256', 'package', 'rumours']
Target class: 1. Top words:
['baseball', 'jewish', 'phillies', 'sox', 'stats', 'pitcher', 'fan', 'ryan', 'win', 'yankee']
Target class: 2. Top words:
['planets', 'orbit', 'observatory', 'sci\nlines:', 'prb@access.digex.com', 'temporary', 'moon', 'nasa', 'orbit,', '(pat)\nsubject:']


In [62]:
# Print the index and the full text of every training document whose
# lower-cased text contains 'sci\nlines', i.e. "sci" directly followed by a
# line break and "lines".
for idx, x in enumerate(data_train):
    if 'sci\nlines' not in x.lower():
        continue
    print(idx, x)

23 Organization: ESOC European Space Operations Centre
From: <TNEDDERH@ESOC.BITNET>
Subject: Re: Apollo Training in Iceland
Distribution: sci
Lines: 10

The Apollo astronauts also trained at (in) Meteor Crater in the Flagstaff
area (Arizona).  There is now a museum with a space shop.
Caution: they ease you by 6$. Compared to a KSC visit it's not worth.

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Thorsten Nedderhut             |  Disclaimer:
mbp Software & Systems GmbH    |
c/o ESA/ESOC/FCSD/OAD/STB      |  Neither ESA nor mbp is responsible
Darmstadt, Germany             |  for my postings!
tnedderh@esoc.bitnet           |

82 From: tholen@galileo.ifa.hawaii.edu (Dave Tholen)
Subject: Re: New planet/Kuiper object found?
Organization: University of Hawaii
Distribution: sci
Lines: 18

Francisco da Fonseca Rodrigues writes:

> 	Tonigth a TV journal here in Brasil announced that an object,
> beyond Pluto's orbit, was found by an observatory at Hawaii. They
> na

In [63]:
# Document 23 is the first hit of the search above. Its header contains
# "Distribution: sci" followed by a line break and "Lines: 10"; because the
# tokenizer splits on ' ' only, "sci\nLines:" stays together as one token.
print(data_train[23])

Organization: ESOC European Space Operations Centre
From: <TNEDDERH@ESOC.BITNET>
Subject: Re: Apollo Training in Iceland
Distribution: sci
Lines: 10

The Apollo astronauts also trained at (in) Meteor Crater in the Flagstaff
area (Arizona).  There is now a museum with a space shop.
Caution: they ease you by 6$. Compared to a KSC visit it's not worth.

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Thorsten Nedderhut             |  Disclaimer:
mbp Software & Systems GmbH    |
c/o ESA/ESOC/FCSD/OAD/STB      |  Neither ESA nor mbp is responsible
Darmstadt, Germany             |  for my postings!
tnedderh@esoc.bitnet           |

