In [1]:
import os
# Set the log-level variable before `ktrain` is imported so it is already in the
# environment at import time.
# NOTE(review): '2' presumably silences TensorFlow's native INFO/WARNING messages,
# and ktrain presumably pulls in TensorFlow on import — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import ktrain
from sklearn.datasets import fetch_20newsgroups

In [2]:
# The four newsgroups used as classification targets.
class_names = ['alt.atheism', 'comp.sys.ibm.pc.hardware', 'sci.space', 'talk.politics.guns']

# Options shared by both splits: strip headers/footers/quoted replies, restrict to
# the chosen categories, and keep the documents in their stored order.
newsgroup_options = dict(
    remove=['headers', 'footers', 'quotes'],
    categories=class_names,
    shuffle=False,
)
train = fetch_20newsgroups(subset='train', **newsgroup_options)
test = fetch_20newsgroups(subset='test', **newsgroup_options)

In [3]:
# Turn the raw documents and integer targets into model-ready train/test data,
# and keep the fitted preprocessor for later use on new text.
train_ds, test_ds, preproc = ktrain.text.texts_from_array(
    x_train=train.data,
    y_train=train.target,
    x_test=test.data,
    y_test=test.target,
    preprocess_mode='standard',
    class_names=class_names,
    maxlen=600,  # sequence length of the resulting x_train / x_test arrays
)

language: en
Word Counts: 30716
Nrows: 2209
2209 train sequences
train sequence lengths:
	mean : 187
	95percentile : 600
	99percentile : 1892
x_train shape: (2209,600)
y_train shape: (2209, 4)
Is Multi-Label? False
1469 test sequences
test sequence lengths:
	mean : 152
	95percentile : 492
	99percentile : 1144
x_test shape: (1469,600)
y_test shape: (1469, 4)
task: text classification


In [4]:
# Build the 'nbsvm' text classifier from the preprocessed training data.
model = ktrain.text.text_classifier(
    'nbsvm',
    train_data=train_ds,
    preproc=preproc,
)

Is Multi-Label? False
compiling word ID features...
maxlen is 600
building document-term matrix... this may take a few moments...
rows: 1-2209
computing log-count ratios...
done.


In [5]:
# Wrap the model and its training data in a ktrain learner.
# NOTE(review): no validation set is passed here (test_ds is unused in the visible
# cells), so training reports only training loss/accuracy — confirm this is intended.
learner = ktrain.get_learner(
    model,
    train_data=train_ds,
    batch_size=64,
)

In [6]:
# Train for 5 epochs with a triangular learning-rate policy peaking at 0.02.
learner.autofit(lr=2e-2, epochs=5, verbose=2)



begin training using triangular learning rate policy with max lr of 0.02...
Epoch 1/5
35/35 - 2s - loss: 0.5141 - accuracy: 0.8728 - 2s/epoch - 60ms/step
Epoch 2/5
35/35 - 0s - loss: 0.1799 - accuracy: 0.9597 - 147ms/epoch - 4ms/step
Epoch 3/5
35/35 - 0s - loss: 0.1359 - accuracy: 0.9688 - 160ms/epoch - 5ms/step
Epoch 4/5
35/35 - 0s - loss: 0.1128 - accuracy: 0.9742 - 171ms/epoch - 5ms/step
Epoch 5/5
35/35 - 0s - loss: 0.0982 - accuracy: 0.9751 - 151ms/epoch - 4ms/step


<keras.callbacks.History at 0x7f401fdb7e20>

In [7]:
# Pair the trained model with its preprocessor so that raw text can be handed
# straight to the predictor in the cells below.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc)

In [8]:
# Show the true newsgroup of test document 12, then explain the model's prediction for it.
doc_idx = 12
true_label = class_names[test.target[doc_idx]]
print('Ground truth:', true_label)
predictor.explain(test.data[doc_idx])

Ground truth: sci.space


Contribution?,Feature
2.502,Highlighted in text (sum)
-0.826,<BIAS>


In [9]:
# Show the true newsgroup of test document 5, then explain the model's prediction for it.
doc_idx = 5
true_label = class_names[test.target[doc_idx]]
print('Ground truth:', true_label)
predictor.explain(test.data[doc_idx])

Ground truth: comp.sys.ibm.pc.hardware


Contribution?,Feature
5.941,Highlighted in text (sum)
-0.542,<BIAS>


In [10]:
# Show the true newsgroup of test document 32, then explain the model's prediction for it.
doc_idx = 32
true_label = class_names[test.target[doc_idx]]
print('Ground truth:', true_label)
predictor.explain(test.data[doc_idx])

Ground truth: alt.atheism


Contribution?,Feature
18.32,Highlighted in text (sum)
-0.484,<BIAS>


In [11]:
# Show the true newsgroup of test document 98, then explain the model's prediction for it.
doc_idx = 98
true_label = class_names[test.target[doc_idx]]
print('Ground truth:', true_label)
predictor.explain(test.data[doc_idx])

Ground truth: talk.politics.guns


Contribution?,Feature
1.486,Highlighted in text (sum)
-0.175,<BIAS>


In [12]:
# Explain the prediction for a passage that is not part of the dataset.
# Source: https://en.wikipedia.org/wiki/Space_exploration
space_passage = 'Space exploration is the use of astronomy and space technology to explore outer space. While the exploration of space is carried out mainly by astronomers with telescopes, its physical exploration though is conducted both by unmanned robotic space probes and human spaceflight. Space exploration, like its classical form astronomy, is one of the main sources for space science. '
predictor.explain(space_passage)

Contribution?,Feature
16.58,Highlighted in text (sum)
-0.32,<BIAS>


In [13]:
# Explain the prediction for a passage that is not part of the dataset.
# Source: https://en.wikipedia.org/wiki/Gun
gun_passage = 'A gun is a ranged weapon designed to use a shooting tube (gun barrel) to launch typically solid projectiles, but can also project pressurized liquid (e.g. water guns/cannons, spray guns for painting or pressure washing, projected water disruptors, and technically also flamethrowers), gas (e.g. light-gas gun) or even charged particles (e.g. plasma gun[disambiguation needed]). Solid projectiles may be free-flying (as with bullets and artillery shells) or tethered (as with Taser guns, spearguns and harpoon guns). A large-caliber gun is also referred to as a cannon.'
predictor.explain(gun_passage)

Contribution?,Feature
6.298,Highlighted in text (sum)
-0.393,<BIAS>


In [14]:
# Explain the prediction for a passage that is not part of the dataset.
# Source: https://en.wikipedia.org/wiki/AMD_Phenom
phenom_passage = 'Phenom is the 64-bit AMD desktop processor line based on the K10 microarchitecture, in what AMD calls family 10h (10 hex, i.e. 16 in normal decimal numbers) processors, sometimes incorrectly called "K10h". Triple-core versions (codenamed Toliman) belong to the Phenom 8000 series and quad cores (codenamed Agena) to the AMD Phenom X4 9000 series. The first processor in the family was released in 2007. '
predictor.explain(phenom_passage)

Contribution?,Feature
5.292,Highlighted in text (sum)
-0.498,<BIAS>
