In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

import os
import pathlib

In [2]:
data_path = keras.utils.get_file(
    'news20.tar.gz',
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",    untar=True
)

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz


In [3]:
data_dir = pathlib.Path(data_path).parent / "20_newsgroup"

In [4]:
dirnames = os.listdir(data_dir)

In [5]:
print('no. of directories: {}'.format(len(dirnames)))
print('directory names:')
for d in dirnames:
    print('\t', d)

no. of directories: 20
directory names:
	 comp.os.ms-windows.misc
	 misc.forsale
	 talk.religion.misc
	 alt.atheism
	 soc.religion.christian
	 talk.politics.guns
	 sci.med
	 rec.sport.hockey
	 sci.electronics
	 sci.space
	 comp.sys.ibm.pc.hardware
	 rec.motorcycles
	 comp.graphics
	 talk.politics.misc
	 comp.sys.mac.hardware
	 rec.autos
	 sci.crypt
	 comp.windows.x
	 rec.sport.baseball
	 talk.politics.mideast


In [6]:
fnames = os.listdir(data_dir / "comp.graphics")
print("no. of files in comp.graphics: {}".format(len(fnames)))
print("some example filenames: ")
for f in fnames[:5]:
    print('\t', f)

no. of files in comp.graphics: 1000
some example filenames: 
	 38543
	 39488
	 38348
	 39659
	 38949


In [7]:
with open(data_dir / "comp.graphics" / "38987") as f:
    print(f.read())

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



In [8]:
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))


Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [11]:
print('length of samples: {}'.format(len(samples)))
print('a sample: ', samples[2])

length of samples: 19997
a sample:  
mam@mouse.cmhnet.org (Mike McAngus) writes:

>Let me see if I understand what you are saying.  In order to talk 
>knowledgeably about religion, Atheists must first have been so immersed 
>in a religion that only the rare individual could have left.  

No, you don't understand.  I said that I don't think people can discuss
the subjective merits of religion objectively.  This should be obvious.
People here have said that everyone would be better off without religion,
but this almost certainly isn't true.

>>But really, are you threatened by the motto, or by the people that use it?
>The motto is a tool.  Let's try to take away the tool.

But, guns and axes are tools, both of which have been used for murder.
Should both be taken away?  That is to say, I don't think motto misuse
warrants its removal.  At least not in this case.

keith



In [19]:
# Shuffle the data
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
train_samples = samples[:-num_validation_samples]
val_samples = samples[-num_validation_samples:]
train_labels = labels[:-num_validation_samples]
val_labels = labels[-num_validation_samples:]


In [20]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


dummy_data = ["my name is james", "his name is jennifer", "i eat pickles"]
dummy_labels = [1, 1, 0]
dummy_tensors = tf.data.Dataset.from_tensor_slices((dummy_data, dummy_labels))

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)

vectorizer.adapt(dummy_tensors.map(lambda text, label: text))
# vectorizer.adapt(dummy_tensors)
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'name',
 'is',
 'pickles',
 'my',
 'jennifer',
 'james',
 'i',
 'his',
 'eat']

In [19]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)



NameError: name 'train_samples' is not defined

In [21]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'of']

In [22]:
vectorizer([["the cat sat on the mat"]])

<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
array([[   2, 3705, 1691,   15,    2, 5371,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [24]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [26]:
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'and': 6,
 'in': 7,
 'is': 8,
 'i': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'you': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'not': 17,
 'are': 18,
 'have': 19,
 'with': 20,
 'as': 21,
 'or': 22,
 'if': 23,
 'was': 24,
 'but': 25,
 'they': 26,
 'from': 27,
 'by': 28,
 'at': 29,
 'an': 30,
 'my': 31,
 'what': 32,
 'can': 33,
 'would': 34,
 'all': 35,
 'will': 36,
 'there': 37,
 'one': 38,
 'do': 39,
 'writes': 40,
 'about': 41,
 'we': 42,
 'so': 43,
 'he': 44,
 'has': 45,
 'your': 46,
 'no': 47,
 'article': 48,
 'any': 49,
 'me': 50,
 'some': 51,
 'who': 52,
 'which': 53,
 'were': 54,
 'its': 55,
 'out': 56,
 'dont': 57,
 'people': 58,
 'when': 59,
 'like': 60,
 'more': 61,
 'just': 62,
 'their': 63,
 '1': 64,
 'know': 65,
 'other': 66,
 'them': 67,
 'up': 68,
 'how': 69,
 'only': 70,
 'get': 71,
 'had': 72,
 'than': 73,
 'x': 74,
 'been': 75,
 'lines': 76,
 'think': 77,
 'his': 78,
 '2': 79,
 'also': 80,
 'does': 81,
 'then': 

In [27]:
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[w] for w in test]


[2, 3705, 1691, 15, 2, 5371]

In [29]:
path_to_glove_file = os.path.join('.', 'glove.6B.100d.txt')

In [30]:
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

In [31]:
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400000 word vectors.


In [41]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim)) # provide a vector of zeros where the word doesn't exist
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        print(i)
        misses += 1

print("converted {} words ({} misses)".format(hits, misses))

0
1
148
208
1055
1353
1361
1386
1412
1578
1726
1807
1832
1917
1959
2039
2086
2110
2126
2172
2199
2381
2476
2508
2546
2614
2735
2817
2862
2932
2937
3051
3052
3066
3097
3136
3198
3213
3217
3265
3375
3399
3414
3500
3528
3637
3736
3778
3867
3934
3973
3991
4040
4069
4103
4248
4264
4329
4385
4387
4493
4587
4599
4603
4631
4668
4691
4785
4880
4897
4955
4966
5010
5015
5027
5034
5062
5077
5087
5125
5134
5154
5187
5220
5223
5232
5236
5237
5249
5307
5334
5348
5395
5396
5404
5426
5428
5430
5470
5475
5693
5747
5774
5798
5800
5805
5849
5860
5895
5898
5908
5919
5928
5944
5949
5950
5953
5994
6050
6052
6054
6069
6075
6076
6078
6117
6121
6140
6156
6199
6236
6245
6261
6323
6335
6345
6353
6359
6376
6378
6427
6438
6440
6453
6454
6487
6493
6504
6519
6597
6606
6615
6634
6686
6706
6732
6753
6754
6760
6762
6788
6810
6811
6819
6820
6845
6867
6893
6899
6900
6912
6940
6978
6981
7004
7011
7026
7027
7042
7045
7069
7091
7106
7122
7123
7140
7242
7249
7261
7306
7342
7344
7378
7385
7430
7439
7443
7461
7462
7478
7479
749

18086
18090
18093
18096
18097
18098
18099
18100
18101
18103
18104
18107
18115
18137
18138
18140
18141
18143
18148
18153
18156
18157
18158
18171
18172
18174
18176
18182
18198
18201
18206
18207
18209
18217
18220
18221
18246
18247
18249
18251
18252
18259
18262
18275
18296
18298
18300
18310
18315
18321
18322
18323
18357
18367
18369
18370
18371
18372
18376
18377
18384
18408
18413
18414
18415
18417
18423
18424
18429
18430
18440
18450
18451
18453
18456
18467
18473
18483
18488
18493
18507
18509
18510
18512
18515
18520
18537
18541
18546
18550
18555
18558
18561
18563
18574
18588
18589
18590
18596
18597
18599
18602
18604
18606
18610
18611
18612
18613
18614
18618
18619
18620
18622
18624
18626
18627
18628
18630
18631
18635
18638
18643
18646
18648
18657
18658
18662
18664
18665
18666
18671
18672
18673
18674
18675
18676
18677
18678
18679
18680
18681
18683
18684
18685
18686
18687
18688
18689
18691
18699
18701
18705
18713
18714
18717
18724
18732
18734
18744
18747
18748
18749
18750
18773
18778
18780
1878

In [44]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'and': 6,
 'in': 7,
 'is': 8,
 'i': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'you': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'not': 17,
 'are': 18,
 'have': 19,
 'with': 20,
 'as': 21,
 'or': 22,
 'if': 23,
 'was': 24,
 'but': 25,
 'they': 26,
 'from': 27,
 'by': 28,
 'at': 29,
 'an': 30,
 'my': 31,
 'what': 32,
 'can': 33,
 'would': 34,
 'all': 35,
 'will': 36,
 'there': 37,
 'one': 38,
 'do': 39,
 'writes': 40,
 'about': 41,
 'we': 42,
 'so': 43,
 'he': 44,
 'has': 45,
 'your': 46,
 'no': 47,
 'article': 48,
 'any': 49,
 'me': 50,
 'some': 51,
 'who': 52,
 'which': 53,
 'were': 54,
 'its': 55,
 'out': 56,
 'dont': 57,
 'people': 58,
 'when': 59,
 'like': 60,
 'more': 61,
 'just': 62,
 'their': 63,
 '1': 64,
 'know': 65,
 'other': 66,
 'them': 67,
 'up': 68,
 'how': 69,
 'only': 70,
 'get': 71,
 'had': 72,
 'than': 73,
 'x': 74,
 'been': 75,
 'lines': 76,
 'think': 77,
 'his': 78,
 '2': 79,
 'also': 80,
 'does': 81,
 'then': 

In [39]:
embeddings_index.get('hello')

array([ 0.26688  ,  0.39632  ,  0.6169   , -0.77451  , -0.1039   ,
        0.26697  ,  0.2788   ,  0.30992  ,  0.0054685, -0.085256 ,
        0.73602  , -0.098432 ,  0.5479   , -0.030305 ,  0.33479  ,
        0.14094  , -0.0070003,  0.32569  ,  0.22902  ,  0.46557  ,
       -0.19531  ,  0.37491  , -0.7139   , -0.51775  ,  0.77039  ,
        1.0881   , -0.66011  , -0.16234  ,  0.9119   ,  0.21046  ,
        0.047494 ,  1.0019   ,  1.1133   ,  0.70094  , -0.08696  ,
        0.47571  ,  0.1636   , -0.44469  ,  0.4469   , -0.93817  ,
        0.013101 ,  0.085964 , -0.67456  ,  0.49662  , -0.037827 ,
       -0.11038  , -0.28612  ,  0.074606 , -0.31527  , -0.093774 ,
       -0.57069  ,  0.66865  ,  0.45307  , -0.34154  , -0.7166   ,
       -0.75273  ,  0.075212 ,  0.57903  , -0.1191   , -0.11379  ,
       -0.10026  ,  0.71341  , -1.1574   , -0.74026  ,  0.40452  ,
        0.18023  ,  0.21449  ,  0.37638  ,  0.11239  , -0.53639  ,
       -0.025092 ,  0.31886  , -0.25013  , -0.63283  , -0.0118

In [45]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False
)

In [49]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")

In [51]:
embedding_layer(int_sequences_input)

<KerasTensor: shape=(None, None, 100) dtype=float32 (created by layer 'embedding')>

In [56]:
def uncompiled_model():
    model = keras.Sequential([
        Embedding(num_tokens, embedding_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False),
        layers.Conv1D(128, 5, activation='relu'),
        layers.MaxPooling1D(5),
        layers.Conv1D(128, 5, activation="relu"),
        layers.MaxPooling1D(5),
        layers.Conv1D(128, 5, activation="relu"),
        layers.GlobalMaxPooling1D(),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(len(class_names), activation='softmax')
    ])
    
    return model

In [57]:
model = uncompiled_model()

In [58]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)


In [59]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc", keras.metrics.SparseCategoricalAccuracy()]
)
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f239ece6390>

In [72]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["I can't be certain where this is going"]]
)

class_names[np.argmax(probabilities[0])]


'misc.forsale'

In [73]:
np.argmax(probabilities[0])

6

In [74]:
probabilities[0]

array([0.02323702, 0.06950156, 0.07116769, 0.06658573, 0.03970981,
       0.09495005, 0.11668595, 0.03771921, 0.0624009 , 0.04748211,
       0.06385873, 0.01402348, 0.09036122, 0.06735771, 0.03721245,
       0.02118146, 0.02165251, 0.01740202, 0.02142825, 0.01608213],
      dtype=float32)