In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

import os
import pathlib

In [2]:
data_path = keras.utils.get_file(
    'news20.tar.gz',
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",    untar=True
)

In [3]:
data_dir = pathlib.Path(data_path).parent / "20_newsgroup"

In [4]:
dirnames = os.listdir(data_dir)

In [5]:
print('no. of directories: {}'.format(len(dirnames)))
print('directory names:')
for d in dirnames:
    print('\t', d)

no. of directories: 20
directory names:
	 comp.os.ms-windows.misc
	 misc.forsale
	 talk.religion.misc
	 alt.atheism
	 soc.religion.christian
	 talk.politics.guns
	 sci.med
	 rec.sport.hockey
	 sci.electronics
	 sci.space
	 comp.sys.ibm.pc.hardware
	 rec.motorcycles
	 comp.graphics
	 talk.politics.misc
	 comp.sys.mac.hardware
	 rec.autos
	 sci.crypt
	 comp.windows.x
	 rec.sport.baseball
	 talk.politics.mideast


In [6]:
fnames = os.listdir(data_dir / "comp.graphics")
print("no. of files in comp.graphics: {}".format(len(fnames)))
print("some example filenames: ")
for f in fnames[:5]:
    print('\t', f)

no. of files in comp.graphics: 1000
some example filenames: 
	 38543
	 39488
	 38348
	 39659
	 38949


In [7]:
with open(data_dir / "comp.graphics" / "38987") as f:
    print(f.read())

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



In [8]:
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))


Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [9]:
print('length of samples: {}'.format(len(samples)))
print('a sample: ', samples[2])

length of samples: 19997
a sample:  
mam@mouse.cmhnet.org (Mike McAngus) writes:

>Let me see if I understand what you are saying.  In order to talk 
>knowledgeably about religion, Atheists must first have been so immersed 
>in a religion that only the rare individual could have left.  

No, you don't understand.  I said that I don't think people can discuss
the subjective merits of religion objectively.  This should be obvious.
People here have said that everyone would be better off without religion,
but this almost certainly isn't true.

>>But really, are you threatened by the motto, or by the people that use it?
>The motto is a tool.  Let's try to take away the tool.

But, guns and axes are tools, both of which have been used for murder.
Should both be taken away?  That is to say, I don't think motto misuse
warrants its removal.  At least not in this case.

keith



In [10]:
# Shuffle the data
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
train_samples = samples[:-num_validation_samples]
val_samples = samples[-num_validation_samples:]
train_labels = labels[:-num_validation_samples]
val_labels = labels[-num_validation_samples:]


In [11]:
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


# dummy_data = ["my name is james", "his name is jennifer", "i eat pickles"]
# dummy_labels = [1, 1, 0]
# dummy_tensors = tf.data.Dataset.from_tensor_slices((dummy_data, dummy_labels))

# vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)

# vectorizer.adapt(dummy_tensors.map(lambda text, label: text))
# # vectorizer.adapt(dummy_tensors)
# vectorizer.get_vocabulary()

In [12]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)



In [13]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'of']

In [14]:
vectorizer([["the cat sat on the mat"]])

<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
array([[   2, 3705, 1691,   15,    2, 5371,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [15]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [16]:
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'and': 6,
 'in': 7,
 'is': 8,
 'i': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'you': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'not': 17,
 'are': 18,
 'have': 19,
 'with': 20,
 'as': 21,
 'or': 22,
 'if': 23,
 'was': 24,
 'but': 25,
 'they': 26,
 'from': 27,
 'by': 28,
 'at': 29,
 'an': 30,
 'my': 31,
 'what': 32,
 'can': 33,
 'would': 34,
 'all': 35,
 'will': 36,
 'there': 37,
 'one': 38,
 'do': 39,
 'writes': 40,
 'about': 41,
 'we': 42,
 'so': 43,
 'he': 44,
 'has': 45,
 'your': 46,
 'no': 47,
 'article': 48,
 'any': 49,
 'me': 50,
 'some': 51,
 'who': 52,
 'which': 53,
 'were': 54,
 'its': 55,
 'out': 56,
 'dont': 57,
 'people': 58,
 'when': 59,
 'like': 60,
 'more': 61,
 'just': 62,
 'their': 63,
 '1': 64,
 'know': 65,
 'other': 66,
 'them': 67,
 'up': 68,
 'how': 69,
 'only': 70,
 'get': 71,
 'had': 72,
 'than': 73,
 'x': 74,
 'been': 75,
 'lines': 76,
 'think': 77,
 'his': 78,
 '2': 79,
 'also': 80,
 'does': 81,
 'then': 

In [17]:
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[w] for w in test]


[2, 3705, 1691, 15, 2, 5371]

In [18]:
path_to_glove_file = os.path.join('/embeddings', 'glove.twitter.27B.100d.txt')

In [19]:
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

In [20]:
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 1193514 word vectors.


In [21]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim)) # provide a vector of zeros where the word doesn't exist
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        print(i)
        misses += 1

print("converted {} words ({} misses)".format(hits, misses))

0
1
64
79
99
114
130
134
148
155
203
204
208
244
246
247
298
319
324
327
349
372
386
421
447
480
489
497
498
502
512
514
519
544
596
599
692
733
744
811
816
838
843
872
939
963
972
1005
1052
1115
1120
1124
1184
1194
1218
1332
1351
1353
1358
1361
1385
1386
1410
1416
1423
1432
1437
1485
1515
1555
1575
1578
1580
1591
1600
1629
1639
1654
1677
1690
1711
1726
1749
1754
1791
1795
1806
1807
1815
1832
1857
1901
1903
1908
1909
1916
1939
1946
1968
1985
1998
2007
2086
2088
2110
2126
2172
2195
2196
2218
2239
2261
2300
2313
2327
2338
2339
2340
2376
2381
2411
2412
2427
2444
2471
2472
2476
2508
2546
2556
2576
2611
2614
2637
2655
2689
2690
2705
2715
2716
2735
2754
2755
2756
2772
2773
2789
2792
2817
2837
2884
2937
2946
2947
2948
2984
2985
3030
3050
3051
3052
3055
3066
3085
3097
3114
3115
3116
3135
3154
3171
3188
3198
3217
3248
3249
3265
3266
3286
3356
3357
3375
3399
3414
3416
3417
3434
3463
3467
3474
3488
3500
3511
3512
3513
3518
3528
3584
3637
3660
3682
3710
3712
3732
3736
3772
3810
3832
3850
3862
3863

15113
15124
15133
15134
15156
15168
15173
15174
15179
15182
15183
15187
15189
15210
15215
15226
15241
15242
15249
15250
15252
15255
15256
15263
15272
15276
15289
15290
15297
15298
15306
15311
15313
15315
15323
15324
15326
15336
15343
15344
15356
15357
15358
15360
15374
15375
15377
15382
15385
15386
15387
15392
15403
15409
15425
15438
15450
15466
15471
15472
15478
15481
15485
15515
15537
15548
15549
15561
15563
15565
15566
15580
15594
15604
15618
15637
15648
15655
15662
15673
15678
15680
15685
15686
15689
15696
15725
15739
15740
15741
15742
15743
15744
15745
15746
15747
15748
15749
15750
15751
15752
15753
15754
15755
15756
15757
15758
15759
15760
15761
15762
15763
15764
15765
15766
15767
15768
15769
15770
15771
15772
15773
15774
15775
15776
15777
15778
15779
15780
15781
15782
15783
15784
15785
15786
15787
15788
15789
15790
15791
15792
15793
15794
15795
15796
15797
15799
15802
15803
15804
15806
15807
15808
15813
15833
15838
15852
15873
15877
15890
15895
15896
15897
15898
15914
15920
1592

In [22]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'and': 6,
 'in': 7,
 'is': 8,
 'i': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'you': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'not': 17,
 'are': 18,
 'have': 19,
 'with': 20,
 'as': 21,
 'or': 22,
 'if': 23,
 'was': 24,
 'but': 25,
 'they': 26,
 'from': 27,
 'by': 28,
 'at': 29,
 'an': 30,
 'my': 31,
 'what': 32,
 'can': 33,
 'would': 34,
 'all': 35,
 'will': 36,
 'there': 37,
 'one': 38,
 'do': 39,
 'writes': 40,
 'about': 41,
 'we': 42,
 'so': 43,
 'he': 44,
 'has': 45,
 'your': 46,
 'no': 47,
 'article': 48,
 'any': 49,
 'me': 50,
 'some': 51,
 'who': 52,
 'which': 53,
 'were': 54,
 'its': 55,
 'out': 56,
 'dont': 57,
 'people': 58,
 'when': 59,
 'like': 60,
 'more': 61,
 'just': 62,
 'their': 63,
 '1': 64,
 'know': 65,
 'other': 66,
 'them': 67,
 'up': 68,
 'how': 69,
 'only': 70,
 'get': 71,
 'had': 72,
 'than': 73,
 'x': 74,
 'been': 75,
 'lines': 76,
 'think': 77,
 'his': 78,
 '2': 79,
 'also': 80,
 'does': 81,
 'then': 

In [24]:
embeddings_index.get('hello')

array([ 0.55793  ,  0.10748  , -0.57491  ,  0.4877   , -0.37792  ,
       -0.036457 ,  1.0581   ,  0.059584 , -0.19582  , -0.41366  ,
        0.054969 ,  0.10674  , -2.7076   , -0.50818  , -0.47456  ,
        0.32746  ,  0.41643  , -0.53607  , -0.24822  , -0.63456  ,
       -0.075781 , -1.1904   , -0.72504  ,  0.19499  ,  0.029645 ,
       -0.98157  ,  0.27081  ,  0.32472  ,  0.51154  , -0.86702  ,
       -0.36342  ,  0.14098  , -0.44251  ,  0.24804  ,  0.14021  ,
       -0.042186 ,  0.10408  ,  0.23267  ,  0.26663  ,  0.40316  ,
       -0.91011  ,  0.049339 ,  0.14842  ,  0.70496  , -0.013448 ,
        0.35591  , -0.23494  , -0.83828  ,  0.0069803,  0.44702  ,
       -0.27031  ,  0.0032742,  0.13265  , -0.68583  ,  0.90147  ,
        0.60725  , -0.1849   ,  0.086123 , -0.1693   , -0.48741  ,
        0.33445  , -0.10119  , -0.054273 , -0.35999  , -0.48967  ,
       -0.36699  , -0.91001  , -0.38762  ,  0.14981  ,  0.14092  ,
        0.6064   , -0.2507   ,  0.1582   , -0.33841  , -0.0256

In [25]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False
)

In [26]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")

In [27]:
embedding_layer(int_sequences_input)

<KerasTensor: shape=(None, None, 100) dtype=float32 (created by layer 'embedding')>

In [28]:
def uncompiled_model():
    model = keras.Sequential([
        Embedding(num_tokens, embedding_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False),
        layers.Conv1D(128, 5, activation='relu'),
        layers.MaxPooling1D(5),
        layers.Conv1D(128, 5, activation="relu"),
        layers.MaxPooling1D(5),
        layers.Conv1D(128, 5, activation="relu"),
        layers.GlobalMaxPooling1D(),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(len(class_names), activation='softmax')
    ])
    
    return model

In [29]:
model = uncompiled_model()

In [30]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)


In [33]:
train_labels
labels

[10,
 15,
 18,
 5,
 15,
 8,
 17,
 12,
 0,
 17,
 11,
 12,
 3,
 10,
 5,
 14,
 0,
 15,
 5,
 10,
 9,
 0,
 4,
 2,
 9,
 19,
 10,
 17,
 0,
 10,
 10,
 15,
 3,
 1,
 12,
 17,
 14,
 6,
 4,
 16,
 10,
 14,
 15,
 15,
 4,
 19,
 9,
 0,
 15,
 13,
 17,
 10,
 8,
 7,
 10,
 2,
 9,
 15,
 18,
 10,
 4,
 14,
 7,
 19,
 6,
 10,
 7,
 9,
 4,
 18,
 7,
 0,
 3,
 15,
 5,
 18,
 7,
 15,
 3,
 17,
 18,
 9,
 17,
 2,
 15,
 13,
 9,
 4,
 7,
 3,
 8,
 1,
 9,
 14,
 1,
 16,
 3,
 14,
 15,
 17,
 12,
 19,
 13,
 3,
 2,
 9,
 2,
 10,
 14,
 9,
 18,
 5,
 2,
 19,
 7,
 10,
 15,
 19,
 5,
 15,
 4,
 13,
 10,
 1,
 10,
 4,
 17,
 11,
 2,
 17,
 6,
 3,
 16,
 11,
 1,
 9,
 0,
 18,
 19,
 0,
 8,
 6,
 13,
 5,
 13,
 9,
 3,
 3,
 8,
 16,
 12,
 16,
 2,
 2,
 7,
 7,
 8,
 16,
 6,
 9,
 1,
 15,
 15,
 1,
 9,
 1,
 16,
 1,
 9,
 6,
 6,
 6,
 3,
 11,
 14,
 2,
 8,
 0,
 19,
 16,
 14,
 6,
 10,
 17,
 11,
 18,
 2,
 6,
 1,
 13,
 3,
 11,
 15,
 14,
 10,
 5,
 2,
 1,
 5,
 18,
 3,
 0,
 5,
 16,
 19,
 2,
 3,
 15,
 3,
 3,
 5,
 5,
 12,
 1,
 6,
 19,
 5,
 15,
 14,
 16,
 10,
 7,
 5,
 

In [59]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc", keras.metrics.SparseCategoricalAccuracy()]
)
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f239ece6390>

In [72]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["I can't be certain where this is going"]]
)

class_names[np.argmax(probabilities[0])]


'misc.forsale'

In [73]:
np.argmax(probabilities[0])

6

In [74]:
probabilities[0]

array([0.02323702, 0.06950156, 0.07116769, 0.06658573, 0.03970981,
       0.09495005, 0.11668595, 0.03771921, 0.0624009 , 0.04748211,
       0.06385873, 0.01402348, 0.09036122, 0.06735771, 0.03721245,
       0.02118146, 0.02165251, 0.01740202, 0.02142825, 0.01608213],
      dtype=float32)