In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
text = pd.Series(['Hello World!', 'hello world', 'cat', 'world of dogs'])

print(text)

0     Hello World!
1      hello world
2              cat
3    world of dogs
dtype: object


#### Wörter in Zahlen umwandeln

In [3]:
# one_hot(list, size_of_vocabulary)

vocab = text.str.lower().apply(str.split)
voc = []
for i in vocab:
    voc += i
voc = np.array(voc)

print(vocab)
print(voc)
print(len(np.unique(voc)))

vocab_size = len(np.unique(voc))
print(text)
text_indices = text.apply(one_hot, args=[25])
print(text_indices)

0      [hello, world!]
1       [hello, world]
2                [cat]
3    [world, of, dogs]
dtype: object
['hello' 'world!' 'hello' 'world' 'cat' 'world' 'of' 'dogs']
6
0     Hello World!
1      hello world
2              cat
3    world of dogs
dtype: object
0        [6, 24]
1        [6, 24]
2           [10]
3    [24, 1, 19]
dtype: object


#### Alle Sequenzen auf die gleiche Länge bringen

In [4]:
print(text.str.split().apply(len).max())
print(text_indices.apply(len).max())

3
3


In [5]:
maxlen = text_indices.apply(len).max()

padded_indices = pad_sequences(text_indices, maxlen=maxlen, padding='pre')

print(padded_indices)

[[ 0  6 24]
 [ 0  6 24]
 [ 0  0 10]
 [24  1 19]]


In [6]:
# define documents
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']
# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])

In [7]:
docs_indices = [one_hot(t, 200) for t in docs]
print(docs_indices)

[[95, 191], [37, 127], [187, 115], [85, 127], [138], [140], [94, 115], [122, 37], [94, 127], [59, 61, 191, 68]]


In [8]:
maxlen = 0
for i in docs_indices:
    if len(i) > maxlen:
        maxlen = len(i)
print(maxlen)

padded_indices = pad_sequences(docs_indices, maxlen=maxlen, padding='pre')

print(padded_indices)

4
[[  0   0  95 191]
 [  0   0  37 127]
 [  0   0 187 115]
 [  0   0  85 127]
 [  0   0   0 138]
 [  0   0   0 140]
 [  0   0  94 115]
 [  0   0 122  37]
 [  0   0  94 127]
 [ 59  61 191  68]]


In [9]:
import tensorflow as tf

config = tf.compat.v1.ConfigProto(gpu_options=tf.compat.v1.GPUOptions(
                                    per_process_gpu_memory_fraction=0.8)
                                  # device_count = {'GPU': 1}
                                  )
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

#### Model erstellen

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding

In [11]:
model = Sequential()

model.add(Embedding(200 + 1, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 8)              1608      
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 1,641
Trainable params: 1,641
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
history = model.fit(padded_indices, labels, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
loss, acc = model.evaluate(padded_indices, labels)
print('Accuracy:', acc)

Accuracy: 0.8999999761581421
