<a href="https://colab.research.google.com/github/crow-intelligence/gettingStartedWithColab/blob/main/02_Using_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone

## Tensor

### Load the data

Let's load the vectorized training and test data from Google Drive.



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
import numpy as np
import string

# Load raw data

texts = []
labels = []
with open("/content/drive/My Drive/Colab Notebooks/hamsms.txt", "r") as infile:
  for l in infile:
    label, text = l.strip().split("\t")
    if label == "ham":
      labels.append(0)
    else:
      labels.append(1)
    text = "".join([ch.lower() for ch in text if ch not in string.punctuation])
    texts.append(text)

labels = np.asarray(labels)

### Preprocess data

In [5]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


tokenizer = Tokenizer(num_words=5000) # voccab size is 9661 -> 5000 should be enough
tokenizer.fit_on_texts(texts)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(texts, labels, random_state=42)

X_train_emb = tokenizer.texts_to_sequences(X_train_raw)
X_test_emb = tokenizer.texts_to_sequences(X_test_raw)

vocab_size = len(tokenizer.word_index) + 1

# print(sum([len(e) for e in texts])/len(texts)) -> 76.25008970218873, 100 should be enough for maxlen

maxlen = 100

X_train = pad_sequences(X_train_emb, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test_emb, padding='post', maxlen=maxlen)


### A very simple Neural Network

In [6]:
import keras
import tensorflow as tf

embedding_dim = 50
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size,embedding_dim))
model.add(keras.layers.GlobalMaxPool1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

# Create a callback that saves the model's weights

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath="/content/drive/My Drive/Colab Notebooks/checkpoints/weights.{epoch:02d}-{val_loss:.2f}.ckpt",
                                                 save_weights_only=True,
                                                 verbose=1)
model.fit(X_train, y_train,
          epochs=100,
          verbose=False,
          validation_data=(X_test, y_test),
          batch_size=16,
          callbacks=[cp_callback])
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
model.save("/content/drive/My Drive/Colab Notebooks/TFmodel")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          483100    
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 16)                816       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 483,933
Trainable params: 483,933
Non-trainable params: 0
_________________________________________________________________

Epoch 1: saving model to /content/drive/My Drive/Colab Notebooks/checkpoints/weights.01-0.32.ckpt

Epoch 2: saving 

In [9]:
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) > 0.5).astype("int32")
y_pred = [e[0] for e in y_pred]

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1203
           1       0.92      0.93      0.92       191

    accuracy                           0.98      1394
   macro avg       0.95      0.96      0.95      1394
weighted avg       0.98      0.98      0.98      1394

