<a href="https://colab.research.google.com/github/imTheDevil/healthcare_costs_calculator/blob/main/sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing libraries, modules
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# getting data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [32]:
train_dataset = pd.read_csv(train_file_path, sep='\t', header = None)
test_dataset = pd.read_csv(test_file_path, sep='\t', header = None)
train_dataset.tail()

Unnamed: 0,0,1
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...
4178,ham,tell my bad character which u dnt lik in me. ...


In [5]:
#Replacing 'ham' with 0 and 'spam' with 1 in both test and train datasets
train_dataset[0] = train_dataset[0].replace("ham", 0)
train_dataset[0] = train_dataset[0].replace("spam", 1)
test_dataset[0] = test_dataset[0].replace("ham", 0)
test_dataset[0] = test_dataset[0].replace("spam", 1)

In [6]:
test_dataset.head()

Unnamed: 0,0,1
0,0,i am in hospital da. . i will return home in e...
1,0,"not much, just some textin'. how bout you?"
2,0,i probably won't eat at all today. i think i'm...
3,0,don‘t give a flying monkeys wot they think and...
4,0,who are you seeing?


In [7]:
#Makes tensor slices from the dataset
train_data = tf.data.Dataset.from_tensor_slices((train_dataset[1], train_dataset[0]))
test_data = tf.data.Dataset.from_tensor_slices((test_dataset[1], test_dataset[0]))

In [8]:
list(train_data.as_numpy_iterator())[:5]

[(b'ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it.',
  0),
 (b'you can never do nothing', 0),
 (b'now u sound like manky scouse boy steve,like! i is travelling on da bus home.wot has u inmind 4 recreation dis eve?',
  0),
 (b'mum say we wan to go then go... then she can shun bian watch da glass exhibition...',
  0),
 (b'never y lei... i v lazy... got wat? dat day \xc3\xbc send me da url cant work one...',
  0)]

In [9]:
#Splitting strings to tokens
tokenizer = tfds.deprecated.text.Tokenizer()

#Making a vocabulary set
vocabulary_set = set()
for text_tensor, label in train_data.concatenate(test_data):
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

8741

In [10]:
#Creating an encoder object
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

In [11]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


train_dataset_encoded = train_data.map(encode_map_fn)
test_dataset_encoded = test_data.map(encode_map_fn)

In [31]:
#Example for encoding
for train_example, train_label in train_dataset_encoded.take(2):
  print('Encoded text:', train_example[:10].numpy())
  print('Label:', train_label.numpy())

Encoded text: [8106 3912 6544 5723 7501 1667 7507 8408 5429 7352]
Label: 0
Encoded text: [  77 4294 2498 8520 2146]
Label: 0


In [13]:
BUFFER_SIZE = 1000

train_batches = (
    train_dataset_encoded
    .shuffle(BUFFER_SIZE)
    .padded_batch(32))

test_batches = (
    test_dataset_encoded
    .padded_batch(32))

In [14]:
model = keras.Sequential([
  keras.layers.Embedding(encoder.vocab_size, 16),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(1, activation='sigmoid')])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          139888    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 139,905
Trainable params: 139,905
Non-trainable params: 0
_________________________________________________________________


In [15]:
# train model
history = model.fit(train_batches,
                    epochs=10,
                    validation_data=test_batches,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# evaluate model
loss, accuracy = model.evaluate(test_batches)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.12099482864141464
Accuracy:  0.954741358757019


In [29]:
# function to predict messages based on model
# returns [x, y] where y being 'ham' or 'spam' and x being their respective probabilities
def predict_message(pred_text):
  encoded_pred_text = encoder.encode(pred_text)
  encoded_pred_text = tf.cast(encoded_pred_text, tf.float32)
  prediction = model.predict(tf.expand_dims(encoded_pred_text, tf.constant(0))).tolist()
  prediction = prediction[0]
  result = []
  if prediction[0] < .5:
    result.append(1-prediction[0])
    result.append("ham")
  else:
    result.append(prediction[0])
    result.append("spam")
  return result

pred_text = "sms WIN to win 1000 now. offer limited time only"

prediction = predict_message(pred_text)
print(prediction)

[0.9969822764396667, 'spam']
