In [9]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

plt.style.use(style="seaborn")
%matplotlib inline

In [10]:
df = pd.read_csv("https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv")

In [11]:
import re


def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r"", text)

def remove_html(text):
    html= re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [None]:
# import string

# 'This, is.%^&!@sx..¯ƒ˳œœœ˙¯ß¯¯ßð A test!'.translate(str.maketrans('', '', string.punctuation))

In [None]:
# sample_str = "Helßßðð´≠œlo %% Wo$#rl@d"

# # using isalnum()
# print("".join(k for k in sample_str if k.isalnum()))

In [13]:
df['text'] = df['text'].map(lambda x: remove_URL(x))
df['text'] = df.text.map(lambda x: remove_html(x))
df['text'] = df.text.map(lambda x: remove_emojis(x))

In [14]:
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)

In [15]:
df['text'] = df['text'].map(remove_stopwords)

In [16]:
df.text

0       tv future hands viewers home theatre systems p...
1       worldcom boss left books alone former worldcom...
2       tigers wary farrell gamble leicester say rushe...
3       yeading face newcastle fa cup premiership side...
4       ocean twelve raids box office ocean twelve cri...
                              ...                        
2220    cars pull us retail figures us retail sales fe...
2221    kilroy unveils immigration policy ex-chatshow ...
2222    rem announce new glasgow concert us band rem a...
2223    political squabbles snowball become commonplac...
2224    souness delight euro progress boss graeme soun...
Name: text, Length: 2225, dtype: object

In [17]:
from collections import Counter
# count unique words

def counter_word(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] += 1
    return count

In [18]:
sentences = df.text

counter = counter_word(sentences)

len(counter)

43620

In [20]:
counter

Counter({'tv': 471,
         'future': 290,
         'hands': 77,
         'viewers': 82,
         'home': 601,
         'theatre': 81,
         'systems': 101,
         'plasma': 14,
         'high-definition': 74,
         'tvs': 17,
         'digital': 404,
         'video': 332,
         'recorders': 20,
         'moving': 75,
         'living': 78,
         'room': 57,
         'way': 673,
         'people': 1970,
         'watch': 100,
         'radically': 10,
         'different': 249,
         'five': 470,
         'years': 770,
         'time.': 164,
         'according': 422,
         'expert': 28,
         'panel': 71,
         'gathered': 38,
         'annual': 168,
         'consumer': 212,
         'electronics': 80,
         'show': 520,
         'las': 31,
         'vegas': 25,
         'discuss': 59,
         'new': 1957,
         'technologies': 110,
         'impact': 118,
         'one': 1705,
         'favourite': 102,
         'pastimes.': 1,
         'us': 1786,

In [66]:
num_words = len(counter)
print(num_words)

# max number of words in a sequence
max_length = 20

43620


### Train-Test Split

In [21]:
train_size = int(df.shape[0] * 0.8)

train_sentences = df.text[: train_size]
train_labels = df.category[: train_size]

test_sentences = df.text[train_size:]
test_labels = df.category[train_size:]


In [22]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_sentences)

In [23]:
word_index = tokenizer.word_index

In [24]:
word_index

{'said': 1,
 'mr': 2,
 'would': 3,
 'year': 4,
 'also': 5,
 'people': 6,
 'new': 7,
 'us': 8,
 'one': 9,
 'could': 10,
 'last': 11,
 'first': 12,
 'time': 13,
 'two': 14,
 'government': 15,
 'world': 16,
 'uk': 17,
 'best': 18,
 'years': 19,
 'make': 20,
 'film': 21,
 'told': 22,
 'made': 23,
 'get': 24,
 'music': 25,
 'game': 26,
 'like': 27,
 'back': 28,
 'many': 29,
 '000': 30,
 'labour': 31,
 'three': 32,
 'well': 33,
 '1': 34,
 'next': 35,
 'bbc': 36,
 'take': 37,
 'set': 38,
 'number': 39,
 'added': 40,
 'way': 41,
 'market': 42,
 '2': 43,
 'company': 44,
 'may': 45,
 'says': 46,
 'election': 47,
 'home': 48,
 'party': 49,
 'good': 50,
 'going': 51,
 'much': 52,
 'work': 53,
 '2004': 54,
 'still': 55,
 'win': 56,
 'show': 57,
 'think': 58,
 'games': 59,
 'go': 60,
 'top': 61,
 'second': 62,
 'million': 63,
 '6': 64,
 'england': 65,
 'firm': 66,
 'since': 67,
 'week': 68,
 'say': 69,
 'play': 70,
 'part': 71,
 'public': 72,
 'use': 73,
 'blair': 74,
 '3': 75,
 'want': 76,
 'minist

In [25]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_sequences[0]

[90,
 159,
 1140,
 1105,
 48,
 978,
 754,
 6567,
 88,
 1301,
 4283,
 128,
 174,
 3651,
 1212,
 1193,
 1572,
 41,
 6,
 892,
 90,
 6095,
 333,
 84,
 19,
 13,
 129,
 3260,
 1213,
 2419,
 568,
 450,
 1373,
 57,
 3376,
 3520,
 1656,
 7,
 920,
 729,
 9,
 842,
 17495,
 8,
 596,
 1573,
 1106,
 394,
 1936,
 1105,
 730,
 48,
 536,
 1395,
 2005,
 1618,
 133,
 248,
 112,
 2353,
 794,
 4975,
 979,
 582,
 9,
 3953,
 4284,
 920,
 2561,
 128,
 343,
 174,
 3651,
 7819,
 5310,
 38,
 61,
 2865,
 27,
 8,
 4717,
 17,
 1302,
 135,
 415,
 6,
 142,
 1420,
 70,
 4496,
 435,
 4976,
 90,
 1106,
 76,
 6096,
 81,
 2006,
 52,
 7820,
 90,
 5,
 1007,
 615,
 88,
 1301,
 90,
 1958,
 130,
 136,
 419,
 8,
 2866,
 37,
 151,
 1232,
 88,
 1301,
 4718,
 6,
 435,
 4976,
 3151,
 5,
 2490,
 8696,
 430,
 1125,
 7821,
 1421,
 569,
 1313,
 1898,
 13446,
 765,
 8,
 536,
 1395,
 2005,
 133,
 2063,
 399,
 843,
 1959,
 1596,
 33,
 1713,
 2867,
 5311,
 5312,
 2420,
 243,
 8,
 2623,
 81,
 731,
 5,
 1171,
 1194,
 151,
 719,
 589,
 9770,


In [27]:
import tensorflow as tf

In [30]:
from keras_preprocessing.sequence import pad_sequences

train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating= "post"
)

In [31]:
train_padded[0]

array([  90,  159, 1140, 1105,   48,  978,  754, 6567,   88, 1301, 4283,
        128,  174, 3651, 1212, 1193, 1572,   41,    6,  892], dtype=int32)

In [32]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [34]:
print(df.text[0])
print(train_sequences[0])

tv future hands viewers home theatre systems plasma high-definition tvs digital video recorders moving living room way people watch tv radically different five years time. according expert panel gathered annual consumer electronics show las vegas discuss new technologies impact one favourite pastimes. us leading trend programmes content delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices. one talked-about technologies ces digital personal video recorders (dvr pvr). set-top boxes like us tivo uk sky+ system allow people record store play pause forward wind tv programmes want. essentially technology allows much personalised tv. also built-in high-definition tv sets big business japan us slower take europe lack high-definition programming. people forward wind adverts also forget abiding network channel schedules putting together a-la-carte entertainment. us networks cable satellite companies worried means terms ad

In [35]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [36]:
def decode(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [37]:
decode(train_sequences[0])

'tv future hands viewers home theatre systems plasma high definition tvs digital video recorders moving living room way people watch tv radically different five years time according expert panel gathered annual consumer electronics show las vegas discuss new technologies impact one favourite pastimes us leading trend programmes content delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices one talked about technologies ces digital personal video recorders dvr pvr set top boxes like us tivo uk sky system allow people record store play pause forward wind tv programmes want essentially technology allows much personalised tv also built in high definition tv sets big business japan us slower take europe lack high definition programming people forward wind adverts also forget abiding network channel schedules putting together a la carte entertainment us networks cable satellite companies worried means terms advertising 

In [39]:
print(f"shape of train: {train_padded.shape}")
print(f"shape of test: {test_padded.shape}")

shape of train: (1780, 20)
shape of test: (445, 20)


In [49]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.initializers import Constant
from keras.optimizers import Adam

In [74]:
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=32, input_length=max_length))
model.add(Bidirectional(LSTM(64, dropout=0.1)))
model.add(Dense(6, activation="relu"))
model.add(Dense(6, activation="softmax"))

optimizer = Adam(learning_rate=3e-4)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [75]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 20, 32)            1395840   
                                                                 
 bidirectional_6 (Bidirectio  (None, 128)              49664     
 nal)                                                            
                                                                 
 dense_14 (Dense)            (None, 6)                 774       
                                                                 
 dense_15 (Dense)            (None, 6)                 42        
                                                                 
Total params: 1,446,320
Trainable params: 1,446,320
Non-trainable params: 0
_________________________________________________________________


In [76]:
history = model.fit(
    train_padded, train_labels, epochs=20, validation_data=(test_padded, test_labels)
)

Epoch 1/20


2023-03-04 20:23:23.867569: W tensorflow/core/framework/op_kernel.cc:1722] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'Cast_1' defined at (most recent call last):
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/traitlets/config/application.py", line 978, in launch_instance
      app.start()
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 595, in run_forever
      self._run_once()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1881, in _run_once
      handle._run()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/xf/_6m_tb052fn5p84tlt773rqr0000gn/T/ipykernel_1877/1494721296.py", line 1, in <module>
      history = model.fit(
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 894, in train_step
      return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/training.py", line 987, in compute_metrics
      self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 501, in update_state
      metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 70, in decorated
      update_op = update_state_fn(*args, **kwargs)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
      return ag_update_state(*args, **kwargs)
    File "/Users/harendrakumar/.local/share/virtualenvs/ETL-6mpoCg48/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 637, in update_state
      y_true = tf.cast(y_true, self._dtype)
Node: 'Cast_1'
Cast string to float is not supported
	 [[{{node Cast_1}}]] [Op:__inference_train_function_46801]