In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
data=pd.read_csv('/content/drive/MyDrive/spam.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
data.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
def apply_on_data(function):
  data['Message']=data['Message'].apply(function)
  return data['Message'][1]

In [6]:
import re
def remove_html(data):
  pattern=re.compile('<.*?>')
  return pattern.sub(r'',data)

In [7]:
apply_on_data(remove_html)

'Ok lar... Joking wif u oni...'

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def stopwords_removal(text):
  new_text=[]
  for w in text.split():
    if w in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(w)
  return ''.join(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
stopwords_removal(data['Message'][1])

'Ok lar... Joking wif u oni...'

In [10]:
apply_on_data(stopwords_removal)

'Ok lar... Joking wif u oni...'

In [11]:
import string
def remove_punctuations(text):
  for w in string.punctuation:
    text.replace(w,'')
  return text

In [12]:
remove_punctuations(data['Message'][1])

'Ok lar... Joking wif u oni...'

In [13]:
apply_on_data(remove_punctuations)

'Ok lar... Joking wif u oni...'

In [14]:
data['Category']=data['Category'].map({'ham':1,'spam':0})

In [15]:
data['Message']=data['Message'].astype(str)

In [16]:
X=data['Message']
y=data['Category']

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.2,random_state=42)
len(X_train),len(X_test),len(y_train),len(y_test)

(4457, 1115, 4457, 1115)

In [18]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

In [19]:
tokenizer=Tokenizer(num_words=4457,oov_token='nothing')
tokenizer.fit_on_texts(X_train)

In [20]:
sequence=tokenizer.texts_to_sequences(X_train)
padded=pad_sequences(sequence,padding='post',maxlen=100)

In [21]:
padded_n=np.array(padded)
padded_n

array([[ 505, 3807, 3808, ...,    0,    0,    0],
       [ 255,  811, 1271, ...,    0,    0,    0],
       [ 138, 3809,    8, ...,    0,    0,    0],
       ...,
       [ 461,    6,   44, ...,    0,    0,    0],
       [  87,   78,   18, ...,    0,    0,    0],
       [1021,  809,   91, ...,    0,    0,    0]], dtype=int32)

In [22]:
model=keras.Sequential([
    keras.layers.Embedding(4457,24,input_length=100),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16,activation='relu'),
    keras.layers.Dense(1,activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 24)           106968    
                                                                 
 global_average_pooling1d (  (None, 24)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                400       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 107385 (419.47 KB)
Trainable params: 107385 (419.47 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics='accuracy'
)

In [24]:
model.fit(padded_n,y_train,epochs=30,verbose=2)

Epoch 1/30
140/140 - 2s - loss: 0.5384 - accuracy: 0.8470 - 2s/epoch - 16ms/step
Epoch 2/30
140/140 - 1s - loss: 0.3526 - accuracy: 0.8658 - 641ms/epoch - 5ms/step
Epoch 3/30
140/140 - 1s - loss: 0.3199 - accuracy: 0.8658 - 628ms/epoch - 4ms/step
Epoch 4/30
140/140 - 1s - loss: 0.2510 - accuracy: 0.8777 - 962ms/epoch - 7ms/step
Epoch 5/30
140/140 - 1s - loss: 0.1324 - accuracy: 0.9632 - 723ms/epoch - 5ms/step
Epoch 6/30
140/140 - 1s - loss: 0.0730 - accuracy: 0.9803 - 536ms/epoch - 4ms/step
Epoch 7/30
140/140 - 0s - loss: 0.0535 - accuracy: 0.9838 - 494ms/epoch - 4ms/step
Epoch 8/30
140/140 - 1s - loss: 0.0421 - accuracy: 0.9874 - 503ms/epoch - 4ms/step
Epoch 9/30
140/140 - 1s - loss: 0.0357 - accuracy: 0.9906 - 505ms/epoch - 4ms/step
Epoch 10/30
140/140 - 1s - loss: 0.0309 - accuracy: 0.9897 - 510ms/epoch - 4ms/step
Epoch 11/30
140/140 - 1s - loss: 0.0268 - accuracy: 0.9928 - 516ms/epoch - 4ms/step
Epoch 12/30
140/140 - 0s - loss: 0.0234 - accuracy: 0.9935 - 456ms/epoch - 3ms/step
Epo

<keras.src.callbacks.History at 0x7ba68f3aa320>

In [25]:
print(data.head())

   Category                                            Message
0         1  Go until jurong point, crazy.. Available only ...
1         1                      Ok lar... Joking wif u oni...
2         0  Free entry in 2 a wkly comp to win FA Cup fina...
3         1  U dun say so early hor... U c already then say...
4         1  Nah I don't think he goes to usf, he lives aro...


In [35]:
test=['Even my brother is not like to speak with me']
sequence=tokenizer.texts_to_sequences(test)
padded=pad_sequences(sequence,padding='post',maxlen=100,truncating='post')
model.predict(padded)



array([[0.9997436]], dtype=float32)

In [37]:
test=['SIX chances to win CASH! From 100 to 20,000 po']
sequence=tokenizer.texts_to_sequences(test)
padded=pad_sequences(sequence,padding='post',maxlen=100,truncating='post')
model.predict(padded)



array([[0.02316833]], dtype=float32)