In [1]:
!pip install nltk scikit-learn regex numpy pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df = pd.read_csv('Resources/spam_email_nlp.csv')
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [5]:
df.drop('FILE_NAME',axis=1,inplace=True)

In [6]:
df.CATEGORY.value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [7]:
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
message_text = "hi, this is a test message.  I am testing, testing, testing"

In [11]:
message_text

'hi, this is a test message.  I am testing, testing, testing'

In [12]:
words_text = message_text.split()

In [13]:
words_text

['hi,',
 'this',
 'is',
 'a',
 'test',
 'message.',
 'I',
 'am',
 'testing,',
 'testing,',
 'testing']

In [14]:
message_lists = ["hi, this is a test message.  I am testing, testing, testing","test, test, test, one, two, three"]

In [15]:
message_lists

['hi, this is a test message.  I am testing, testing, testing',
 'test, test, test, one, two, three']

In [16]:
word_lists = []
text_list = []
for message_text in message_lists:
  word_list = message_text.split()
  for word in word_list:
    text_list.append(word)
  word_lists.append(word_list)
print(word_lists)
print(text_list)

[['hi,', 'this', 'is', 'a', 'test', 'message.', 'I', 'am', 'testing,', 'testing,', 'testing'], ['test,', 'test,', 'test,', 'one,', 'two,', 'three']]
['hi,', 'this', 'is', 'a', 'test', 'message.', 'I', 'am', 'testing,', 'testing,', 'testing', 'test,', 'test,', 'test,', 'one,', 'two,', 'three']


In [17]:
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
ps = nltk.stem.PorterStemmer()
corpus=[]
words = []
for i in message_lists:
    # removing all non-alphanumeric characters
    message_lists=re.sub('[^a-zA-Z0-9]',' ',i) 
    # converting the message to lowercase
    message_lists=message_lists.lower() 
    # spliting the sentence into words for lemmatization                 
    message_lists=message_lists.split()  
    # removing stopwords and lemmatizing            
    message_lists=[lemmatizer.lemmatize(word) for word in message_lists
             if word not in set(stopwords.words('english'))]
    # message_lists=[ps.stem(word) for word in message_lists]
    for word in message_lists:
      words.append(word)

    # Converting the words back into sentences
    message_lists=' '.join(message_lists)    
    # add message to df
    
    # Adding the preprocessed message to the corpus list            
    corpus.append(message_lists)    

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [18]:
corpus

['hi test message testing testing testing', 'test test test one two three']

In [19]:
words

['hi',
 'test',
 'message',
 'testing',
 'testing',
 'testing',
 'test',
 'test',
 'test',
 'one',
 'two',
 'three']

In [20]:
def get_count(words):
    wordCounts = dict()
    for word in words():
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
    
    return wordCounts

In [21]:
from collections import Counter



In [22]:
Counter(words)

Counter({'hi': 1,
         'test': 4,
         'message': 1,
         'testing': 3,
         'one': 1,
         'two': 1,
         'three': 1})

In [23]:
word_dict = Counter(words)

In [24]:
common = word_dict.most_common()
  

In [25]:
common

[('test', 4),
 ('testing', 3),
 ('hi', 1),
 ('message', 1),
 ('one', 1),
 ('two', 1),
 ('three', 1)]

In [26]:
common_words = []
for entry in common:
  c_word = entry[0]
  common_words.append(c_word)

In [27]:
common_words

['test', 'testing', 'hi', 'message', 'one', 'two', 'three']

In [42]:
message_lists_dict = {'message1':['hi test message testing testing testing', 'test test test one two three']}


In [43]:
message_df = pd.DataFrame(message_lists_dict)
message_df

Unnamed: 0,message1
0,hi test message testing testing testing
1,test test test one two three


In [44]:
message_df.reindex(columns=[*message_df.columns.tolist(), *common_words], fill_value=0)
  

Unnamed: 0,message1,test,testing,hi,message,one,two,three
0,hi test message testing testing testing,0,0,0,0,0,0,0
1,test test test one two three,0,0,0,0,0,0,0


In [45]:
message_df.columns

Index(['message1'], dtype='object')

In [46]:
test_df = pd.DataFrame()

In [53]:
!pip install MRJob
from mrjob.job import MRJob


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting MRJob
  Downloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.6/439.6 KB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: MRJob
Successfully installed MRJob-0.7.4


In [126]:
def word_count(text):
  dict_words = {}
  list_words = []

  for word in common_words:
    if word in text:
      dict_words[word] = 1
      # list_words.append(1)
    else:
      dict_words[word] = 0
      # list_words.append(0)
  return(dict_words)
  # print(list_words)
  # pd.DataFrame(dict_words, index = [0])
  # print(word_df)

  
    

  

    


In [71]:
messagetest = "hi test message testing testing testing"

In [72]:
messagetest1 = messagetest.split()

In [74]:
messagetest1

['hi', 'test', 'message', 'testing', 'testing', 'testing']

In [102]:
word_count(messagetest1)

{'test': 1, 'testing': 1, 'hi': 1, 'message': 1, 'one': 0, 'two': 0, 'three': 0}
[1, 1, 1, 1, 0, 0, 0]
   test  testing  hi  message  one  two  three
0     1        1   1        1    0    0      0


In [127]:
word_df = word_count(messagetest1)

In [119]:
print(word_df)

{'test': 1, 'testing': 1, 'hi': 1, 'message': 1, 'one': 0, 'two': 0, 'three': 0}


In [121]:
word_df_df = pd.DataFrame(word_df, index = [0])

In [122]:
word_df_df

Unnamed: 0,test,testing,hi,message,one,two,three
0,1,1,1,1,0,0,0


In [55]:

words_test = []
for i in range(len(message_df)):
  words = message_df['message1'][i]
  mapper(words)
  reducer(words)
 




  # for w in word:
  #   if w in common_words: 
  #     message_df[w] = 1
  #   else:
  #     message_df[w] = 0


  # message_lists=' '.join(message_lists)    
  #   # add message to df
    
  #   # Adding the preprocessed message to the corpus list            
  #   corpus.append(message_lists)  



NameError: ignored

In [51]:
message_df

Unnamed: 0,message1,test,testing,hi,message,one,two,three
0,hi test message testing testing testing,1,0,0,0,1,1,1
1,test test test one two three,1,0,0,0,1,1,1


In [49]:
message_df.columns

Index(['message1', 'test', 'testing', 'hi', 'message', 'one', 'two', 'three'], dtype='object')

In [58]:
words_test

['hi',
 'test',
 'messag',
 'test',
 'test',
 'test',
 'test',
 'test',
 'test',
 'one',
 'two',
 'three']

In [17]:
cv=CountVectorizer(max_features=2500,ngram_range=(1,3))
X=cv.fit_transform(corpus).toarray()

In [18]:
X

array([[1, 1, 1, 1, 1, 1, 0, 0, 0, 4, 1, 1, 0, 0, 2, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1]])

In [33]:
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()
corpus=[]
words = []
for i in range(len(df)):
    # removing all non-alphanumeric characters
    message=re.sub('[^a-zA-Z0-9]',' ',df['MESSAGE'][i]) 
    # converting the message to lowercase
    message=message.lower() 
    # spliting the sentence into words for lemmatization                 
    message=message.split()  
    # removing stopwords and lemmatizing            
    message=[lemmatizer.lemmatize(word) for word in message
             if word not in set(stopwords.words('english'))]
   for word in message:
     words.append(word)
    # Converting the words back into sentences
    message=' '.join(message)    
    # add message to df
    
    # Adding the preprocessed message to the corpus list            
    corpus.append(message)    

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


AttributeError: ignored

In [17]:
print(len(corpus))

5796


In [31]:
print(len(words))

5796


In [19]:
def get_count(text):
    wordCounts = dict()
    for word in text.split():
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
    
    return wordCounts

In [25]:
wordCounts = {}
for text in corpus:
  get_count(text)

In [26]:
wordCounts

{}

In [12]:
df.head(20)

Unnamed: 0,CATEGORY,MESSAGE
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,1,This is a multi-part message in MIME format.\n...
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,1,This is the bottom line. If you can GIVE AWAY...
5,1,------=_NextPart_000_00B8_51E06B6A.C8586B31\n\...
6,1,"<STYLE type=""text/css"">\n\n<!--\n\nP{\n\n fon..."
7,1,<HR>\n\n<html>\n\n<head>\n\n <title>Secured I...
8,1,"<table width=""600"" border=""20"" align=""center"" ..."
9,1,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."


In [None]:
cv=CountVectorizer(max_features=2500,ngram_range=(1,3))
X=cv.fit_transform(corpus).toarray()
y=df['CATEGORY']

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 7, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.20,random_state=1,stratify=y)

In [None]:
tfid=TfidfVectorizer(ngram_range=(1,3),max_features=2500)
X=tfid.fit_transform(corpus).toarray()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1,stratify=y)

In [None]:
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  100
hidden_nodes_layer2 = 200
hidden_nodes_layer3 = 50

In [None]:
import tensorflow as tf

In [None]:
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 100)               250100    
                                                                 
 dense_5 (Dense)             (None, 200)               20200     
                                                                 
 dense_6 (Dense)             (None, 50)                10050     
                                                                 
 dense_7 (Dense)             (None, 1)                 51        
                                                                 
Total params: 280,401
Trainable params: 280,401
Non-trainable params: 0
_________________________________________________________________


In [None]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
fit_model = nn.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

37/37 - 0s - loss: 0.0724 - accuracy: 0.9905 - 420ms/epoch - 11ms/step
Loss: 0.07240498811006546, Accuracy: 0.990517258644104


In [None]:
nn.save("optimization1.h5")

In [None]:
from tensorflow.keras.models import load_model
model = load_model("optimization1.h5")

In [None]:
print('Predicting...')
message = ["You won 10000 dollars, please provide your account details,So that we can transfer the money"]
message_vector = tfid.transform(message)
indices = tf.placeholder(tf.int64)
shape = tf.placeholder(tf.int64)
values = tf.placeholder(tf.float64)
sparse_tensor = tf.SparseTensor(indices, shape, values)
category = model.predict()
category = model.predict()
print("The message is", "spam" if category == 1 else "not spam")

Predicting...


AttributeError: ignored

In [None]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()

In [None]:
model.fit(x_train,y_train)

In [None]:
train_pred=model.predict(x_train)
test_pred=model.predict(x_test)

In [None]:
print(classification_report(train_pred,y_train))
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      3258
           1       0.89      0.98      0.94      1378

    accuracy                           0.96      4636
   macro avg       0.94      0.97      0.95      4636
weighted avg       0.96      0.96      0.96      4636

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       812
           1       0.91      0.99      0.94       348

    accuracy                           0.96      1160
   macro avg       0.95      0.97      0.96      1160
weighted avg       0.97      0.96      0.97      1160



In [None]:
message = ["You won 10000 dollars, please provide your account details,So that we can transfer the money"]
message_vector = tf.transform(message)
category = model.predict(message_vector)
print("The message is", "spam" if category == 1 else "not spam")

The message is spam
