In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/spam-email/spam.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Scenario
Company dislikeSpam approaches us to improve their customer service by allowing their customers to reach out to them with text messages. However, since they are afraid that they will receive a lot of spam messages, they would like to recognize and filter them automatically. Our task is to build a first prototype and create a proof of concept.

![](http://www.dww.com/sites/default/files/styles/landscape_ri/public/shutterstock_172545959.jpg?itok=mPrJKywC)
Image source: http://www.dww.com/sites/default/files/styles/landscape_ri/public/shutterstock_172545959.jpg?itok=mPrJKywC

# Understand Data

In [2]:
#Import regured modules 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# suppress scientific notation
np.set_printoptions(suppress=True) 
pd.options.display.float_format = '{:.2f}'.format

# Read Data
df = pd.read_csv('/kaggle/input/spam-email/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Note:
What is ham and spam?
"Ham" is e-mail that is not Spam. In other words, "non-spam", or "good mail". It should be considered a shorter, snappier synonym for "non-spam". Its usage is particularly common among anti-spam software developers, and not widely known elsewhere; in general it is probably better to use the term "non-spam", instead.

[source](https://cwiki.apache.org/confluence/display/spamassassin/Ham#:~:text=%22Ham%22%20is%20e%2Dmail,non%2Dspam%22%2C%20instead.)

In [3]:
# DataFrame shape - Number of rows and columns
df.shape

(5572, 2)

In [4]:
# Take a look at two example of not-spam and spam message
print('Example of not-spam message: ', df.loc[0, 'Message'])
print('Example of spam message: ', df.loc[2, 'Message'])

Example of not-spam message:  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Example of spam message:  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


# Data Cleaning and Prepration
In this step, first, the text data is cleaned to correct errors and give it more structure for analysis (data cleaning and preparation). Then[ NLP methods](https://www.ibm.com/topics/natural-language-processing#:~:text=the%20next%20step-,What%20is%20natural%20language%20processing%3F,same%20way%20human%20beings%20can.) are used to get certain parts of the text that reflect the categories as best as possible. These extracted features can then be used with classification models. In this project, [Support Vector Machine](https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47) is going to be used as classifer.

In [5]:
# Import required modules for playing with strings
import string
import re

# NLP modules
import spacy
import nltk

In [6]:
#  Load spacy's English statistical model to enable NLP tasks for an English corpus
from spacy.lang.en.examples import sentences
nlp = spacy.load('en_core_web_sm')
print(type(nlp))

<class 'spacy.lang.en.English'>


We see that nlp has the data type Language. This means that it contains all the components necessary to process English-language text. Fpr more details please see [here](https://spacy.io/usage/models).

### Cleaning and processing techniques are:
[Useful links:](https://www.turing.com/kb/natural-language-processing-function-in-ai)
#### tokenization
In order to properly analyze text data, machine learning models should be able to recognize structures in the text, such as individual words and their parts of speech. We achieve this through tokenization. The corpus is broken down into meaningful linguistic units, such as words or sentences, and saved as a list. The elements of this list are called tokens. 

#### Lemmatization
Lemmatization means that words are reduced to their basic form, also known as lemma. For grammatical reasons, different forms of the same word can be used in one text, e.g. make, makes, making or maker. Python would consider these variations of the word make as separate words, even though their meaning is the same. Through lemmatization, we bring all variations of make to this basic form.
#### Stop word removal
Stop word removal is used to remove common words from a text. Stop words are usually articles ("the" and "a"), pronouns like "I" and "you" (already removed in the previous step), or common verbs ("be", "can"). These words are common in most English language texts. Removing these words would reduce the amount of data that needs to be analyzed while allowing machine learning algorithms to place more weight on tokens that give a text its real meaning.

In [7]:
# Import stopwords from nltk
from nltk.corpus import stopwords
# Save stopwords as a set
stopWords = set(stopwords.words('english'))
type(stopWords)
print(stopWords)

{'now', "didn't", 've', 'these', 'mustn', 'each', 'such', 'further', 'our', 'y', "hadn't", 'in', 'some', 'you', "you've", 're', 'the', 'before', 'does', "aren't", 'wasn', 'him', 'themselves', 'no', "don't", 'didn', 'through', "shouldn't", 'she', 'itself', 'during', 'weren', 'did', "wouldn't", 'when', 'am', 'at', 'their', 'haven', 'all', 'hers', 'own', 'as', "hasn't", 'm', 'for', 'has', 'then', 'hadn', "you're", "you'd", 'against', "haven't", 'both', 'above', 'he', "mightn't", 'after', 'where', 'below', "mustn't", 'was', 'any', 'don', 'is', 'down', 'same', 'yourself', 'few', 'than', 'into', 'about', 'if', 'me', 'or', 'theirs', 'o', 'herself', 'yours', 'couldn', 'shouldn', "needn't", 'his', "wasn't", 'ourselves', 'have', 'ours', 'once', 'will', 'that', 'be', 'until', 'wouldn', 'having', "it's", 'doing', 'up', 'more', 'here', 'my', 'so', 'do', "isn't", 'had', 'under', 'ain', "doesn't", 'shan', 'those', 'there', 'not', 'a', "should've", 'them', 'an', 'your', "weren't", 'hasn', "that'll", '

#### Removal of punctuation marks
Just like stop word removal, punctuation removal involves removing punctuation marks and symbols that do not contribute to the meaning of the text. We can use punctuation from the string module. This is a string consisting of punctuation marks and symbols. We can remove these from our text like the stop words.

In [8]:
# List of punctuation marks
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Steps explained above are all packed into function message_cleaner is going to be used for cleaninf and processing of our text data. 

In [9]:
# `Function message_cleaner
def message_cleaner(sentence):
    """
    Function message_cleaner clean the text using typical Natural Language Processing
    (NLP) steps. 
    Steps include: Lemmatization, removing stop words, removing punctuations  
    Args:
        sentence (str): The uncleaned text. 
    Returns:
        str: The cleaned text.
    """
    # Create the Doc object named `text` from `sentence` using `nlp()`
    text = nlp(sentence)    
    # Lemmatization - remove the lemmas -PRON-     
    text = [ token.lemma_ for token in text if token.lemma_ != "-PRON-"]
    # Remove stop words
    text = [ token for token in text if token not in stopWords ]
    # Remove punctuations
    text = [ token for token in text if token not in punctuations]
    # Use the .join() method on text to convert string
    text = " ".join(text)
    # Use re.sub() to substitute multiple spaces or dots`[\.\s]+` to single space `' '`
    text  = re.sub('[\.\s]+', ' ', text)
    
    # Return the cleaned text
    return text

Now it's time to apply our function to all messages in df. Apply my_series.apply() to the 'msg' column of df and pass it the text_cleaner function to apply it to all text messages. Store the results in a new column in df called 'msg_clean'.

In [10]:
# Apply function message_cleaner to all text messages and store the results in new column as message_cleaned
df.loc[:,'message_cleaned'] = df.loc[:,'Message'].apply(message_cleaner)

In [11]:
# Display the results
df

Unnamed: 0,Category,Message,message_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah I think go usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time try 2 contact u u win £ 750 Pound pri...
5568,ham,Will ü b going to esplanade fr home?,ü b go esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching I act like I would interested buy...


In [12]:
# 
