### Advance NLP

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth',150)
msg = pd.read_csv('../Data/Spam.csv', encoding='latin-1')
msg.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [2]:
msg.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace = True)
msg.rename(columns={'v1':'Label', 'v2':'Text'}, inplace = True)
msg.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
print(msg['Label'].value_counts())

ham     4825
spam     747
Name: Label, dtype: int64


In [4]:
print(msg.isna().sum())

Label    0
Text     0
dtype: int64


In [5]:
msg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


`Preprocessing` Text Data

1. `Remove` Punctuation

2. Tokenization

3. `Remove` Stopwords

In [6]:
import string
print(f'Punctuations : {string.punctuation}')

Punctuations : !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Remove `Punctuation`

In [7]:
import re

def remove_punctuation(text):
    no_punctuation = "".join([char for char in text if char not in string.punctuation])
    return no_punctuation

msg['No Punctuation'] = msg['Text'].apply(lambda x : remove_punctuation(x))
msg.head()

Unnamed: 0,Label,Text,No Punctuation
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


Create `Tokens`

In [8]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

msg['Tokens'] = msg['Text'].apply(lambda x : tokenize(x.lower()))
msg.head()

Unnamed: 0,Label,Text,No Punctuation,Tokens
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat, ]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni, ]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, question, std, txt, r..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say, ]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, don, t, think, he, goes, to, usf, he, lives, around, here, though]"


Remove `Stopwords`

In [9]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokens):
    no_stopwords = [word for word in tokens if word not in stopwords]
    return no_stopwords

msg['No Stopwords'] = msg['Tokens'].apply(lambda x : remove_stopwords(x))
msg.head()

Unnamed: 0,Label,Text,No Punctuation,Tokens,No Stopwords
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat, ]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat, ]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni, ]","[ok, lar, joking, wif, u, oni, ]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, question, std, txt, r...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, question, std, txt, rate, c, apply, 0845..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say, ]","[u, dun, say, early, hor, u, c, already, say, ]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, don, t, think, he, goes, to, usf, he, lives, around, here, though]","[nah, think, goes, usf, lives, around, though]"


Creating Single Function to `Clean Text`

In [10]:
def clean_text(text):
    no_punctuation = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', no_punctuation)
    no_stopwords = [word for word in tokens if word not in stopwords]
    return no_stopwords

Apply `TfidfVectorizer`

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf_vector = tfidf.fit_transform(msg['Text'])
print(tfidf_vector.shape)

(5572, 9395)


In [12]:
X = pd.DataFrame(tfidf_vector.toarray())
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
y = msg['Label']
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Label, dtype: object

Splitting the Data Set into `Train` Set and `Test` Set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Using `Random Forest Classifier`

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score

rfc = RandomForestClassifier()
model = rfc.fit(X_train, y_train)

Making `Prediction`

In [16]:
y_pred = rfc.predict(X_test)

`Evaluate` Model Prediction using `Precision` and `Recall`

In [17]:
print(f'Precision : {precision_score(y_test, y_pred, pos_label="spam")*100:.2f}%')
print(f'Recall    : {recall_score(y_test, y_pred, pos_label="spam")*100:.2f}%')
print(f'Accuracy  : {accuracy_score(y_test, y_pred)*100:.2f}%')

Precision : 100.00%
Recall    : 79.33%
Accuracy  : 97.22%
