In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline    
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score

In [2]:
data=pd.read_csv("D:/project/.venv/Internship/spam_mail/spam.csv", encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data['v1'].unique()

array(['ham', 'spam'], dtype=object)

In [4]:
data.rename(columns={'v1':'target','v2':'Message'},inplace=True)
data.head()

Unnamed: 0,target,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data_txt = data.drop(columns = ["Unnamed: 2","Unnamed: 3","Unnamed: 4"], axis=1)

In [6]:
data_txt.head()

Unnamed: 0,target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Lower Case

In [7]:
data_txt['Message']=data_txt['Message'].str.lower()
data_txt.head()

Unnamed: 0,target,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [8]:
data_txt.shape

(5572, 2)

# Data pre processing
## Duplicate Data

In [9]:
data_txt.duplicated().sum()

404

In [10]:
data_txt.drop_duplicates(inplace=True)

# Remove white space

In [11]:
def remove_whitespace(text):
  return " ".join(text.split())

data_txt['Message']=data_txt['Message'].apply(remove_whitespace)

# Tokenization

In [12]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
data_txt['Message']=data_txt['Message'].apply(lambda X: word_tokenize(X))
data_txt.head()

Unnamed: 0,target,Message
0,ham,"[go, until, jurong, point, ,, crazy, .., avail..."
1,ham,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"[nah, i, do, n't, think, he, goes, to, usf, ,,..."


In [14]:
data_txt['Message'][0]

['go',
 'until',
 'jurong',
 'point',
 ',',
 'crazy',
 '..',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'cine',
 'there',
 'got',
 'amore',
 'wat',
 '...']

# Remove Stopward

In [15]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
en_stopwords = stopwords.words('english')
def remove_stopwords(text):
  result = []
  for token in text:
    if token not in en_stopwords:
      result.append(token)
  return result

In [17]:
data_txt['Message'] = data_txt['Message'].apply(remove_stopwords)
data_txt['Message']

0       [go, jurong, point, ,, crazy, .., available, b...
1                [ok, lar, ..., joking, wif, u, oni, ...]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3       [u, dun, say, early, hor, ..., u, c, already, ...
4       [nah, n't, think, goes, usf, ,, lives, around,...
                              ...                        
5567    [2nd, time, tried, 2, contact, u., u, å£750, p...
5568               [ì_, b, going, esplanade, fr, home, ?]
5569           [pity, ,, *, mood, ., ..., suggestions, ?]
5570    [guy, bitching, acted, like, 'd, interested, b...
5571                                [rofl, ., true, name]
Name: Message, Length: 5168, dtype: object

# Remove puncution

In [18]:
from nltk.tokenize import RegexpTokenizer
def remove_punct(text):
  tokenizer = RegexpTokenizer(r"\w+")
  lst=tokenizer.tokenize(' '.join(text))
  return lst

In [19]:
data_txt['Message'] = data_txt['Message'].apply(remove_punct)
data_txt['Message']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, n, t, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å, 750, p...
5568                  [ì_, b, going, esplanade, fr, home]
5569                            [pity, mood, suggestions]
5570    [guy, bitching, acted, like, d, interested, bu...
5571                                   [rofl, true, name]
Name: Message, Length: 5168, dtype: object

In [None]:
from nltk import FreqDist


def frequent_words(df):
    lst=[]
    for text in df.values:
        lst+=text[0]
    fdist=FreqDist(lst)
    return fdist.most_common(20)
frequent_words(data_txt)


[('a', 5168), ('m', 5168), ('h', 4515), ('s', 653), ('p', 653)]

In [21]:
freq_words = frequent_words(data_txt)


lst = []
for a,b in freq_words:
  lst.append(b)


def remove_freq_words(text):
  result=[]
  for item in text:
    if item not in lst:
      result.append(item)
  return result

data_txt['Message']=data_txt['Message'].apply(remove_freq_words)

# Removal of tags

In [22]:
import re
def remove_tag(text):


    text=' '.join(text)
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [23]:
data_txt['Message']=data_txt['Message'].apply(remove_tag)
data_txt.head()

Unnamed: 0,target,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah n t think goes usf lives around though


# Removal of tags

In [24]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [25]:
data_txt['Message']=data_txt['Message'].apply(remove_urls)
data_txt.head()

Unnamed: 0,target,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah n t think goes usf lives around though


# chat word removal

In [26]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LMK=Let Me Know
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [27]:
chat_words_map_dict = {}
chat_words_list = []
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [28]:
data_txt['Message']=data_txt['Message'].apply(chat_words_conversion)
data_txt.head()

Unnamed: 0,target,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif You oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,You dun say early hor You c already say
4,ham,nah n t think goes usf lives around though


# Stemming & Lemmatization 

In [29]:
import nltk
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
def stemming(text):
  porter = PorterStemmer()
  result=[]
  for word in text:
    result.append(porter.stem(word))
  return result

In [31]:
data_txt['Message']=data_txt['Message'].apply(stemming)
data_txt.head()

Unnamed: 0,target,Message
0,ham,"[g, o, , j, u, r, o, n, g, , p, o, i, n, t, ..."
1,ham,"[o, k, , l, a, r, , j, o, k, i, n, g, , w, ..."
2,spam,"[f, r, e, e, , e, n, t, r, y, , 2, , w, k, ..."
3,ham,"[y, o, u, , d, u, n, , s, a, y, , e, a, r, ..."
4,ham,"[n, a, h, , n, , t, , t, h, i, n, k, , g, ..."


In [32]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
def lemmatization(text):
  result=[]
  wordnet = WordNetLemmatizer()
  for token,tag in pos_tag(text):
    pos=tag[0].lower()
    if pos not in ['a', 'r', 'n', 'v']:
      pos='n'
    result.append(wordnet.lemmatize(token,pos))
  return result


In [34]:
data_txt['Message']=data_txt['Message'].apply(lemmatization)
data_txt.head()

Unnamed: 0,target,Message
0,ham,"[g, o, , j, u, r, o, n, g, , p, o, i, n, t, ..."
1,ham,"[o, k, , l, a, r, , j, o, k, i, n, g, , w, ..."
2,spam,"[f, r, e, e, , e, n, t, r, y, , 2, , w, k, ..."
3,ham,"[y, o, u, , d, u, n, , s, a, y, , e, a, r, ..."
4,ham,"[n, a, h, , n, , t, , t, h, i, n, k, , g, ..."


In [35]:
X=data_txt['Message']
data_txt['target']=data_txt['target'].map({'ham':1,'spam':0})
Y=data_txt['target']

# Model training

In [36]:
# Splitting data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [37]:
X_train_text = [''.join(map(str, vec)) for vec in X_train]
X_test_text = [''.join(map(str, vec)) for vec in X_test]


In [38]:
X_train_text 

['aight time want come',
 'mostly like',
 'free video camera phones half price line rental 12 mths 500 cross ntwk mins 100 txts call mobileupd8 08001950382 call2optout 674',
 'argh 3g spotty anyway thing remember research province sterling problem free places looked',
 'per request maangalyam alaipayuthe set callertune callers press 9 copy friends callertune',
 'please n t text anymore nothing else say',
 'know shall speak lt gt minutes',
 'urgent mobile awarded å 2 000 bonus caller prize 1 08 03 2nd attempt contact call 0871 4719 523 box95qu bt national rate',
 'wif family booking tour package',
 'reach ten morning',
 'reading gud habit nan bari hudgi yorge pataistha ertini kano',
 'sometimes heart remembrs someone much forgets someone soon bcoz heart like everyone liked ones remembered everytime bslvyl',
 'cant keep talking people sure pay agree price pls tell want really buy much willing pay',
 'guys',
 'cocksuckers makes feel better ipads worthless garbage novelty items feel bad ev

In [39]:
Y_test.shape

(1034,)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline_RF=Pipeline(steps=[('tv',TfidfVectorizer()),('RF',RandomForestClassifier(n_estimators=100))])
pipeline_RF.fit(X_train_text,Y_train)
acc=pipeline_RF.score(X_test_text,Y_test)
y_pred=pipeline_RF.predict(X_test_text)
cv=cross_val_score(pipeline_RF,X_train_text ,Y_train,scoring='accuracy',cv=5)
precision=precision_score(Y_test,y_pred,average='weighted')
f1=f1_score(Y_test,y_pred,average='weighted')
print(f'F1 score:{f1*100:.2f}')
print(f'precision:{precision*100:.2f}')
print(f'cv:{cv.mean():.2f}')
print(f'Accuracy:{acc*100:.2f}')

F1 score:97.17
precision:97.37
cv:0.97
Accuracy:97.29


In [41]:
Y_pred=pipeline_RF.predict(['Boltblue tones for 150p Reply POLY# or MONO# eg POLY3 1. Cha Cha Slide 2. Yeah 3. Slow Jamz 6. Toxic 8. Come With Me or STOP 4 more tones txt MORE	'])
Y_pred

array([0], dtype=int64)

In [42]:
import pickle
with open ('model_1.pkl', 'wb') as f:
    pickle.dump(pipeline_RF , f)