# **Step 1: Connect to Google Drive**

In [11]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Get the file from Google Drive using location
data_file = '/content/drive/My Drive/Colab Notebooks/SpamFilter/SMSSpamCollection.txt'

# ***Step 2: Load Dataset***

In [32]:
# Load Dataset
import pandas as pd
data = pd.read_csv(data_file, sep='\t', header=None, names=['Label', 'SMS'])
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# ***Step 3: PreProcessing***

In [33]:
# Load Stopwords & Punctuation

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# See 10 stopwords
print(stopwords[:10])

# See all punctuation
print(punctuation)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


**PreProcess SMS Content**

In [35]:
# Function to convert in lowercase & remove punctuation 

def pre_process(SMS):
  # to lowercase & remove punctuation
  lowerCase = "".join([char.lower() for char in SMS if char not in punctuation])
  # Tokenize
  tokenize = nltk.tokenize.word_tokenize(lowerCase)
  # Remove Stopwords
  remove_stopwords = [word for word in tokenize if word not in stopwords]
  return remove_stopwords

In [36]:
# Creating modified new column 

data['processed'] = data['SMS'].apply(lambda x : pre_process(x))

data.head()

Unnamed: 0,Label,SMS,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


# ***Step 4: Categorize ham/spam associated words***

In [38]:
# Function to categorizing ham/spam associated words

def categorize_words():
  spam_words = []
  ham_words = []

  #spam associated words
  for sms in data['processed'][data['Label'] == 'spam']:
    for word in sms:
      spam_words.append(word)

 #ham associated words
  for sms in data['processed'][data['Label'] == 'ham']:
    for word in sms:
      ham_words.append(word)

  return spam_words, ham_words

spam_words, ham_words = categorize_words()

print(spam_words[:5])
print(ham_words[:5])
  

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


# ***Step 5: Predict Function***

In [57]:
# Iterate over all the words from the user input and count their occurances in both ham_words and spam_words

def predict(user_input):
  spam_counter = 0
  ham_counter = 0

  for word in user_input:
    spam_counter += spam_words.count(word)
    ham_counter += ham_words.count(word)

  print('******* Result ********')
  if ham_counter > spam_counter:
    # Adding accuracy
    accuracy = round((ham_counter / (ham_counter + spam_counter)) * 100, 2)
    print('SMS is NOT Spam with {}% accuracy'.format(accuracy))
  elif ham_counter < spam_counter:
    accuracy = round((spam_counter / (ham_counter + spam_counter)) * 100,2)
    print('SMS is Spam with {}% accuracy'.format(accuracy))
  else:
    print('Could be Spam with 50% accuracy')

# ***Step 6: Collect User Input and Predict Result***

In [58]:
# Taking user input

user_input = input('Type SMS to check Spam or Ham\n')

Type SMS to check Spam or Ham
I have to stay focused


In [59]:
# PreProcess the user input SMS

processed_Input = pre_process(user_input)

In [60]:
# Now Predict

predict(processed_Input)

******* Result ********
SMS is NOT Spam with 86.96% accuracy
