<a href="https://colab.research.google.com/github/joe94113/SimpleSmsSpamFilter/blob/main/SimpleSmsSpamFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# connect to google drive
# Data Link:http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
data_file = "/content/drive/MyDrive/Colab Notebooks/SMSSpamCollection.txt"

In [26]:
import pandas as pd

data = pd.read_csv(data_file, sep = "\t", header=None, names=["label", "sms"])
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# load stopwords and punctuation

import string
import nltk
nltk.download("stopwords")
nltk.download("punkt")

# english 停用詞
stopwords = nltk.corpus.stopwords.words("english")
# 標點符號
punctuation = string.punctuation
print(stopwords[:5])
print(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [42]:
# pre-process sms content
def pre_process(sms):
  # 把字符都變成小寫，以及去除標點符號
  lowercase = "".join([char.lower() for char in sms if char not in punctuation])
  # remove_punct = "".join([char for char in sms if char not in punctuation])
  # 使用word_tokenize斷詞
  tokenize = nltk.tokenize.word_tokenize(lowercase)
  remove_stopwords = [word for word in tokenize if word not in stopwords]
  return remove_stopwords
data["processed"] = data["sms"].apply(lambda x :pre_process(x))
data.head()

Unnamed: 0,label,sms,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [43]:
# categorizing ham/spam associated words

def categorize_words():
  spam_words = []
  ham_words = []

  # spam associated words
  for sms in data["processed"][data["label"] == "spam"]:
    for word in sms:
      spam_words.append(word)

  # ham associated words
  for sms in data["processed"][data["label"] == "ham"]:
    for word in sms:
      ham_words.append(word)
  
  return spam_words, ham_words

spam_words, ham_words = categorize_words()
print(spam_words[:5])
print(ham_words[:5])

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


In [59]:
# itterate over all the words from the user input and count their occurances in both ham_words and spam_words

def predict(user_inout):
  spam_counter = 0
  ham_counter = 0

  for word in user_input:
    spam_counter += spam_words.count(word)
    ham_counter += ham_words.count(word)
  print("-------------------*result*--------------------")
  if ham_counter > spam_counter:
    # adding accuracy
    # 顯示到小數點後兩位
    accuracy = round((ham_counter / (ham_counter+spam_counter)) * 100, 2)
    print(f"message is not spam, with {accuracy} accuracy")
  elif spam_counter > ham_counter:
    accuracy = round((spam_counter / (ham_counter+spam_counter)) * 100, 2)
    print(f"message is spam, with {accuracy} accuracy")
  else:
    print("message could be spam, with 50% accuracy")

In [57]:
# collect uer input

user_input = input("please type a spam or ham message to check if our funtion predicts properly\n")

please type a spam or ham message to check if our funtion predicts properly
i am handsome


In [60]:
processed_input = pre_process(user_input)
predict(processed_input)

-------------------*result*--------------------
message is not spam, with 93.59 accuracy
