# Import data-set

In [2]:
import os

# Location of the tab-separated SMS data file. Set the SMS_SPAM_DATA_FILE
# environment variable to point the notebook at a copy stored elsewhere;
# when it is unset, the original author's local path is used unchanged.
data_file = os.environ.get(
    'SMS_SPAM_DATA_FILE',
    'C:/Users/ISH KAPOOR/Desktop/SMS_Spam_Filter/SMSSpamCollection.txt',
)

# Read data-set

In [3]:
import pandas as pd

In [6]:
import csv

# The file has no header row, so the two column names are supplied here.
# QUOTE_NONE makes pandas treat double quotes inside a message as ordinary
# characters; with the default quoting, a message containing an unbalanced
# quote can swallow the following rows into a single field.
# NOTE(review): assumes the file is raw tab-separated text rather than
# CSV-quoted -- confirm the loaded row count against the data-set's docs.
data = pd.read_csv(
    data_file,
    sep='\t',
    header=None,
    names=['label', 'sms'],
    quoting=csv.QUOTE_NONE,
)
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Load stopwords and punctuation


In [9]:
import string
import nltk

# Download the two NLTK packages used below.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Characters and words that pre-processing filters out.
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')

# Quick look at both filters.
print(punctuation)
print(stopwords[:5])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to C:\Users\ISH
[nltk_data]     KAPOOR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\ISH
[nltk_data]     KAPOOR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Stop Words
In computing, stop words are words which are filtered out before or after processing of natural language data (text). Though "stop words" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.

# Pre-process SMS content

In [22]:
def pre_process(sms):
    """Turn one raw message into a list of lowercase, filtered tokens.

    Steps, in order:
      1. Drop every character found in the notebook-level ``punctuation``.
      2. Lowercase the remaining characters one by one.
      3. Tokenize the cleaned text with ``nltk.tokenize.word_tokenize``.
      4. Discard tokens present in the notebook-level ``stopwords``.

    Returns the surviving tokens as a list, in their original order.
    """
    kept_chars = []
    for char in sms:
        if char in punctuation:
            continue  # punctuation is removed outright, not replaced by a space
        kept_chars.append(char.lower())
    cleaned_text = "".join(kept_chars)

    tokens = nltk.tokenize.word_tokenize(cleaned_text)
    return [token for token in tokens if token not in stopwords]

In [23]:
# Store the cleaned token list for every message in a new 'processed' column.
data['processed'] = data['sms'].apply(pre_process)
data.head()

Unnamed: 0,label,sms,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


# Categorize Ham/Spam associated words

In [24]:
def categorize_words(frame=None):
    """Collect every processed token, grouped by message label.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with a 'label' column containing 'spam' / 'ham' and a
        'processed' column holding an iterable of tokens per row. When
        omitted, the notebook-level ``data`` frame is used, so existing
        zero-argument calls behave as before.

    Returns
    -------
    tuple of (list, list)
        ``(spam_words, ham_words)``. Tokens appear in row order and
        duplicates are kept, so each list doubles as a frequency record.
    """
    if frame is None:
        frame = data  # fall back to the notebook's global frame

    def tokens_for(label):
        # Flatten the per-message token lists of all rows with this label.
        rows = frame.loc[frame['label'] == label, 'processed']
        return [word for tokens in rows for word in tokens]

    return tokens_for('spam'), tokens_for('ham')

In [25]:
# Build the notebook-level spam and ham word lists that predict() reads.
spam_words, ham_words = categorize_words()

In [26]:
# Peek at the first five tokens of each list, one list per line.
print(spam_words[:5], ham_words[:5], sep='\n')

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


In [37]:
def predict(user_input, spam_corpus=None, ham_corpus=None):
    """Print whether a tokenized message looks like spam or ham.

    For every token in ``user_input`` the function adds up how many times
    that token occurs in the spam word list and in the ham word list, then
    reports the label with the larger total.

    Parameters
    ----------
    user_input : iterable of str
        Tokens of the message to classify (e.g. the output of
        ``pre_process``). A repeated token is counted once per occurrence.
    spam_corpus, ham_corpus : iterable of str, optional
        Word lists to score against. When omitted, the notebook-level
        ``spam_words`` / ``ham_words`` lists are used, so existing
        one-argument calls behave as before.

    Returns
    -------
    None
        The verdict is printed. The percentage labelled "accuracy" is the
        winning label's share of all matched occurrences, not a measured
        classifier accuracy. Equal totals (including no match at all)
        print the fixed 50% message.
    """
    from collections import Counter

    if spam_corpus is None:
        spam_corpus = spam_words
    if ham_corpus is None:
        ham_corpus = ham_words

    # Tally each list once instead of rescanning it for every input token.
    spam_frequency = Counter(spam_corpus)
    ham_frequency = Counter(ham_corpus)

    spam_counter = sum(spam_frequency[word] for word in user_input)
    ham_counter = sum(ham_frequency[word] for word in user_input)

    # The original banner contained an escaped backslash ("\\t"), which
    # printed a literal "\t" in front of the heading; it is a plain tab now.
    print("#######################################################################################\n\t\t\t\t\t\tRESULTS\n#######################################################################################")

    total = ham_counter + spam_counter
    if ham_counter > spam_counter:
        accuracy = round((ham_counter / total) * 100, 2)
        print("Message is not spam with {}% accuracy.".format(accuracy))
    elif ham_counter < spam_counter:
        accuracy = round((spam_counter / total) * 100, 2)
        print("Message is spam with {}% accuracy.".format(accuracy))
    else:
        print("It could be spam with 50% accuracy.")

In [38]:
# Read a message from the user, clean it with the same pre-processing that
# was applied to the data-set, and print the verdict.
user_input = input("Enter a Spam/Ham message:\n")
processed_input = pre_process(user_input)
predict(processed_input)

Enetr a Spam/Ham message:
do you need to use the laptop today? Can I borrow it from you?
#######################################################################################
		\t			RESULTS
#######################################################################################
Message is not spam with 90.71% accuracy.
