### Read Dataset

In [1]:
#Proses membaca dataset yang dimiliki menjadi dataframe

import pandas as pd
messages = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["Label", "SMS"])

In [2]:
#Menampilkan sampel dataframe yang dimiliki

messages.head(5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#Menampilkan bentuk dimensi dari dataframe

print(messages.shape)

(5572, 2)


In [4]:
#Menampilkan jumlah pesan ham maupun spam

messages["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

### Split the dataset

In [5]:
#Mengacak dataframe yang dimiliki

randomized_data = messages.sample(frac = 1, random_state = 1)

In [6]:
#Membagi dataframe menjadi 2 bagian, yaitu data training dan data test
#Dimana data training berjumlah 80% dari total data yang dimiliki yaitu 4458
#Sedangkan data test berjumlah 20% dari total data yang dimiliki yaitu 1114

training_data = randomized_data[:4458].copy().reset_index(drop = True)
test_data = randomized_data[4458:].copy().reset_index(drop = True)

In [7]:
#Memeriksa persentase pesan ham dan spam pada data training

training_data["Label"].value_counts(normalize = 1) * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [8]:
#Memeriksa persentase pesan ham dan spam pada data test

test_data["Label"].value_counts(normalize = 1) * 100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

### Data Cleaning

In [9]:
#Menampilkan data sebelum terjadinya data cleaning

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [10]:
#Proses menghilangkan tanda baca dan lowercasing seluruh data

training_data["SMS"] = training_data["SMS"].str.replace("\W", " ", regex = True)
training_data["SMS"] = training_data["SMS"].str.lower()

In [11]:
#Menampilkan data setelah terjadinya proses punctuation dan lowercasing
training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


### Creating Vocabulary

In [12]:
#Mengubah pesan yang ada menjadi sebuah list dari kata-katanya

training_data["SMS"] = training_data["SMS"].str.split()

In [13]:
#Menampilkan bentuk data yang dimiliki sekarang

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [14]:
#menginisialisasi sebuah list kosong dan akan diisi dengan seluruh kata yang ada pada pesan

vocabulary = []
for message in training_data["SMS"]:
    for word in message:
        vocabulary.append(word)

In [15]:
# proses menghilangkan kata-kata yang memiliki jumlah lebih dari 1
# untuk menghindari adanya duplikat

vocabulary = set(vocabulary)
vocabulary = list(vocabulary)

In [16]:
# menampilkan total kata yang ada pada vocabulary

len(vocabulary)

7783

### Finalisasi Data Training

In [17]:
# membuat kamus dari data yang telah kita olah

word_counts_per_sms = {unique_word: [0] * len(training_data["SMS"]) for unique_word in vocabulary}

for i, sms in enumerate(training_data["SMS"]):
    for word in sms:
        word_counts_per_sms[word][i] += 1

In [18]:
# memeriksa nilai list dari beberapa kata yang ada di kamus

for word in vocabulary[:2]:
    print(word, word_counts_per_sms[word])

experiencehttp [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
# menampilkan data perhitungan dari setiap kata

word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,experiencehttp,subject,um,definitly,shouted,forward,achieve,senor,08718727870,causing,...,dare,alfie,nearby,gail,soooo,mone,interflora,sez,currently,janinexx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# menggabungkan dataframe "word_counts" dengan dataframe data training

training_data_full = pd.concat([training_data, word_counts], axis = 1)
training_data_full.head()

Unnamed: 0,Label,SMS,experiencehttp,subject,um,definitly,shouted,forward,achieve,senor,...,dare,alfie,nearby,gail,soooo,mone,interflora,sez,currently,janinexx
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Menghitung konstanta 


Menghitung probablitas suatu pesan adalah spam dan ham

Menggunakan laplace smoothing dengan alpa = 1

In [22]:
#menghitung probablitas suatu pesan ham dan ham
p_spam = training_data_full[training_data_full["Label"]=="spam"].shape[0]/training_data_full.shape[0]
p_ham = training_data_full[training_data_full["Label"]=="ham"].shape[0]/training_data_full.shape[0]

print("P(spam): {}".format(round(p_spam,7)))
print("P(ham): {}".format(round(p_ham,7)))

P(spam): 0.1345895
P(ham): 0.8654105


In [23]:
#menghitung NSpam
n_spam = training_data_full[training_data_full["Label"]=="spam"]["SMS"].apply(len).sum()
n_ham = training_data_full[training_data_full["Label"]=="ham"]["SMS"].apply(len).sum()

print(n_spam)
print(n_ham)

15190
57237


In [24]:
#jumlah vocabolary
n_vocabulary = len(vocabulary)
print(n_vocabulary)


#laplace dengan nilai alpha = 1
alpha = 1

7783


# Calculating parameters

Setelah itu menghitung parameter P(W|S) dan P(W|H) di mana setiap parameter memiliki nilai conditional probability yang terasosiasi dengan word pada vocabolary dengan rumus :

![Image](https://render.githubusercontent.com/render/math?math=P%28w_i%7CSpam%29%20%3D%20%5Cfrac%7BN_%7Bw_i%7CSpam%7D%20%2B%20%5Calpha%7D%7BN_%7BSpam%7D%20%2B%20%5Calpha%20%5Ccdot%20N_%7BVocabulary%7D%7D&mode=display)
![Image](https://render.githubusercontent.com/render/math?math=P%28w_i%7CHam%29%20%3D%20%5Cfrac%7BN_%7Bw_i%7CHam%7D%20%2B%20%5Calpha%7D%7BN_%7BHam%7D%20%2B%20%5Calpha%20%5Ccdot%20N_%7BVocabulary%7D%7D&mode=display)

In [25]:
#pertama inisiasikan dictionary untuk menyimpan nilai probability untuk setiap kata pada subset ham dan spam
word_probabilities_given_spam = {unique_word: 0 for unique_word in vocabulary}
word_probabilities_given_ham = {unique_word: 0 for unique_word in vocabulary}

#Split training set menjadi subset spam dan ham
training_data_spam = training_data_full[training_data_full["Label"]=="spam"]
training_data_ham = training_data_full[training_data_full["Label"]=="ham"]

In [26]:
#perhitungan P(Wi|Spam) dan P(Wi|Ham) untuk setiap vocabolary
for unique_word in vocabulary:
    word_probabilities_given_ham[unique_word] = (training_data_ham[unique_word].sum() + alpha) / (n_ham + (alpha * n_vocabulary))
    word_probabilities_given_spam[unique_word] = (training_data_spam[unique_word].sum() + alpha) / (n_spam + (alpha * n_vocabulary))

In [27]:
#Menampilkan nilai probabilitas dari beberapa kata

for word in vocabulary[:5]:
    print(word, word_probabilities_given_ham[word])

experiencehttp 1.537988311288834e-05
subject 1.537988311288834e-05
um 3.075976622577668e-05
definitly 3.075976622577668e-05
shouted 4.6139649338665025e-05


# Melakukan klasifikasi Pesan Baru

Setelah menghitung konstanta dan parameter maka dapat dibangun spam filter dengan proses sebagai berikut:
- Spam filter menerima input berupa pesan(w1,w2,...wn)
- Menghitung probabilitas pesan sebagai spam dan ham P(Spam|w1,w2,..wn)
- Memandingkan nilai probablitas Spam dan Ham dimana :
 - Jika P(H|w1,w2,...wn) > P(S|w1,w2,...wn) maka pesan itu ham
 - Jika P(S|w1,w2,...wn) > P(H|w1,w2,...wn) maka pesan itu spam
 - Jika P(H|w1,w2,...wn) = P(Spam|w1,w2,...wn) perlu bantuan manusia


# Main Application

In [28]:
#Membuat kelas "classify" yang berfungsi untuk mendeteksi pesan yang akan diberikan oleh user

import re

def classify():
    global messages
    
    #Menginisialisasi list result sebagai penampung input yang akan diberikan oleh user
    #dan label yang akan diprediksi oleh model
    result = []
    
    #Meminta input dari user
    message = input('Input your message : ')
    
    #Memasukkan inputan user ke dalam variabel result
    result.append(message)
    
    #Melakukan preprocessing terhadap inputan user agar dapat diolah lebih lanjut oleh model
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    #Proses menghitung probabilitas setiap kata dari inputan user
    for word in message:
        if word in word_probabilities_given_spam:
            p_spam_given_message *= word_probabilities_given_spam[word]
            
        if word in word_probabilities_given_ham:
            p_ham_given_message *= word_probabilities_given_ham[word]
    
    #Penentuan label terhadap inputan user
    if p_ham_given_message > p_spam_given_message:
        result.append("Ham")
    elif p_ham_given_message < p_spam_given_message:
        result.append("Spam")
    else:
        print('Equal probabilities, have a human classifiy this!')
        
    #Proses memasukkan inputan user dan prediksi kedalam dataset yang dimiliki
    t_df = pd.DataFrame({'Label': [result[1]], 'SMS': [result[0]]})
    messages = messages.append(t_df, ignore_index = True)
    
    #Menampilkan hasil prediksi kepada user
    print('This message is : ', result[1])

In [31]:
#Percobaan aplikasi pertama

classify()

Input your message : Congrats, this is your code for claim your prize : 2169 
This message is :  Spam


In [32]:
#Percobaan aplikasi kedua

classify()

Input your message : Hello 
This message is :  Ham


# Melakukan perhitungan Akurasi

In [34]:
#Akurasi Naive Bayes untuk Melakukan Spam Filtering
# Again, assumption: message is a string
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in word_probabilities_given_spam:
            p_spam_given_message *= word_probabilities_given_spam[word]
        
        if word in word_probabilities_given_ham:
            p_ham_given_message *= word_probabilities_given_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [35]:
test_data['predicted'] = test_data['SMS'].apply(classify_test_set)
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [37]:
#Sekaranang dapat dibandingkan perbandingan nilai aktual ke seberapa baik program naive bayes ini dalam melakukan klasifikasi pesan
correct = 0
total = test_data.shape[0]

for row in test_data.iterrows():
    row = row[1]
    if row["Label"] == row["predicted"]:
        correct += 1
        
accuracy = correct / total

print(accuracy)

0.9874326750448833
