### Read Dataset

In [2]:
#Proses membaca dataset yang dimiliki menjadi dataframe

import pandas as pd
messages = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["Label", "SMS"])

In [3]:
#Menampilkan sampel dataframe yang dimiliki

messages.head(5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Menampilkan bentuk dimensi dari dataframe

print(messages.shape)

(5572, 2)


In [5]:
#Menampilkan jumlah pesan ham maupun spam

messages["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

### Split the dataset

In [6]:
#Mengacak dataframe yang dimiliki

randomized_data = messages.sample(frac = 1, random_state = 1)

In [7]:
#Membagi dataframe menjadi 2 bagian, yaitu data training dan data test
#Dimana data training berjumlah 80% dari total data yang dimiliki yaitu 4458
#Sedangkan data test berjumlah 20% dari total data yang dimiliki yaitu 1114

training_data = randomized_data[:4458].copy().reset_index(drop = True)
test_data = randomized_data[4458:].copy().reset_index(drop = True)

In [8]:
#Memeriksa persentase pesan ham dan spam pada data training

training_data["Label"].value_counts(normalize = 1) * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [9]:
#Memeriksa persentase pesan ham dan spam pada data test

test_data["Label"].value_counts(normalize = 1) * 100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

### Data Cleaning

In [10]:
#Menampilkan data sebelum terjadinya data cleaning

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [11]:
#Proses menghilangkan tanda baca dan lowercasing seluruh data

training_data["SMS"] = training_data["SMS"].str.replace("\W", " ", regex = True)
training_data["SMS"] = training_data["SMS"].str.lower()

In [12]:
#Menampilkan data setelah terjadinya proses punctuation dan lowercasing
training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


### Creating Vocabulary

In [13]:
#Mengubah pesan yang ada menjadi sebuah list dari kata-katanya

training_data["SMS"] = training_data["SMS"].str.split()

In [14]:
#Menampilkan bentuk data yang dimiliki sekarang

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [15]:
#menginisialisasi sebuah list kosong dan akan diisi dengan seluruh kata yang ada pada pesan

vocabulary = []
for message in training_data["SMS"]:
    for word in message:
        vocabulary.append(word)

In [16]:
# proses menghilangkan kata-kata yang memiliki jumlah lebih dari 1
# untuk menghindari adanya duplikat

vocabulary = set(vocabulary)
vocabulary = list(vocabulary)

In [17]:
# menampilkan total kata yang ada pada vocabulary

len(vocabulary)

7783

### Finalisasi Data Training

In [19]:
# membuat kamus dari data yang telah kita olah

word_counts_per_sms = {unique_word: [0] * len(training_data["SMS"]) for unique_word in vocabulary}

for i, sms in enumerate(training_data["SMS"]):
    for word in sms:
        word_counts_per_sms[word][i] += 1

In [20]:
# memeriksa nilai list dari beberapa kata yang ada di kamus

for word in vocabulary[:2]:
    print(word, word_counts_per_sms[word])

hol [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
# menampilkan data perhitungan dari setiap kata

word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,hol,resume,fone,korli,2309,creative,theacusations,ela,gee,presnts,...,cares,eppolum,transfred,super,wined,16,3mins,trade,newest,cld
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# menggabungkan dataframe "word_counts" dengan dataframe data training

training_data_full = pd.concat([training_data, word_counts], axis = 1)
training_data_full.head()

Unnamed: 0,Label,SMS,hol,resume,fone,korli,2309,creative,theacusations,ela,...,cares,eppolum,transfred,super,wined,16,3mins,trade,newest,cld
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
