### Read Dataset

In [1]:
# Load the raw dataset into a DataFrame. The file is tab-separated and has no
# header row, so the two column names are supplied explicitly.
# NOTE(review): read_csv's default quoting treats '"' as a quote character;
# if the raw SMS text contains unbalanced double quotes, rows may get merged.
# Confirm the resulting row count against the dataset's documentation.

import pandas as pd
messages = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["Label", "SMS"],
)

In [2]:
# Show a sample of the loaded DataFrame (its first 5 rows)

messages.head(5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Print the dimensions of the DataFrame as a (rows, columns) tuple

print(messages.shape)

(5572, 2)


In [4]:
# Count how many messages carry each label (ham vs. spam)

messages["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

### Split the dataset

In [5]:
# Shuffle the DataFrame: frac = 1 samples every row (i.e. returns all rows in
# random order), and the fixed random_state makes the shuffle reproducible.

randomized_data = messages.sample(frac = 1, random_state = 1)

In [6]:
# Split the shuffled DataFrame into two parts: training data and test data.
# The training data takes the first 80% of the rows and the test data the
# remaining 20%. The split position is derived from the row count instead of
# being hard-coded, so it stays correct if the dataset size changes; for a
# 5572-row dataset it evaluates to 4458 training rows and 1114 test rows.

TRAIN_FRACTION = 0.8
split_index = round(len(randomized_data) * TRAIN_FRACTION)

# reset_index(drop=True) gives each part a fresh 0..n-1 index and discards the
# shuffled original index.
training_data = randomized_data[:split_index].copy().reset_index(drop=True)
test_data = randomized_data[split_index:].copy().reset_index(drop=True)

In [7]:
# Check the percentage of ham and spam messages in the training data

training_data["Label"].value_counts(normalize=True).mul(100)

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [8]:
# Check the percentage of ham and spam messages in the test data

test_data["Label"].value_counts(normalize=True).mul(100)

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

### Data Cleaning

In [9]:
# Show the training data as it looks before any data cleaning is applied

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [10]:
# Remove punctuation and lowercase every message in the training data.
# Each non-word character (regex \W) is replaced by a single space, so words
# that were separated only by punctuation stay separated.
# The pattern is written as a raw string: in a plain string literal "\W" is an
# invalid escape sequence, which newer Python versions warn about.

training_data["SMS"] = (
    training_data["SMS"]
    .str.replace(r"\W", " ", regex=True)
    .str.lower()
)

In [11]:
# Show the training data after punctuation removal and lowercasing
training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


### Creating Vocabulary

In [12]:
# Turn each message into a list of its words by splitting on whitespace.
# NOTE(review): this overwrites the text column with lists, so the cell is
# presumably not safe to run a second time on the already-split column —
# confirm, and re-run from the cleaning step if it needs to be repeated.

training_data["SMS"] = training_data["SMS"].str.split()

In [13]:
# Show what the training data looks like now that each message is a word list

training_data.head(5)

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."
