# **SMS SPAM DETECTION: PREPROCESSING DATA**

## **IMPORT LIBRARIES**

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

## **LOAD DATASET**

In [None]:
# Load the raw SMS dataset: the label is in 'Kategori' and the message text in 'Pesan'.
df = pd.read_csv('sms_spam_indo.csv')

In [None]:
# Check row count, column dtypes and missing values before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kategori  1143 non-null   object
 1   Pesan     1143 non-null   object
dtypes: object(2)
memory usage: 18.0+ KB


In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Kategori,Pesan
0,spam,Plg Yth: Simcard anda mendptkan bonus poin plu...
1,ham,Iya ih ko sedih sih gtau kapan lg ke bandung :(
2,ham,Kalau mau bikin model/controller mending per a...
3,ham,Selamat nama1. Semoga selalu menempuh hidup ya...
4,spam,Tingkatkan nilai isi ulang Anda selanjutnya mi...


In [None]:
# Spot-check one raw message (row 35): it contains an emoticon and an emoji,
# which makes it a useful before/after example for the cleaning step.
print(df['Pesan'].iloc[35])

Ini hujan semalaman ruang tengah sama kamar kayanya gabisa di nyalain (:() harus nunggu kering baru bisa nyala, smg ujannya berenti 🙏


## **DATA CLEANING**

### **Text Cleaning**

In [None]:
def remove(text):
  """Normalise a raw SMS message to lowercase letters separated by single spaces.

  Steps, in order:
    1. Drop URLs (``www.`` or ``http(s)://`` prefixed) up to the next whitespace.
    2. Drop ``@mention`` and ``#hashtag`` tokens.
    3. Replace every remaining non-letter character (digits, punctuation,
       emoji, newlines) with a space.
    4. Collapse runs of spaces and lowercase the result.

  The URL/mention/hashtag patterns must run before step 3: they are anchored
  on characters such as ``:``, ``/``, ``.``, ``@`` and ``#``, which step 3
  erases, so running them afterwards can never match and leaves fragments
  like "https promo example com" in the text.

  Args:
    text: The message as a ``str``.

  Returns:
    The cleaned message. A single leading or trailing space is kept when the
    input started or ended with characters that were removed.
  """
  # Raw strings keep the regex escapes (\s, \.) from being read as string escapes.
  text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
  text = re.sub(r'((@[^\s]+)|(#[^\s]+))', ' ', text)
  # Also turns newlines into spaces, so no separate '\n' substitution is needed.
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  text = re.sub(r' +', ' ', text)
  return text.lower()

# Clean every message; the raw text in 'Pesan' is overwritten with the cleaned version.
df['Pesan'] = df['Pesan'].apply(remove)

In [None]:
# Preview the messages after text cleaning.
df.head()

Unnamed: 0,Kategori,Pesan
0,spam,plg yth simcard anda mendptkan bonus poin plus...
1,ham,iya ih ko sedih sih gtau kapan lg ke bandung
2,ham,kalau mau bikin model controller mending per a...
3,ham,selamat nama semoga selalu menempuh hidup yang...
4,spam,tingkatkan nilai isi ulang anda selanjutnya mi...


In [None]:
# Re-check row 35 after cleaning: punctuation, the emoticon and the emoji should be gone.
print(df['Pesan'].iloc[35])

ini hujan semalaman ruang tengah sama kamar kayanya gabisa di nyalain harus nunggu kering baru bisa nyala smg ujannya berenti 


In [None]:
# Load the colloquial-Indonesian lexicon and keep only the two columns needed
# to map a slang spelling to its formal form.
df_slang = pd.read_csv('colloquial-indonesian-lexicon.csv')[['slang', 'formal']]
df_slang.head()

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa


In [None]:
# Build the slang -> formal lookup used by replace_slang. Pairing the two
# columns with zip avoids label-based ``[i]`` lookups, which only work while
# df_slang still has its default 0..n-1 index. If a slang term occurs more
# than once, the last row wins.
slang_dict = dict(zip(df_slang['slang'], df_slang['formal']))

In [None]:
def replace_slang(sentence):
    """Replace every token that has an entry in ``slang_dict`` with its mapped value.

    The sentence is tokenised with ``str.split()``, so tokens are matched
    exactly (case-sensitive) and any run of whitespace in the input becomes a
    single space in the output.

    Args:
        sentence: The text to normalise.

    Returns:
        The tokens, with known slang substituted, joined by single spaces.
    """
    normalised = [
        slang_dict[word] if word in slang_dict else word
        for word in sentence.split()
    ]
    return ' '.join(normalised)

# Normalise slang in every cleaned message, then preview the result.
# NOTE(review): the lookup is context-free. In row 0 of the preview, 'plg'
# (from "plg yth") is expanded to 'pulang'; confirm such substitutions are
# acceptable for the downstream model.
df['Pesan'] = df['Pesan'].apply(replace_slang)
df.head()

Unnamed: 0,Kategori,Pesan
0,spam,pulang yth simcard anda mendptkan bonus poin p...
1,ham,iya ih kok sedih sih enggak tau kapan lagi ke ...
2,ham,kalau mau bikin model controller mending per a...
3,ham,selamat nama semoga selalu menempuh hidup yang...
4,spam,tingkatkan nilai isi ulang anda selanjutnya mi...


### **Encoding Target/Label**

In [None]:
def classify(kategori):
  """Encode a message category as an integer target label.

  Matching ignores case and surrounding whitespace, so 'Spam' or ' ham '
  are accepted as well as the canonical lowercase forms.

  Args:
    kategori: The category value, expected to be 'spam' or 'ham'.

  Returns:
    1 for spam, 0 for ham.

  Raises:
    ValueError: If the value is neither 'spam' nor 'ham'. Failing loudly
      avoids silently returning None, which would put missing values in
      the label column.
  """
  label = str(kategori).strip().lower()
  if label == 'spam':
    return 1
  if label == 'ham':
    return 0
  raise ValueError(f"Unknown category {kategori!r}; expected 'spam' or 'ham'")

In [None]:
# Encode the text labels as integers (spam -> 1, ham -> 0).
# Run this cell once per kernel session: afterwards the column holds 0/1,
# which classify does not recognise as a category.
df["Kategori"] = df["Kategori"].apply(classify)
df.head()

Unnamed: 0,Kategori,Pesan
0,1,pulang yth simcard anda mendptkan bonus poin p...
1,0,iya ih kok sedih sih enggak tau kapan lagi ke ...
2,0,kalau mau bikin model controller mending per a...
3,0,selamat nama semoga selalu menempuh hidup yang...
4,1,tingkatkan nilai isi ulang anda selanjutnya mi...


In [None]:
# Reorder the columns so the message text comes first and the label last.
df = df[['Pesan', 'Kategori']]
df.head()

Unnamed: 0,Pesan,Kategori
0,pulang yth simcard anda mendptkan bonus poin p...,1
1,iya ih kok sedih sih enggak tau kapan lagi ke ...,0
2,kalau mau bikin model controller mending per a...,0
3,selamat nama semoga selalu menempuh hidup yang...,0
4,tingkatkan nilai isi ulang anda selanjutnya mi...,1


### **Splitting into Train, Validation, and Test Sets**

In [None]:
# Hold out 20% of the data as the test set. Stratifying on the label keeps the
# spam/ham ratio the same in both parts; random_state fixes the shuffle so the
# split is reproducible.
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Kategori']
)

In [None]:
# Carve the validation set out of the remaining training data (20% of it),
# stratified on the label so the class ratio is preserved.
# This cell overwrites df_train, so re-running it on its own shrinks the
# training set again; re-run the train/test split cell first.
df_train, df_val = train_test_split(
    df_train, test_size=0.2, random_state=42, stratify=df_train['Kategori']
)

In [None]:
# Persist each split to its own CSV file; index=False leaves the shuffled
# row index out of the output.
split_outputs = {
    'train.csv': df_train,
    'validation.csv': df_val,
    'testing.csv': df_test,
}
for csv_name, split_frame in split_outputs.items():
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Confirm the size, dtypes and non-null counts of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 794 to 950
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Pesan     731 non-null    object
 1   Kategori  731 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 17.1+ KB


In [None]:
# Confirm the size, dtypes and non-null counts of the test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 229 entries, 158 to 966
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Pesan     229 non-null    object
 1   Kategori  229 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.4+ KB


In [None]:
# Confirm the size, dtypes and non-null counts of the validation split.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 393 to 129
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Pesan     183 non-null    object
 1   Kategori  183 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.3+ KB
