In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('dataset/cleaned_dataset.csv')
data.head()

Unnamed: 0,text,label
0,naturally irresistible your corporate identit...,spam
1,the stock trading gunslinger fanny is merril...,spam
2,unbelievable new homes made easy im wanting ...,spam
3,4 color printing special request additional ...,spam
4,"do not have money , get software cds from her...",spam


In [3]:
data['label'] = data['label'].replace(['spam', 'not spam'], [1, 0])

In [4]:
data.head()

Unnamed: 0,text,label
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


In [5]:
data.shape

(14299, 2)

In [6]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
data.duplicated().sum()

575

In [8]:
data.drop_duplicates(inplace=True)
data.duplicated().sum()

0

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13724 entries, 0 to 14298
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13724 non-null  object
 1   label   13724 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 321.7+ KB


In [10]:
data['label'] = data['label'].astype(str)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13724 entries, 0 to 14298
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13724 non-null  object
 1   label   13724 non-null  object
dtypes: object(2)
memory usage: 321.7+ KB


In [12]:
data['label'] = data['label'].apply(lambda x: "text_result" if len(x) > 1 else x) 
data = data[data['label'] != "text_result"]

In [13]:
data['label'] = data['label'].astype(int)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13724 entries, 0 to 14298
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13724 non-null  object
 1   label   13724 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 268.0+ KB


In [15]:
data.iloc[15].text

' search engine position  be the very first listing in the top search engines immediately .  our company will now place any business with a qualified website  permanently at the top of the major search engines guaranteed never to move  ( ex : yahoo ! , msn , alta vista , etc . ) . this promotion includes unlimited  traffic and is not going to last long . if you are interested in being  guaranteed first position in the top search engines at a promotional fee ,  please contact us promptly to find out if you qualify via email at  searchl 1 @ telefonica . net . pe it \' s very important to include the url ( s ) if you  are interested in promoting ! ! ! this is not pay per click . examples will  be provided .  this promotion is only valid in the usa and canada .  sincerely ,  the search engine placement specialists  if you wish to be removed from this list , please respond to the following  email address and type the word " remove " in your subject line :  search 6 @ speedy . com . pe'

In [16]:
data['text'] = data['text'].apply(lambda x: x.lower())

In [17]:
data['text'][0]

" naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  marketing 

In [18]:
def rem_special_chars(text):
    new_text = ""
    for i in text:
        if i.isalnum() or i == " ":
            new_text += i
    return new_text.strip()

In [19]:
data['text'] = data['text'].apply(rem_special_chars)

In [20]:
data

Unnamed: 0,text,label
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,4 color printing special request additional i...,1
4,do not have money get software cds from here ...,1
...,...,...
14294,this is the 2nd time we have tried 2 contact u...,1
14295,will ü b going to esplanade fr home,0
14296,pity was in mood for that soany other suggest...,0
14297,the guy did some bitching but i acted like id ...,0


In [21]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', max_features=10000)

In [22]:
X = cv.fit_transform(data['text']).toarray()
y = data['label'].values
y

array([1, 1, 1, ..., 0, 0, 0])

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10979, 10000)
(2745, 10000)
(10979,)
(2745,)


## Naive Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)

In [25]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9544626593806922

## SVM

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

X_train_list = [' '.join(map(str, row)) for row in X_train]
X_test_list = [' '.join(map(str, row)) for row in X_test]

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_list)
X_test_tfidf = tfidf_vectorizer.transform(X_test_list)

svm_classifier = SVC(kernel = 'linear')
svm_classifier.fit(X_train_tfidf, y_train)

In [27]:
# Predictions
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")

SVM Accuracy: 0.8324225865209471


## Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

In [29]:
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")

Random Forest Accuracy: 0.8331511839708561


## Gradient Boosting

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting Model
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
gb_classifier.fit(X_train_tfidf, y_train)

In [31]:
# Predictions
y_pred_gb = gb_classifier.predict(X_test_tfidf)

# Accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb}")

Gradient Boosting Accuracy: 0.8309653916211294


## K-Nearest Neighbors (KNN)

In [32]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Model
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_tfidf, y_train)

In [33]:
# Predictions
y_pred_knn = knn_classifier.predict(X_test_tfidf)

# Accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Nearest Neighbors Accuracy: {accuracy_knn}")

K-Nearest Neighbors Accuracy: 0.8306010928961749


#### Border

In [34]:
feature_names = cv.get_feature_names_out()
print(len(feature_names))

10000


In [35]:
len(cv.get_stop_words())

318

In [36]:
import pickle

pickle.dump(cv, open('dataset/cv_terbaik.pkl', 'wb'))
pickle.dump(clf, open('dataset/clf_terbaik.pkl', 'wb'))