In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# SMS Spam Collection as a tab-separated file: one message per line,
# two fields (label, message text) and NO header row.
gist_url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'

# header=None is essential: with the default header inference pandas treats
# the first message as column names, so that sample silently disappears once
# the columns are renamed. names= assigns the column labels in the same step.
df = pd.read_csv(gist_url, sep='\t', header=None, names=['target', 'text'])

# Keep only rows with a recognised label. .copy() detaches the filtered frame
# from the original so the column assignment below operates on an independent
# object (no SettingWithCopyWarning).
df = df[df['target'].isin(['ham', 'spam'])].copy()

# Encode the label numerically: ham -> 0, spam -> 1.
df['target'] = df['target'].map({'ham': 0, 'spam': 1})

In [None]:
# Stratified 80/20 hold-out split on the label column; the fixed seed keeps
# the partition identical across runs.
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['target'], random_state=42)

In [None]:
# Preview the first 20 labelled rows (target: 0 = ham, 1 = spam).
print(df.head(n=20))

    target                                               text
0        0                      Ok lar... Joking wif u oni...
1        1  Free entry in 2 a wkly comp to win FA Cup fina...
2        0  U dun say so early hor... U c already then say...
3        0  Nah I don't think he goes to usf, he lives aro...
4        1  FreeMsg Hey there darling it's been 3 week's n...
5        0  Even my brother is not like to speak with me. ...
6        0  As per your request 'Melle Melle (Oru Minnamin...
7        1  WINNER!! As a valued network customer you have...
8        1  Had your mobile 11 months or more? U R entitle...
9        0  I'm gonna be home soon and i don't want to tal...
10       1  SIX chances to win CASH! From 100 to 20,000 po...
11       1  URGENT! You have won a 1 week FREE membership ...
12       0  I've been searching for the right words to tha...
13       0                I HAVE A DATE ON SUNDAY WITH WILL!!
14       1  XXXMobileMovieClub: To use your credit, click ...
15      

In [None]:
def training(df_train, n_estimators=100, random_state=42):
    """Fit a CountVectorizer and a RandomForestClassifier on the training split.

    Parameters
    ----------
    df_train : mapping / DataFrame
        Must expose a 'text' entry (raw messages) and a 'target' entry (labels).
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier(n_estimators=...)``.
    random_state : int or None, default 42
        Forwarded to ``RandomForestClassifier(random_state=...)``.

    Returns
    -------
    tuple
        ``(model, vectorizer)`` - the fitted classifier and the fitted
        vectorizer. The same vectorizer must be used to transform any text
        passed to the model later.
    """
    # The vocabulary is learned from the training messages only, so nothing
    # from the held-out split influences the features.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(df_train['text'])
    y_train = df_train['target']

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    return model, vectorizer

In [None]:
def testing(model, vectorizer, df_test):
    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['target']
    y_pred = model.predict(X_test)
    return y_test, y_pred


In [None]:
# training
model, vectorizer = training(df_train)

# testing / prediction: delegate to testing() so the evaluation steps
# (transform -> predict) are defined in exactly one place
y_test, y_pred = testing(model, vectorizer, df_test)


In [None]:
# `y_test` and `y_pred` are already bound by the training/evaluation cell
# directly above. Refitting here would only repeat the same seeded fit and
# produce the same predictions, so this cell just reports the metrics.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9730941704035875
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
def predict_sms(model, vectorizer, text):
    """Classify one message and return a readable label.

    ``text`` is wrapped in a one-element list, transformed with the fitted
    ``vectorizer`` and passed to ``model.predict``. Returns "SPAM" when the
    first prediction equals 1 and "HAM" for any other value.
    """
    features = vectorizer.transform([text])
    label = model.predict(features)[0]
    if label == 1:
        return "SPAM"
    return "HAM"

# Smoke-test the helper on a single hand-written message.
print(predict_sms(model, vectorizer, text="Your account has been selected for a prize!"))

HAM
