In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Read the SMS dataset: a tab-separated file parsed without a header row,
# so the two columns are named explicitly (label first, then message text).
url = (
    'https://raw.githubusercontent.com/justmarkham/'
    'pycon-2016-tutorial/master/data/sms.tsv'
)
sms = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
sms

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Convert labels to binary values (0 for ham, 1 for spam).
# The conversion is guarded so that re-running this cell is a no-op: once the
# column is numeric, comparing it with 'spam' again would silently turn every
# label into 0 and the classifier would be trained on a single class.
if not pd.api.types.is_numeric_dtype(sms['label']):
    sms['label'] = np.where(sms['label'] == 'spam', 1, 0)

# Split the dataset into training and testing sets.
# random_state is fixed so the split (and the metrics below) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(sms['message'],
                                                    sms['label'],
                                                    random_state=42)

# Create a count vectorizer to convert messages into a matrix of token counts.
# The vocabulary is learned from the training messages only.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Train the Naive Bayes classifier on the training data
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Convert the test messages into a matrix of token counts, reusing the
# vocabulary fitted on the training set (transform, not fit_transform)
X_test_counts = vectorizer.transform(X_test)

# Predict the labels of the test messages
y_pred = clf.predict(X_test_counts)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Print the classification report (per-class precision, recall and F1)
report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)

Accuracy: 0.9885139985642498
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.98      0.94      0.96       186

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



# Streamlit

In [5]:
# Import necessary libraries
!pip install streamlit
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset again so this section starts from the raw ham/spam labels
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(
    url,
    header=None,                  # parse the first line as data, not column names
    names=['label', 'message'],   # column names supplied explicitly
)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting streamlit
  Downloading streamlit-1.21.0-py2.py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting validators>=0.2
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pympler>=0.9
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB

In [6]:
# Convert labels to binary values
# (0 for ham, 1 for spam)
# The conversion is guarded so that re-running this cell is a no-op: once the
# column is numeric, comparing it with 'spam' again would silently turn every
# label into 0 and the classifier would be trained on a single class.
if not pd.api.types.is_numeric_dtype(sms['label']):
    sms['label'] = np.where(sms['label'] == 'spam', 1, 0)

# Train the Naive Bayes classifier on the full dataset.
# `vectorizer` and `clf` are module-level names that the prediction helper
# defined later in the notebook looks up at call time.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sms['message'])
y = sms['label']
clf = MultinomialNB()
clf.fit(X, y)


In [8]:
# Prediction helper used by the app
def predict(message):
  """Return the predicted label for a single message.

  The message is vectorized with the module-level `vectorizer` and classified
  by the module-level `clf`. Following the label encoding used for training,
  1 stands for spam and 0 for ham.
  """
  # The vectorizer expects an iterable of documents, so wrap the message in a list.
  features = vectorizer.transform([message])
  predicted_labels = clf.predict(features)
  # Exactly one document went in, so the first entry is its label.
  return predicted_labels[0]

# Create a Streamlit app: a title, a text box for the message and a button
# that triggers classification.
# NOTE(review): Streamlit widgets presumably only become interactive when this
# code is executed with `streamlit run`, not from a notebook kernel -- confirm
# how the app is meant to be launched.
st.title('Spam Classifier')
message = st.text_input('Enter a message')
if st.button('Predict'):
  if not (message or '').strip():
    # An empty or whitespace-only input carries no tokens; classifying it would
    # just report a class for a blank message, so ask for real input instead.
    st.warning('Please enter a message to classify')
  else:
    prediction = predict(message)
    # Labels follow the training encoding: 1 = spam, 0 = ham.
    if prediction == 1:
      st.error('This is a spam message')
    else:
      st.success('This is a legitimate message')
