In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import streamlit as st
import joblib

In [2]:
!pip install streamlit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
data = pd.read_csv('mail_data.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.dropna(inplace=True)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
data.loc[data['Category'] == 'spam', 'Category',] = 1
data.loc[data['Category'] == 'ham', 'Category',] = 0

In [8]:
X_train, y_train = data['Message'], data['Category']
X_train, X_test, Y_train, Y_test = train_test_split(X_train, y_train, test_size=0.08, random_state=2)

In [9]:
Y_train

4622    0
498     0
622     0
1706    0
2064    1
       ..
3335    0
1099    0
2514    1
3606    0
2575    1
Name: Category, Length: 5126, dtype: object

In [10]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [11]:
print(X_train_features)

  (0, 856)	0.5959476512223884
  (0, 7425)	0.5756432594547182
  (0, 5847)	0.5598939496430254
  (1, 7328)	0.4426573964274953
  (1, 2042)	0.49734777646506134
  (1, 832)	0.6307708913803873
  (1, 7222)	0.39853193249786434
  (2, 7173)	0.3500372526404082
  (2, 2464)	0.4801445679332877
  (2, 4215)	0.30032026603791684
  (2, 7861)	0.7104173686703011
  (2, 3320)	0.22814472544109343
  (3, 7629)	0.3648990096354639
  (3, 6619)	0.3648990096354639
  (3, 7656)	0.2946027233894455
  (3, 6169)	0.3648990096354639
  (3, 5036)	0.3481794183761367
  (3, 1604)	0.3195970883315357
  (3, 7829)	0.3481794183761367
  (3, 927)	0.24930080208551733
  (3, 8002)	0.32711522453216274
  (4, 339)	0.22400789377728822
  (4, 1526)	0.2991024707397885
  (4, 1133)	0.2588133105121478
  (4, 248)	0.3245221006738148
  :	:
  (5123, 727)	0.2884770073267168
  (5123, 1186)	0.2473433851776942
  (5123, 7791)	0.18235805328605972
  (5123, 4982)	0.384714168441942
  (5123, 2422)	0.25094301768178684
  (5123, 1692)	0.2076044427855003
  (5123, 6250

In [12]:
model = SGDClassifier()
model.fit(X_train_features, Y_train)

SGDClassifier()

In [13]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
accuracy_on_training_data

0.9996098322278579

In [14]:
input_data = ["Hi my name is ishan"]
input_data = feature_extraction.transform(input_data)
print(f'This email is {"Spam" if model.predict(input_data) else "Not Spam"}')

This email is Not Spam


In [21]:
joblib.dump(feature_extraction, 'feature_extraction.pkl')

['feature_extraction.pkl']

In [16]:
joblib.dump(model, 'model.pkl')

['model.pkl']

In [23]:
c = joblib.load('model.pkl')
c1 = joblib.load('feature_extraction.pkl')

In [24]:
c.predict(c1.transform(['hi my name is ishan']))

array([0])

In [27]:
!streamlit run stream_app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.23.236.35:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


In [20]:
__name__

'__main__'