### PANDAS PART

In [2]:
import pandas as pd

In [3]:
# Read the labelled messages from spam.csv (resolved against the working
# directory) and preview the first rows.
df = pd.read_csv("spam.csv")
df.head()  # head() defaults to 5 rows, same as head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# List the distinct labels present in the Category column.
df["Category"].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
# Binary target column: 1 where Category is exactly 'spam', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# 'int64' cast keeps the integer dtype that the lambda-based version produced.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [6]:
# Preview again to check that the spam column lines up with Category.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### SKLEARN TRAIN_TEST_SPLIT PART

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df["Message"],
    df["spam"],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Number of messages in the training split.
x_train.shape[0]

4457

In [9]:
 # Number of messages in the held-out test split.
 # NOTE(review): the one-space indent on this cell is tolerated by IPython but
 # would raise IndentationError in a plain .py export — consider removing it.
 len(x_test)

1115

### COUNT VECTORIZER PART

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and encode each message
# as a sparse row of token counts.
v = CountVectorizer()
cv_messages = v.fit_transform(x_train.values)

# Slice the sparse matrix first and densify just those five rows; calling
# toarray() on the full matrix would allocate a dense rows-by-vocabulary array
# only to discard almost all of it.
cv_messages[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### NAIVE BAYES PART

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier; alpha=1.0 spells out the library's
# default additive smoothing instead of leaving it implicit.
model = MultinomialNB(alpha=1.0)

In [12]:
# Train the classifier on the count features and their 0/1 labels.
model.fit(X=cv_messages, y=y_train)

MultinomialNB()

In [13]:
# Two hand-written messages for a quick sanity check of the trained model.
email = [
    "Upto 30% discount on parking, exclusive offer just for you. Dont miss thi reward!",
    "Hi gowtham send me the project details ",
]
# Encode with the vocabulary already fitted on the training data
# (transform, not fit_transform), then classify.
email_count = v.transform(email)
model.predict(X=email_count)

array([1, 0], dtype=int64)

In [14]:
# Encode the held-out messages with the already-fitted vectorizer, then
# report the classifier's mean accuracy on them.
x_test_count = v.transform(x_test)
model.score(X=x_test_count, y=y_test)

0.9919282511210762

### SKLEARN PIPELINE PART

In [15]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed straight
# to fit/predict/score without a separate transform step.
clf = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)
clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [16]:
# Two sample messages, passed to the pipeline as raw text (it vectorizes them itself).
email = [
    "Upto 30% discount on parking, exclusive offer just for you. Dont miss thi reward!",
    "Hi gowtham send me the project details ",
]
clf.predict(X=email)

array([1, 0], dtype=int64)

In [17]:
# Mean accuracy of the whole pipeline on the held-out raw messages.
clf.score(X=x_test, y=y_test)

0.9919282511210762

In [21]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to spam_model.pkl,
# resolved against the working directory.
joblib.dump(value=clf, filename="spam_model.pkl")

['spam_model.pkl']