# Develop a simple Message Classifier

### libraries

In [2]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Read the raw message dataset from disk, decoding the file as latin-1
# (presumably because the CSV is not valid UTF-8 -- confirm), then preview
# three randomly chosen rows.
spam_data = pd.read_csv(
    './data/spam.csv',
    encoding="latin-1",
)
spam_data.sample(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [14]:
# Display the complete text (column v2) of the message at row position 2.
spam_data['v2'].iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [7]:
# List the distinct class labels found in column v1
spam_data['v1'].unique()

array(['ham', 'spam'], dtype=object)

**Observation:** we have two labels, so this is a binary (2-class) classification problem.  
Next, we need to encode those two labels in numerical format for model training.  

In [8]:
# Turn the text labels into integers: ham -> 0, spam -> 1.
# Series.map leaves any value that is missing from the dict as NaN.
spam_data['label'] = spam_data.v1.map(
    {
        'ham': 0,
        'spam': 1,
    }
)
# Preview three random rows to check the new column.
spam_data.sample(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,label
1975,ham,Delhi and chennai still silent.,,,,0
2139,ham,But i juz remembered i gotta bathe my dog today..,,,,0
3760,ham,Was just about to ask. Will keep this one. May...,,,,0


**Observation:** we created the new `label` column; from now on we keep only the following two columns:  
- **v2** for X   
- **label**  for y

In [10]:
# Features are the raw message texts (v2); targets are the integer labels.
X, y = spam_data['v2'], spam_data['label']

### Model Training.    
Since the goal is mainly to deploy the model with Streamlit, we won't spend time searching for the best model; instead, we train a very simple one (Naive Bayes in our case) so that we can move on to the deployment process.  

In [11]:
# Bag-of-words features: learn the vocabulary and count token occurrences.
# The texts are read straight from the dataframe column rather than from `X`:
# this cell rebinds `X` to the count matrix, so vectorising `X` itself would
# raise an error when the cell is run a second time (CountVectorizer expects
# raw strings, not a sparse matrix).
# NOTE(review): the vocabulary is learned from all messages, including the
# ones that end up in the test split -- fit on the training texts only if a
# strictly held-out evaluation is required.
cv = CountVectorizer()
X = cv.fit_transform(spam_data['v2'])

# Hold out 33% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2021)

# Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split. The report already
# contains the overall accuracy, so the separate clf.score(...) call, whose
# result was computed and then discarded, is not needed.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1594
           1       0.95      0.93      0.94       245

    accuracy                           0.98      1839
   macro avg       0.97      0.96      0.96      1839
weighted avg       0.98      0.98      0.98      1839



In [12]:
# Persist the model together with the fitted vectorizer.
# The classifier only accepts count vectors, so whoever loads it also needs
# the exact vocabulary learned by `cv` to turn raw text into features.
os.makedirs('./models', exist_ok=True)  # joblib.dump does not create missing directories
joblib.dump(cv, './models/spam_vectorizer.pkl')
# The model dump stays last so the cell still displays the model path.
joblib.dump(clf, './models/spam_detector_model.pkl')

['./models/spam_detector_model.pkl']