## Spam classifier 
The dataset is the SMS Spam Collection Data Set from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

### Importing the dataset

In [6]:
import pandas as pd

# The file is tab-separated and has no header row, so the column names are
# supplied explicitly.
# NOTE(review): the UCI description lists 5,574 messages, while this load
# yields 5,572 rows. Presumably the default quote handling merges lines that
# contain unbalanced double quotes; consider quoting=csv.QUOTE_NONE and
# re-running the notebook to confirm.
column_names = ["label", "message"]
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=column_names)
messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Sanity check: (number of messages, number of columns) of the loaded frame.
messages.shape

(5572, 2)

### Data cleaning and preprocessing

In [19]:
import nltk 
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Requires the NLTK 'stopwords' corpus to be available locally
# (nltk.download('stopwords') on a fresh environment).
ps = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# per-word filter rebuilds the whole list for every token and then scans it
# linearly; a set built up front gives the same filtering with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# One cleaned, stemmed, space-joined string per input message, in input order.
corpus = []

# Iterate over the message values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for message in messages["message"]:
    # replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', message)
    # lowercase the text and split it into a list of words
    words = review.lower().split()
    # drop stop words, then reduce each remaining word to its Porter stem
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    # join the stems back into a single string
    corpus.append(' '.join(stems))


In [12]:
# Sanity check: the corpus should hold one cleaned entry per message.
len(corpus)

5572

### Creating the Bag Of Words Model   

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split further down, so test messages influence the vocabulary.
cv = CountVectorizer(max_features=5000)
bow_counts = cv.fit_transform(corpus)

# Materialise the count matrix as a dense array (rows = messages,
# columns = vocabulary terms).
X = bow_counts.toarray()
X.shape

(5572, 5000)

### Alternative model for word vectorization: TF-IDF

In [32]:
##Uncomment this cell and comment out the cell above to use TF-IDF instead of Bag of Words
#from sklearn.feature_extraction.text import TfidfVectorizer
#cv = TfidfVectorizer()
#X = cv.fit_transform(corpus).toarray()

### Output data

In [38]:
# Encode the target explicitly: 1 where the label is "spam", 0 otherwise.
# Comparing against the label avoids relying on the positional column order
# of pd.get_dummies, and the explicit cast keeps the uint8 dtype regardless
# of the pandas version (get_dummies returns bool columns by default from
# pandas 2.0 onwards).
y = (messages["label"] == "spam").astype("uint8").values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

### Splitting the train and test data

In [39]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Training the model using a Naive Bayes classifier

In [40]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training features.
spam_detection_model = MultinomialNB()
spam_detection_model.fit(X_train, y_train)

# Predict labels for the held-out messages (1 = spam, 0 = ham).
y_pred = spam_detection_model.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [41]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_m

array([[946,   9],
       [  8, 152]], dtype=int64)

In [42]:
from sklearn.metrics import accuracy_score
# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9847533632286996