### Spam Detection with Machine Learning

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Download the SMS spam dataset (latin-1 encoded CSV) and preview its first rows.
SPAM_CSV_URL = "https://raw.githubusercontent.com/amankharwal/SMS-Spam-Detection/master/spam.csv"
data = pd.read_csv(SPAM_CSV_URL, encoding="latin-1")
data.head()

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label column ("class") and the text column ("message").
df = data.loc[:, ["class", "message"]]

In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
# Pull the raw messages (features) and their ham/spam labels out as NumPy arrays.
X, y = (np.array(df[column]) for column in ("message", "class"))

In [7]:
# Bag-of-words vectorizer with default settings; it is fitted on the messages in the next cell.
cv=CountVectorizer()

In [8]:
# Learn the vocabulary and build the sparse document-term count matrix.
# The messages are read from `df` instead of from `X` itself: the original
# `X = cv.fit_transform(X)` broke when the cell was re-run, because by then
# X was already a count matrix rather than raw text.
# NOTE(review): the vocabulary is fitted on every message before the
# train/test split, so test-set words leak into the features; consider
# fitting on the training messages only.
X = cv.fit_transform(df["message"])

In [9]:
# Hold out 33% of the messages for evaluation; the fixed random_state keeps the split reproducible.
# train_test_split is already imported in the import cell near the top, so the
# duplicate import that used to live here is dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [10]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
model = MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB()

In [12]:
# Classify a message typed in by the user.
sample = input('enter a message:')
# Vectorize with the already-fitted vocabulary (transform, not fit_transform).
# The result is stored under its own name: the original cell reused `data`,
# which overwrote the DataFrame loaded from the CSV and broke re-running earlier cells.
# The sparse row is passed straight to the model, so no .toarray() is needed.
sample_features = cv.transform([sample])
model.predict(sample_features)

enter a message:You won $40 cash price


array(['spam'], dtype='<U4')

### Theory behind it

In [13]:
# first we will import the necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# NOTE: this rebinds `cv`, replacing the vectorizer that was fitted on the SMS messages above.
cv=CountVectorizer()

In [15]:
# Toy corpus: five short sentences used to demonstrate CountVectorizer.
text = [
    "Hello my name is james",
    "james this is my python notebook",
    "james trying to create a big dataset",
    "james of words to try differnt",
    "features of count vectorizer",
]

In [16]:
# first we need to convert our text data to a sparse count matrix using CountVectorizer.fit_transform(text)
text_matrix=cv.fit_transform(text)

In [17]:
# then convert the sparse matrix to a dense NumPy array so it can be loaded into a DataFrame
text_array=text_matrix.toarray()

In [18]:
# One column per vocabulary term, one row per sentence.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
df = pd.DataFrame(data=text_array, columns=cv.get_feature_names_out())

In [19]:
df

Unnamed: 0,big,count,create,dataset,differnt,features,hello,is,james,my,name,notebook,of,python,this,to,try,trying,vectorizer,words
0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,1,0,1,0,1,1,0,0,0,0,0
2,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
3,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0


##### CountVectorizer converts all words to lowercase by default; you can change this with `lowercase=False`

In [20]:
# Two-sentence corpus; note the capitalised 'Hello'.
text = [
    "Hello my name is james",
    "james this is my python notebook",
]

In [21]:
# lowercase=False keeps each token's original casing, so 'Hello' stays capitalised in the vocabulary
cv=CountVectorizer(lowercase=False)

In [22]:
# Fit the case-preserving vectorizer on the two sentences and get their sparse count matrix
text_matrix=cv.fit_transform(text)

In [23]:
text_matrix

<2x8 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [24]:
# Densify the sparse counts so they can be shown as a DataFrame
text_array=text_matrix.toarray()

In [25]:
# Label the count columns with the (case-preserved) vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(text_array, columns=cv.get_feature_names_out())

In [26]:
df.head()

Unnamed: 0,Hello,is,james,my,name,notebook,python,this
0,1,1,1,1,1,0,0,0
1,0,1,1,1,0,1,1,1


#### `stop_words`: a parameter of CountVectorizer()

In [27]:
# Five-sentence toy corpus for the stop-word examples.
text = [
    "Hello my name is james",
    "james this is my python notebook",
    "james trying to create a big dataset",
    "james of words to try differnt",
    "features of count vectorizer",
]

In [28]:
# Custom stop-word list: 'is', 'to' and 'my' are excluded from the vocabulary
cv=CountVectorizer(stop_words=['is','to','my'])

In [29]:
# Fit on the five sentences and build the sparse count matrix (stop words removed)
cv_matrix=cv.fit_transform(text)

In [30]:
# Dense NumPy array of the counts, ready to be wrapped in a DataFrame
cv_array=cv_matrix.toarray()

In [31]:
# Columns are the vocabulary terms that survived the custom stop-word list.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(cv_array, columns=cv.get_feature_names_out())

In [32]:
df

Unnamed: 0,big,count,create,dataset,differnt,features,hello,james,name,notebook,of,python,this,try,trying,vectorizer,words
0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0
2,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0


### Scikit-learn's built-in stop-word list

In [34]:
# just pass 'english' as the value of stop_words to CountVectorizer to use scikit-learn's built-in English stop-word list

cv = CountVectorizer(stop_words='english')

In [35]:
# Vectorize the corpus with the built-in English stop words removed and tabulate the counts.
cv_matrix = cv.fit_transform(text)
cv_array = cv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(cv_array, columns=cv.get_feature_names_out())

In [36]:
df

Unnamed: 0,big,count,create,dataset,differnt,features,hello,james,notebook,python,try,trying,vectorizer,words
0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,1,0,1,1,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,0,0,1,0


### Using max_df (min_df is covered in the next section)

In [37]:
# An integer max_df is an absolute document count: terms that appear in more than 1 sentence are dropped
cv=CountVectorizer(max_df=1)

In [38]:
# Vectorize with the max_df filter and tabulate the counts of the remaining terms.
cv_matrix = cv.fit_transform(text)
cv_array = cv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(cv_array, columns=cv.get_feature_names_out())

In [39]:
df

Unnamed: 0,big,count,create,dataset,differnt,features,hello,name,notebook,python,this,try,trying,vectorizer,words
0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0


### min_df

In [44]:
# An integer min_df is an absolute document count: only terms that appear in at least 2 sentences are kept
cv=CountVectorizer(min_df=2)

In [45]:
# Vectorize with the min_df filter and tabulate the counts of the remaining terms.
cv_matrix = cv.fit_transform(text)
cv_array = cv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(cv_array, columns=cv.get_feature_names_out())

In [46]:
df

Unnamed: 0,is,james,my,of,to
0,1,1,1,0,0
1,1,1,1,0,0
2,0,1,0,0,1
3,0,1,0,1,1
4,0,0,0,1,0


#### Using a proportion (a float between 0 and 1) for max_df:

In [40]:
# Same five-sentence toy corpus, redefined for the proportion-based max_df example.
text = [
    "Hello my name is james",
    "james this is my python notebook",
    "james trying to create a big dataset",
    "james of words to try differnt",
    "features of count vectorizer",
]

In [42]:
# A float max_df is a proportion of documents: terms found in more than 75% of
# the sentences are dropped ('james' occurs in 4 of the 5, i.e. 80%).
cv = CountVectorizer(max_df=0.75)
cv_matrix = cv.fit_transform(text)
cv_array = cv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(data=cv_array, columns=cv.get_feature_names_out())

In [43]:
df

Unnamed: 0,big,count,create,dataset,differnt,features,hello,is,my,name,notebook,of,python,this,to,try,trying,vectorizer,words
0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0


#### max_features

In [47]:
# Four short documents with heavily overlapping vocabulary, for the max_features example.
text = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [48]:
# max_features=3 limits the vocabulary to the 3 terms with the highest total count across the corpus
cv = CountVectorizer(max_features=3)

In [49]:
# Vectorize with the capped vocabulary and tabulate the counts.
cv_matrix = cv.fit_transform(text)
cv_array = cv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(data=cv_array, columns=cv.get_feature_names_out())

In [50]:
df

Unnamed: 0,document,is,the
0,1,1,1
1,2,1,1
2,0,1,1
3,1,1,1


#### Binary

In [51]:
# A single document in which every word occurs twice (ignoring case).
text = [
    "This is the first document. Is this the first document?",
]

In [52]:
# binary=True records presence/absence only: every non-zero count is stored as 1
cv = CountVectorizer(binary=True)
# Sparse matrix of 0/1 indicators for the single document
count_matrix = cv.fit_transform(text)
# Dense copy for display in a DataFrame
count_array = count_matrix.toarray()

In [53]:
# Tabulate the binary indicators, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(data=count_array, columns=cv.get_feature_names_out())
df

Unnamed: 0,document,first,is,the,this
0,1,1,1,1,1


#### Vocabulary

In [55]:
# Two sentences that differ only in capitalisation.
text = [
    "hello my name is james",
    "Hello my name is James",
]

In [56]:
# Default vectorizer (lowercasing on), so both sentences map to the same five terms
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)
# vocabulary_ maps each term to its column index in the count matrix
print(cv.vocabulary_)

{'hello': 0, 'my': 3, 'name': 4, 'is': 1, 'james': 2}


In [57]:
# List the vocabulary terms in column order.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a NumPy array, so convert it to a plain list to keep the printed format.
print(cv.get_feature_names_out().tolist())

['hello', 'is', 'james', 'my', 'name']
