# Ham/Spam Classification

In [213]:
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import numpy as np

In [214]:
# Load the three toy corpora (example 13.1, example 13.10, spam detection).
# Column names are passed explicitly, so the first line of each file is read as data.
df, df2, df3 = (
    pd.read_csv(csv_path, sep=",", names=['label', 'message'])
    for csv_path in ("example_131.csv", "example_1310.csv", "example_spam.csv")
)

In [215]:
# Snapshot each frame as a NumPy array of (label, message) rows.
# `.values` replaces `DataFrame.get_values()`, which was deprecated in pandas 0.25
# and removed in pandas 1.0; for these all-string frames the result is the same.
values, values2, values3 = df.values, df2.values, df3.values
print("Messages for the example 13.1")
print(df['message'])
print("\n Messages for the example 13.10")
print(df2['message'])
print("\n Messages for the example spam detection")
print(df3['message'])

Messages for the example 13.1
0                Chinese Beijing Chinese
1               Chinese Chinese Shanghai
2                          Chinese Macao
3                    Tokyo Japan Chinese
4    Chinese Chinese Chinese Tokyo Japan
Name: message, dtype: object

 Messages for the example 13.10
0            Taipei Taiwan
1    Macao Taiwan Shanghai
2            Japan Sapporo
3            Sapporo Osaka
4    Taiwan Taiwan Sapporo
Name: message, dtype: object

 Messages for the example spam detection
0    send us your password
1      send us your review
2     review your password
3                review us
4       send your password
5    send us your password
6            review us now
Name: message, dtype: object


#### Changing labels by numbers (currently disabled) and inspecting the raw values

In [216]:
# The labels in these examples are strings such as 'China' or 'spam' rather than
# '0' / '1', so no numeric re-mapping is applied and the labels are kept as loaded.
# All three snapshots are rebuilt here so the cell does not depend on arrays created
# by an earlier cell. `.values` replaces `get_values()`, which was removed in pandas 1.0.
values, values2, values3 = df.values, df2.values, df3.values
print("Values for the example 13.1")
print(values.shape)
print(values)
print("\n Values for the example 13.10")
print(values2.shape)
print(values2)
print("\n Values for the example spam detection")
print(values3.shape)
print(values3)

Values for the example 13.1
(5, 2)
[['China' 'Chinese Beijing Chinese']
 ['China' 'Chinese Chinese Shanghai']
 ['China' 'Chinese Macao']
 ['Not China' 'Tokyo Japan Chinese']
 ['?' 'Chinese Chinese Chinese Tokyo Japan']]

 Values for the example 13.10
(5, 2)
[['China' 'Taipei Taiwan']
 ['China' 'Macao Taiwan Shanghai']
 ['Not China' 'Japan Sapporo']
 ['Not China' 'Sapporo Osaka']
 ['?' 'Taiwan Taiwan Sapporo']]

 Values for the example spam detection
(7, 2)
[['spam' 'send us your password']
 ['ham' 'send us your review']
 ['ham' 'review your password']
 ['spam' 'review us']
 ['spam' 'send your password']
 ['spam' 'send us your password']
 ['?' 'review us now']]


#### Converting all characters in the messages to lower case

In [217]:
# Normalise case so that differently capitalised spellings map to the same token.
# The generator variable stays local, so no extra names are left in the namespace.
df['message'], df2['message'], df3['message'] = (
    frame['message'].str.lower() for frame in (df, df2, df3)
)
print("Messages in lower-case for the example 13.1", df['message'], sep="\n")
print("\n Messages in lower-case for the example 13.10", df2['message'], sep="\n")
print("\n Messages in lower-case for the example spam detection", df3['message'], sep="\n")

Messages in lower-case for the example 13.1
0                chinese beijing chinese
1               chinese chinese shanghai
2                          chinese macao
3                    tokyo japan chinese
4    chinese chinese chinese tokyo japan
Name: message, dtype: object

 Messages in lower-case for the example 13.10
0            taipei taiwan
1    macao taiwan shanghai
2            japan sapporo
3            sapporo osaka
4    taiwan taiwan sapporo
Name: message, dtype: object

 Messages in lower-case for the example spam detection
0    send us your password
1      send us your review
2     review your password
3                review us
4       send your password
5    send us your password
6            review us now
Name: message, dtype: object


#### Tokenizing the messages

First, we have to import NLTK and download the models its tokenizer needs.
Running `nltk.download()` opens an installation window: go to the "Models" tab, select "punkt" in the "Identifier" column, and click "Download" to install the necessary files.

In [218]:
import nltk
# Uncomment on the first run to open the NLTK downloader and fetch the "punkt" models:
#nltk.download()

Now we can apply the tokenization:

In [219]:
# Split every message string into a list of word tokens.
# The 'message' column is overwritten with lists, so re-running this cell would hand
# lists (not strings) to the tokenizer; re-load the data before running it again.
df['message'] = df['message'].apply(nltk.word_tokenize)
df2['message'] = df2['message'].apply(nltk.word_tokenize)
df3['message'] = df3['message'].apply(nltk.word_tokenize)
# Titles now match the frame printed beneath them (df is example 13.1, df2 is 13.10).
print("Messages after tokenization for the example 13.1")
print(df['message'])
print("\n Messages after tokenization for the example 13.10")
print(df2['message'])
print("\n Messages after tokenization for the example spam detection")
print(df3['message'])


 Messages after tokenization for the example 13.10
0                  [chinese, beijing, chinese]
1                 [chinese, chinese, shanghai]
2                             [chinese, macao]
3                      [tokyo, japan, chinese]
4    [chinese, chinese, chinese, tokyo, japan]
Name: message, dtype: object

 Messages after tokenization for the example 13.1
0             [taipei, taiwan]
1    [macao, taiwan, shanghai]
2             [japan, sapporo]
3             [sapporo, osaka]
4    [taiwan, taiwan, sapporo]
Name: message, dtype: object

 Messages after tokenization for the example spam detection
0    [send, us, your, password]
1      [send, us, your, review]
2      [review, your, password]
3                  [review, us]
4        [send, your, password]
5    [send, us, your, password]
6             [review, us, now]
Name: message, dtype: object


#### Stemming the messages using the Porter Stemmer algorithm

In [220]:
# Reduce every token to its Porter stem (e.g. "chinese" -> "chines", "beijing" -> "beij").
stemmer = PorterStemmer()
df['message'], df2['message'], df3['message'] = (
    frame['message'].apply(lambda tokens: list(map(stemmer.stem, tokens)))
    for frame in (df, df2, df3)
)
print("Messages after stemming for the example 13.1", df['message'], sep="\n")
print("\n Messages after stemming for the example 13.10", df2['message'], sep="\n")
print("\n Messages after stemming for the example spam detection", df3['message'], sep="\n")

Messages after stemming for the example 13.1
0                    [chines, beij, chines]
1                [chines, chines, shanghai]
2                           [chines, macao]
3                    [tokyo, japan, chines]
4    [chines, chines, chines, tokyo, japan]
Name: message, dtype: object

 Messages after stemming for the example 13.10
0             [taipei, taiwan]
1    [macao, taiwan, shanghai]
2             [japan, sapporo]
3             [sapporo, osaka]
4    [taiwan, taiwan, sapporo]
Name: message, dtype: object

 Messages after stemming for the example spam detection
0    [send, us, your, password]
1      [send, us, your, review]
2      [review, your, password]
3                  [review, us]
4        [send, your, password]
5    [send, us, your, password]
6             [review, us, now]
Name: message, dtype: object


#### Transforming data into occurrences

In [221]:
# Re-join the stemmed tokens into whitespace-separated strings for CountVectorizer.
# The isinstance guard keeps the cell safe to re-run: joining an already-joined string
# would otherwise insert a space between every single character.
df['message'], df2['message'], df3['message'] = (
    frame['message'].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )
    for frame in (df, df2, df3)
)

# One vectorizer per corpus, so each fitted vocabulary stays available afterwards
# (count_vectN.vocabulary_ maps every token to its column in the matching matrix).
# Refitting a single shared vectorizer would keep only the last corpus' vocabulary.
count_vect1, count_vect2, count_vect3 = CountVectorizer(), CountVectorizer(), CountVectorizer()
counts = count_vect1.fit_transform(df['message'])
counts2 = count_vect2.fit_transform(df2['message'])
counts3 = count_vect3.fit_transform(df3['message'])
# Keep the original name bound to the vectorizer fitted last (spam detection corpus).
count_vect = count_vect3

print("Counts and messages for the example 13.1")
print(counts.toarray())
print(df['message'])
print("\n Counts and messages for the example 13.10")
print(counts2.toarray())
print(df2['message'])
print("\n Counts and messages for the example spam detection")
print(counts3.toarray())
print(df3['message'])

Counts and messages for the example 13.1
[[1 2 0 0 0 0]
 [0 2 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]
 [0 3 1 0 0 1]]
0                  chines beij chines
1              chines chines shanghai
2                        chines macao
3                  tokyo japan chines
4    chines chines chines tokyo japan
Name: message, dtype: object

 Counts and messages for the example 13.10
[[0 0 0 0 0 1 1]
 [0 1 0 0 1 0 1]
 [1 0 0 1 0 0 0]
 [0 0 1 1 0 0 0]
 [0 0 0 1 0 0 2]]
0            taipei taiwan
1    macao taiwan shanghai
2            japan sapporo
3            sapporo osaka
4    taiwan taiwan sapporo
Name: message, dtype: object

 Counts and messages for the example spam detection
[[0 1 0 1 1 1]
 [0 0 1 1 1 1]
 [0 1 1 0 0 1]
 [0 0 1 0 1 0]
 [0 1 0 1 0 1]
 [0 1 0 1 1 1]
 [1 0 1 0 1 0]]
0    send us your password
1      send us your review
2     review your password
3                review us
4       send your password
5    send us your password
6            review us now
Name: message, dtype: 

#### Using Term Frequency Inverse Document Frequency

In [222]:
# Re-weight the raw term counts with TF-IDF, using one transformer per corpus.
# The IDF weights are fitted on every row of each matrix.
# NOTE: counts, counts2 and counts3 are overwritten, so running this cell a second
# time would apply the weighting to already-weighted values.
transformer = TfidfTransformer()
counts = transformer.fit_transform(counts)
transformer2 = TfidfTransformer()
counts2 = transformer2.fit_transform(counts2)
transformer3 = TfidfTransformer()
counts3 = transformer3.fit_transform(counts3)

print("Counts after transforming for the example 13.1", counts.toarray(), sep="\n")
print("\n Counts after transforming for the example 13.10", counts2.toarray(), sep="\n")
print("\n Counts after transforming for the example spam detection", counts3.toarray(), sep="\n")

Counts after transforming for the example 13.1
[[0.72391022 0.68989419 0.         0.         0.         0.        ]
 [0.         0.68989419 0.         0.         0.72391022 0.        ]
 [0.         0.43016528 0.         0.90275015 0.         0.        ]
 [0.         0.38537163 0.65249088 0.         0.         0.65249088]
 [0.         0.78157088 0.44110484 0.         0.         0.44110484]]

 Counts after transforming for the example 13.10
[[0.         0.         0.         0.         0.         0.83088075
  0.55645052]
 [0.         0.63907044 0.         0.         0.63907044 0.
  0.42799292]
 [0.83088075 0.         0.         0.55645052 0.         0.
  0.        ]
 [0.         0.         0.83088075 0.55645052 0.         0.
  0.        ]
 [0.         0.         0.         0.4472136  0.         0.
  0.89442719]]

 Counts after transforming for the example spam detection
[[0.         0.53189578 0.         0.53189578 0.46592583 0.46592583]
 [0.         0.         0.53189578 0.53189578 0.46

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


## Training the model

#### Splitting data into training and test sets 

In [223]:
# A random train_test_split is deliberately not used: in each example the final row
# is the unlabeled ('?') query document, so it is held out explicitly as the test item
# and all preceding rows form the training set.
# Labels are read from column 0 of `.values`; this replaces `get_values()`, which was
# deprecated in pandas 0.25 and removed in pandas 1.0.

# Example 13.1: 4 training documents, 1 query document
x_train = counts[:4]
x_test = counts[4:5]
y_train = pd.Series(df.values[0:4, 0])
y_test = pd.Series(df.values[4:5, 0])

# Example 13.10: 4 training documents, 1 query document
x_train2 = counts2[:4]
x_test2 = counts2[4:5]
y_train2 = pd.Series(df2.values[0:4, 0])
y_test2 = pd.Series(df2.values[4:5, 0])

# Spam detection: 6 training documents, 1 query document
x_train3 = counts3[:6]
x_test3 = counts3[6:7]
y_train3 = pd.Series(df3.values[0:6, 0])
y_test3 = pd.Series(df3.values[6:7, 0])



## Initializing the Multinomial Naive Bayes classifier

In [224]:
# A single classifier instance is shared by all three examples and refitted for each.
# alpha=1.0 is scikit-learn's default additive smoothing, spelled out for clarity.
model = MultinomialNB(alpha=1.0)

## Evaluating the model

In [225]:
# Each example refits the shared `model` from scratch and classifies its held-out
# query document. The columns of predict_proba follow the order of model.classes_.

# Example 13.1
predicted = model.fit(x_train, y_train).predict(x_test)
print("Predicted class: ", predicted, sep="")
print("Probabilities for each class: ", model.predict_proba(x_test), sep="")

# Example 13.10
predicted2 = model.fit(x_train2, y_train2).predict(x_test2)
print("Predicted class: ", predicted2, sep="")
print("Probabilities for each class: ", model.predict_proba(x_test2), sep="")

# Spam detection (after this, `model` stays fitted on the spam corpus)
predicted3 = model.fit(x_train3, y_train3).predict(x_test3)
print("Predicted class: ", predicted3, sep="")
print("Probabilities for each class: ", model.predict_proba(x_test3), sep="")

Predicted class: ['China']
Probabilities for each class: [[0.67804117 0.32195883]]
Predicted class: ['China']
Probabilities for each class: [[0.55857543 0.44142457]]
Predicted class: ['spam']
Probabilities for each class: [[0.41671911 0.58328089]]
