In [1]:
import pandas as pd

import seaborn as sb
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the e-mail corpus; the two columns are named explicitly as label / text.
df = pd.read_csv(
    '../datasets/emails.csv',
    sep=',',
    names=["label", "text"],
)

In [3]:
# Sanity-check the load: shape is (number of rows, number of columns).
df.shape

(500, 2)

In [4]:
# Peek at the first rows to confirm the label / text columns look as expected.
df.head()

Unnamed: 0,label,text
0,good,fubarbell fubarbell home blog videos events se...
1,good,rygordonlaw com home search sitemap toll free ...
2,good,1 geek media providing seo search engine optim...
3,good,please log in capitol luggage welcome to capit...
4,good,pipl - people search search by name email user...


In [5]:
## Class balance: count how many e-mails carry each label ("good" vs "bad").
emails_by_label = df.groupby(by=["label"])
emails_by_label.count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
bad,180
good,320


In [6]:
# Simple example of CountVectorizer: it learns a vocabulary from the given
# sentences (lower-casing them) and turns each sentence into a row of per-word counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(["Hello good day", "Good day to you"])
print(cv.vocabulary_)

# `vocabulary_` maps each term to its column index, so ordering the terms by
# that index yields the column labels. Deriving them (instead of hard-coding
# the list) keeps the table correct if the example sentences are changed.
feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)

# toarray() gives a plain ndarray; todense() would return the legacy np.matrix type.
pd.DataFrame(X.toarray(), columns=feature_names)

{'day': 0, 'good': 1, 'hello': 2, 'you': 4, 'to': 3}


Unnamed: 0,day,good,hello,to,you
0,1,1,1,0,0
1,1,1,0,1,1


In [7]:
# Encode a new one-word document with the vocabulary learned above
# (transform only -- the vocabulary is not refitted).
day_counts = cv.transform(["day"])
day_counts.toarray()

array([[1, 0, 0, 0, 0]])

In [143]:

# NOTE(review): commented-out scratch cell, executed out of order (In [143]).
# It fits a CountVectorizer on the full corpus only to inspect the shape of the
# resulting document-term matrix; the training cell performs the same fit, so
# this cell is a candidate for deletion.
#from sklearn.feature_extraction.text import CountVectorizer
#count_vect = CountVectorizer()
#X_train_counts = count_vect.fit_transform(df.text)
#X_train_counts.shape

(500, 11351)

In [8]:
## Let's train a model that predicts if an email is good or bad.
from sklearn.feature_extraction.text import CountVectorizer
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Bag-of-words features: one column per vocabulary term, one row per e-mail.
vectorizer = CountVectorizer()
text_vectorized = vectorizer.fit_transform(df.text)
# GaussianNB is fitted on a dense array, so the sparse matrix is expanded here.
text_vectorized_array = text_vectorized.toarray()

gnb = GaussianNB()
# A fixed random_state makes the train/test split -- and therefore the error
# count printed below -- reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    text_vectorized_array, df.label, random_state=42
)
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# First %d is the size of the held-out set, second %d the number of wrong predictions.
print("Number of mislabeled email out of a total %d test emails : %d"
      % (X_test.shape[0],(y_test != y_pred).sum()))

Number of mislabeled email out of a total 125 test emails : 13


In [9]:
## Predicting a single email
email_text = ["This is a new emaiil text I wonder if the model is going to classify it as good or bad"]

## Use transform (NOT fit_transform): the text must be mapped onto the same
## features the model was fitted with. Words in email_text that never appeared
## in emails.csv have no column and are simply ignored.
email_text_vectorized = vectorizer.transform(email_text)  ## a single row with 11351 features

## The classifier was trained on dense arrays, so densify before predicting.
email_dense_row = email_text_vectorized.toarray()
gnb.predict(email_dense_row)

array(['good'], 
      dtype='<U4')

In [10]:
## Our corpus vocabulary has 11351 terms, so email_text_vectorized is a single row
## with 11351 columns. CountVectorizer stores how many TIMES each term occurs (not
## just a 0/1 flag), so a word repeated in the email (e.g. "is", if it is in the
## vocabulary) contributes more than 1. Words that never appeared in the corpus
## have no column and contribute nothing. The total below (15) is therefore the
## number of word occurrences in email_text that the vocabulary recognises; it is
## at most the 20 words of email_text. For the number of DISTINCT known words use
## email_text_vectorized.nnz instead.
## Summing the sparse matrix directly avoids building a dense 11351-element row and
## looping over it with Python's builtin sum().
email_text_vectorized.sum()

15

## Using TfidfVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-encode the corpus with TF-IDF weights instead of raw counts.
# NOTE: this rebinds text_vectorized / text_vectorized_array, replacing the
# count-based features created earlier in the notebook.
tfidVectorizer = TfidfVectorizer()
text_vectorized = tfidVectorizer.fit_transform(df["text"])
text_vectorized_array = text_vectorized.toarray()

In [11]:
# Fresh, unfitted classifier for the TF-IDF features (rebinds `gnb`).
gnb = GaussianNB()
# A fixed random_state makes the train/test split reproducible, so the error
# count reported for the TF-IDF model does not change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    text_vectorized_array, df.label, random_state=42
)

In [12]:
# Fit on the training split, then predict labels for the held-out e-mails.
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Size of the held-out set and how many of its predictions disagree with the truth.
n_test_emails = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled email out of a total %d test emails : %d"
      % (n_test_emails, n_mislabeled))


Number of mislabeled email out of a total 125 test emails : 12
