<strong>Four (4) Steps of Text analysis</strong>

<ol>
<li>Import</li>
<li>Instantiate</li>
<li>Fit or train</li>
<li>Transform</li>
</ol>

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [25]:
# Toy corpus: four short SMS-style messages used to demonstrate vectorization
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me...',
    'PLEASE!',
]

In [26]:
# Instantiate CountVectorizer and learn the vocabulary of the training data.
# fit() returns the fitted vectorizer itself, so both steps can be chained.
vect = CountVectorizer().fit(simple_train)

# Display the fitted vectorizer as the cell output
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
# Examine the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() keeps the displayed result a plain Python list of str.
vect.get_feature_names_out().tolist()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [28]:
# transform the training data into a 'document-term matrix' (dtm);
# the bare name on the last line displays a summary of the resulting matrix
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

<strong>Note: </strong>4x6 matrix means 4 documents (or rows/records) and 6 terms (or columns/features)

In [29]:
# convert the sparse matrix to a dense array so every cell, including the zeros, is visible
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0]])

In [30]:
# Examine the vocabulary and document-term matrix together:
# one column per vocabulary term, one row per document.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (requires scikit-learn >= 1.0).
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,1,0,0
3,0,0,0,1,0,0


In [31]:
# re-display the raw training messages for comparison with the document-term matrix
simple_train

['call you tonight', 'Call me a cab', 'please call me...', 'PLEASE!']

In [32]:
# a sparse matrix stores only the (row, column) coordinates of the non-zero
# entries together with their values; printing it lists exactly those entries
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (3, 3)	1


In [33]:
# Example text for model testing: a single message containing a word ("don't")
# that does not appear in the vocabulary learned from simple_train
simple_test = [
    "please don't call me",
]

In [34]:
# transform the testing data into a dtm using the vocabulary already fitted on
# simple_train (transform only, no re-fit), then show it as a dense array
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]])

In [35]:
# Show the test document-term matrix with the vocabulary terms as column labels.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (requires scikit-learn >= 1.0).
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


#### Reading a text-based dataset into pandas

In [36]:
# Load the SMS corpus. header=None tells pandas the file has no header row,
# so the two columns are named explicitly.
sms_path = '../dataBank/sms.tsv'
sms = pd.read_table(sms_path, header=None, names=['label', 'message'])

# (rows, columns) of the loaded frame
sms.shape

(5572, 2)

In [37]:
# summarize the frame: column dtypes, non-null counts and memory usage
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [38]:
# preview the first 10 rows of the raw data
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [39]:
# Class balance: number of messages per label
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [40]:
# Encode the string label as a numeric variable: ham -> 0, spam -> 1
label_to_num = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_to_num)

In [41]:
# preview the first 10 rows again to check the new label_num column
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [42]:
# Features (raw message text) and target (numeric label) for use with CountVectorizer
X, y = sms['message'], sms['label_num']

# Both should report the same number of rows
for series in (X, y):
    print(series.shape)

(5572,)
(5572,)


In [45]:
# Split X and y into training and testing sets.
# random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report the size of each of the four pieces
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [48]:
# Instantiate a fresh vectorizer for the SMS data
vect = CountVectorizer()

# Learn the training-data vocabulary and build the document-term matrix in one
# pass. fit_transform() is documented as equivalent to fit() followed by
# transform(), but avoids processing the training messages twice.
X_train_dtm = vect.fit_transform(X_train)

# Display a summary of the resulting sparse matrix
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [50]:
# transform the testing data using the vocabulary fitted on the training data
# (transform only, no re-fit), so the test matrix has the same columns as X_train_dtm
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

### Model building using multinomial Naive Bayes

Multinomial Naive Bayes is suitable for classification with <strong>discrete features</strong> (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [51]:
# import the multinomial Naive Bayes classifier and instantiate it with its default parameters
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [52]:
# train the model on the training document-term matrix; the %time magic reports how long the fit takes
%time nb.fit(X_train_dtm, y_train)

CPU times: user 4.49 ms, sys: 1.7 ms, total: 6.2 ms
Wall time: 6.08 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
# predict class labels (0 = ham, 1 = spam) for the test document-term matrix
y_pred_class = nb.predict(X_test_dtm)

In [54]:
# accuracy of the class predictions against the true test labels
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9885139985642498

In [55]:
# display the confusion matrix: rows are true classes, columns are predicted
# classes, both in the order ham (0), spam (1)
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [57]:
# False positives: ham (0) incorrectly classified as spam (1),
# i.e. the predicted label is greater than the true label
false_positive_mask = y_pred_class > y_test

# Display the message text of those rows
X_test[false_positive_mask]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [58]:
# False negatives: spam (1) incorrectly classified as ham (0),
# i.e. the predicted label is smaller than the true label
false_negative_mask = y_pred_class < y_test

# Display the message text of those rows
X_test[false_negative_mask]

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object