### Naive Bayes classifier Exercise

In [1]:
# Corpus of 18 short lyric lines, kept as two groups of nine.
# The order matters: the label vector built later assumes the first nine
# documents belong to one artist and the last nine to the other.
first_song_lines = [
    "Its close to midnight",
    "Something evil is lurking from the dark",
    "Under the moonlight",
    "You see a sight that almost stops your heart",
    "You try to scream",
    "But terror takes the sound before you make it",
    "You start to freeze",
    "As horror looks you right between your eyes",
    "You are paralyzed",
]

second_song_lines = [
    "I could hardly believe it",
    "When I heard the news today",
    "I had to come and get it straight from you",
    "They said you were leaving",
    "Someone is swept your heart away",
    "From the look upon your face I see its true",
    "So tell me all about it",
    "Tell me about the plans you are making",
    "Then tell me one thing more before I go",
]

# Full corpus: first group followed by second group
docs = first_song_lines + second_song_lines

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd
import numpy as np

In [3]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words
vect = CountVectorizer(stop_words="english")

# Learn the vocabulary from the corpus and count every remaining term per document
word_count = vect.fit_transform(docs)

In [4]:
# Display the vectorizer created above; its repr shows how it was configured
vect 

CountVectorizer(stop_words='english')

In [5]:
# Check the shape of the count matrix returned by fit_transform: (documents, vocabulary terms)
word_count.shape # <-- Sparse Matrix in Compressed Sparse Row format

(18, 40)

### Compute IDF (Inverse Document Frequency)

In [6]:
# Instantiate TfidfTransformer() and learn the IDF weights from the raw term counts
tfidf_tran = TfidfTransformer()
tfidf_tran.fit(word_count)

# One IDF weight per vocabulary term, displayed in ascending order
# (terms that occur in more documents get a lower IDF and appear first).
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
idf_df = pd.DataFrame(
    tfidf_tran.idf_,
    index=vect.get_feature_names_out(),
    columns=["idf_weights"],
)
idf_df.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
tell,2.558145
heart,2.845827
away,3.251292
paralyzed,3.251292
plans,3.251292
right,3.251292
said,3.251292
scream,3.251292
sight,3.251292
sound,3.251292


### Compute the TF-IDF score for the corpus

In [7]:
# Apply the fitted transformer to the term counts --> TF-IDF (term frequency * inverse document frequency)
tfidf_mat=tfidf_tran.transform(word_count)

In [8]:
# Shape of the TF-IDF matrix computed from word_count
tfidf_mat.shape

(18, 40)

In [9]:
# tfidf_mat --> dense array
# .toarray() yields a plain numpy ndarray, whereas .todense() yields np.matrix,
# which scikit-learn >= 1.2 rejects with a TypeError when passed to an estimator.
# Recomputing from word_count (instead of converting tfidf_mat in place) keeps
# this cell safe to re-run: the result never depends on its own previous output.
tfidf_mat = tfidf_tran.transform(word_count).toarray()

In [10]:
# tf_idf_mat --> dataframe
# Row labels 'doc1' ... 'docN' are generated from the number of rows in the
# matrix, so they cannot drift out of sync with the corpus size.
indexs = [f'doc{i}' for i in range(1, tfidf_mat.shape[0] + 1)]

# Column labels are the vocabulary terms. get_feature_names_out() replaces
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2.
tfid_df = pd.DataFrame(tfidf_mat, columns=vect.get_feature_names_out(), index=indexs)

In [11]:
# Display the TF-IDF table: one row per document label, one column per vocabulary term
tfid_df

Unnamed: 0,away,believe,close,come,dark,evil,eyes,face,freeze,hardly,...,stops,straight,swept,takes,tell,terror,thing,today,true,try
doc1,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc2,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.601261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc10,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Naive Bayes in scikit-learn

In [12]:
from sklearn.naive_bayes import  MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# Target labels: nine 'Jackson' entries followed by nine 'Bolton' entries.
# Assumes the feature rows are in the same order (first nine documents by
# Jackson, last nine by Bolton).
artist = [name for name in ('Jackson', 'Bolton') for _repeat in range(9)]

# Label vector as a numpy array for scikit-learn
y = np.array(artist)

In [14]:
# Features variable: the TF-IDF matrix, one row per document
X = tfidf_mat

In [15]:
# Hold out 20% of the documents for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=200,
)

In [16]:
# Multinomial Naive Bayes classifier with smoothing parameter alpha = 0.8
model = MultinomialNB(alpha=0.8)

# Train on the training split; fit() stays the last expression so the cell displays the estimator
model.fit(X_train, y_train)

MultinomialNB(alpha=0.8)

In [17]:
# Predict artist labels for the held-out test documents
y_pred = model.predict(X_test)

In [18]:
# Share of held-out documents whose label was predicted correctly.
# The '.2' format spec prints the score with two significant digits.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy test: {test_accuracy:.2}')

Accuracy test: 0.75


#### What if I want to predict another doc?

In [19]:
new_doc = ['It is just a dream of mine is coming to an end']

# Reuse the vocabulary and IDF weights learned from the corpus: transform only, never re-fit
new_doc_vect = vect.transform(new_doc)  # sparse term counts

# Keep the counts sparse for TfidfTransformer and densify once at the end.
# .toarray() gives a plain ndarray; .todense() gives np.matrix, which
# scikit-learn >= 1.2 rejects with a TypeError.
new_doc_tfidf = tfidf_tran.transform(new_doc_vect).toarray()

In [20]:
# Predict the artist label for the new document's TF-IDF vector
model.predict(new_doc_tfidf)

array(['Bolton'], dtype='<U7')

In [21]:
new_doc = ['The evil of the thriller']

# Reuse the vocabulary and IDF weights learned from the corpus: transform only, never re-fit
new_doc_vect = vect.transform(new_doc)  # sparse term counts

# Keep the counts sparse for TfidfTransformer and densify once at the end.
# .toarray() gives a plain ndarray; .todense() gives np.matrix, which
# scikit-learn >= 1.2 rejects with a TypeError.
new_doc_tfidf = tfidf_tran.transform(new_doc_vect).toarray()

In [22]:
# Predict the artist label for the new document's TF-IDF vector
model.predict(new_doc_tfidf)

array(['Jackson'], dtype='<U7')