# All words from docs represented as tf-idf vectors used as features

In [1]:
import os
import nltk
import pathlib
import math
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
np.random.seed(42)

# Reading Filenames and Labels

In [2]:
directory=os.getcwd()+'\\fulltext'+'\course'
# print(directory)
data={}
for text_file in os.listdir(directory):
    data[text_file]=1
print("Total number of course docs: ",len(os.listdir(directory)))

directory=os.getcwd()+'\\fulltext'+'\\non-course'
# print(directory)
for text_file in os.listdir(directory):
    data[text_file]=0
print("Total number of non-course docs: ",len(os.listdir(directory)))
print("Total docs: ",len(data))

Total number of course docs:  230
Total number of non-course docs:  821
Total docs:  1051


In [3]:
{k: data[k] for k in list(data)[:5]}

{'http_^^cs.cornell.edu^Info^Courses^Current^CS415^CS414.html': 1,
 'http_^^cs.cornell.edu^Info^Courses^Fall-95^CS415^CS415.html': 1,
 'http_^^cs.cornell.edu^Info^Courses^Spring-96^CS432^cs432.html': 1,
 'http_^^simon.cs.cornell.edu^Info^Courses^Current^CS401^': 1,
 'http_^^simon.cs.cornell.edu^Info^Courses^Spring-96^CS515^': 1}

In [4]:
doc_titles = []
doc_labels = []
for key,value in data.items():
    doc_titles.append(key)
    doc_labels.append(value)

# Splitting Dataset into 85:15 ratio

In [5]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(doc_titles,doc_labels,test_size=0.15,stratify=doc_labels,shuffle=True)
print ('Total number of Training Data :',len(xtrain))
print ('Total number of Test Data :',len(xtest))

Total number of Training Data : 893
Total number of Test Data : 158


In [6]:
xtrain[:5]

['http_^^www.cs.wisc.edu^~krung^krung.html',
 'http_^^www.cs.wisc.edu^~milo^milo.html',
 'http_^^www.cs.washington.edu^education^courses^cse567',
 'http_^^www.cs.cornell.edu^Info^People^scl^sean.html',
 'http_^^www.cs.utexas.edu^users^emery^']

In [7]:
ytrain[:5]

[0, 0, 1, 0, 0]

In [8]:
print ('Total number of Training Data :',len(xtrain))
print ('Total number of Test Data :',len(xtest))

Total number of Training Data : 893
Total number of Test Data : 158


# Reading Documents data and cleaning it

In [9]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
def Pre_Processing(word):
    word=case_folding(word)
    word=remove_punctuation(word)
    word=lemmatization(word)
    return(word)

def lemmatization(wor):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer() 
    return(lemmatizer.lemmatize(wor))

def case_folding(wo):
    return(wo.lower())

def remove_punctuation(w):  
    import re
    w=re.sub('(\d)',"",w)
    w=w.replace("—","")
    w=w.replace("_","")
    w=w.replace("?","")
    w=w.replace(".","")
    w=w.replace("`","")
    w=w.replace(",","")
    w=w.replace("[","")
    w=w.replace("]","")
    w=w.replace("â€”","")
    w=w.replace(":","")
    w=w.replace("|","")
    w=w.replace(";","")
    w=w.replace("{","")
    w=w.replace("}","")
    w=w.replace("-","")
    w=w.replace("=","")
    w=w.replace("…","")
    w=w.replace("Â","")
    w=w.replace("/","")
    w=w.replace("â","")
    w=w.replace("'","")
    w=w.replace("–","")
    w=w.replace('"',"")
    w=w.replace("$","")
    w=w.replace("â–","")
    w=w.replace("%","")
    w=w.replace("(","")
    w=w.replace("&","")
    w=w.replace(")","")
    w=w.replace("ã©","")
    w=w.replace("!","")
    return(w)

In [11]:
def read_doc_data(doc_name,doc_label,corpus):
    directory=os.getcwd()
    i=0
    doc_data={}
    for doc,label in zip(doc_name,doc_label):
        wordlist=[]
        dir1=directory
        if label==1:
            dir1=directory+"\\fulltext\\course"
        if label==0:
            dir1=directory+"\\fulltext\\non-course"
        f=open(dir1+"\\"+doc,'r')
        text=f.read()
        soup = BeautifulSoup(text,features="lxml")
        lines=soup.get_text()
        lines=lines.replace("\n"," ")
        lines=lines.replace("("," ")
        lines=lines.replace(")"," ")
        lines=lines.replace(":"," ")
        lines=lines.replace(","," ")
        iii="  "
        for iii in lines:
            lines=lines.replace("  "," ")

        wordlist=lines.split(" ")
        wordlist2=[]
        for word in wordlist:
            if word not in stopword:
                word=Pre_Processing(word)
                if word=='' or word==' 'or word=='  ':
                    continue
                word=word.strip()
                if len(word)<=2:
                    continue
                wordlist2.append(word)
                if word not in corpus:
                    corpus.append(word)
        doc_data[doc]=wordlist2 
    return doc_data,corpus

In [12]:
xtrain_data, corpus = read_doc_data(xtrain,ytrain,[])
xtest_data, corpus = read_doc_data(xtest,ytest,corpus)

In [13]:
{k: xtrain_data[k] for k in list(xtrain_data)[:2]}

{'http_^^www.cs.wisc.edu^~krung^krung.html': ['krungs',
  'homepage',
  'updated',
  'november',
  'krungs',
  'homepage',
  'underconstruction',
  'try',
  'keep',
  'page',
  'short',
  'informative',
  'have',
  'good',
  'serf',
  'the',
  'year',
  'come',
  'the',
  'following',
  'web',
  'related',
  'topic',
  'research',
  'mathematical',
  'programming',
  'project',
  'pursuing',
  'course',
  'work',
  'old',
  'course',
  'work',
  'computer',
  'science',
  'department',
  'computer',
  'company',
  'favorite',
  'hobby',
  'personal',
  'information',
  'personal',
  'opinion',
  'life',
  'madisonwisconsin',
  'linked',
  'the',
  'following',
  'web',
  'page',
  'important',
  'link',
  'university',
  'madisonwisconsin',
  'whole',
  'computer',
  'science',
  'department',
  'unique',
  'entity',
  'electronic',
  'library',
  'system',
  'krung',
  'sinapiromsaran',
  'email',
  'krung@cswiscedu'],
 'http_^^www.cs.wisc.edu^~milo^milo.html': ['milo',
  'martin',
  

# Saving Data in files

In [14]:
def save_doc_data(data,filename):
    f=open(filename,'w')
    for doc in data:
        f.write(doc)
        f.write(":")
        for word in data[doc]:
            f.write(word)
            f.write(",")
        f.write("\n")
    f.close()
    
def save_corpus():
    f=open("corpus.txt",'w')
    for word in corpus:
        f.write(word)
        f.write("\n")
    f.close()

In [15]:
save_doc_data(xtrain_data,"train_doc_data.txt")
save_doc_data(xtest_data,"test_doc_data.txt")
save_corpus()

# Reading Data from Saved files(if needed)

In [16]:
def read_data_from_file():
    xtrain_data={}
    xtest_data={}
    f=open("train_doc_data.txt",'r')
    text=f.read()
    text=text.split('\n')
    for line in text:
        line=line.split(":")
        line1=line[1]
        line1=line1.removesuffix(",")
        line1=line1.split(',')
        xtrain_data[line[0]]=line1
    f.close()
    f=open("test_doc_data.txt",'r')
    text=f.read()
    text=text.split('\n')
    for line in text:
        line=line.split(":")
        line1=line[1]
        line1=line1.removesuffix(",")
        line1=line1.split(',')
        xtest_data[line[0]]=line1
    f.close()
    return xtrain_data, xtest_data

def read_corpus():
    corpus=[]
    f=open("corpus.txt",'r')
    text=f.read()
    text=text.split('\n')
    for word in text:
        if word!='' or word!=' 'or word!='  ':
            corpus.append(word)
    f.close()
    return corpus
    
# xtrain_data, xtest_data = read_data_from_file()
# corpus = read_corpus()

# Displaying Cleaned Text

In [17]:
xtrain_text=[]
for doc in xtrain_data:
    tmp=' '.join([item for item in xtrain_data[doc]])
    xtrain_text.append(tmp)

In [18]:
xtrain_text[:2]

['krungs homepage updated november krungs homepage underconstruction try keep page short informative have good serf the year come the following web related topic research mathematical programming project pursuing course work old course work computer science department computer company favorite hobby personal information personal opinion life madisonwisconsin linked the following web page important link university madisonwisconsin whole computer science department unique entity electronic library system krung sinapiromsaran email krung@cswiscedu',
 'milo martin home page milo martin milo@cswiscedu graduate student teaching assistant computer science department university wisconsinmadison west dayton street madison usa email milo@cswiscedu office csst office phone office hour tuesdaythursday appointment computer science gustavus adolphus college class compiler construction charles fischer advanced computer architecture mark hill java sitting mark hill james larus teaching algebraic langu

# Generating Doc index for faster access

In [19]:
# For Training
train_doc_index={}
for i in range(len(xtrain)):
    train_doc_index[xtrain[i]]=i
train_doc_index_invert={}
for i in range(len(xtrain)):
    train_doc_index_invert[i]=xtrain[i]
    
# For Testing
test_doc_index={}
for i in range(len(xtest)):
    test_doc_index[xtest[i]]=i
test_doc_index_invert={}
for i in range(len(xtest)):
    test_doc_index_invert[i]=xtest[i]

# Method 1: Using all words in the document

# Generating a dictionary to know which word is coming in which document how many times

In [20]:
len(xtrain_data)

893

There are 893 docs in training data so for each word there will be a list of length 893. Each number from 0-892 will represent a doc.

In [21]:
def generate_index(data): 
    Index={}
    i=0
    for doc in data:
        for word in data[doc]:
            if word not in Index:
                Index[word]=[]
                Index[word]=[0]*(len(data))
            Index[word][i]=Index[word][i]+1
        i=i+1
    return Index

In [22]:
Training_Index = generate_index(xtrain_data)

# Calculating tf-idf scores

In [23]:
def tf_idf_score(Index, data):
    idf_scores={}
    for word in Index:
        df=0
        for i in range(len(data)):
            if int(Index[word][i])>0:
                df+=1
        idf=math.log10(len(data)/df)
        idf_scores[word]=idf
    return idf_scores

In [24]:
idf_scores = tf_idf_score(Training_Index, xtrain)

In [25]:
{k: idf_scores[k] for k in list(idf_scores)[:3]}

{'krungs': 2.9508514588885464,
 'homepage': 0.9966089494492215,
 'updated': 0.843641489240678}

In [26]:
def tf_idf_docs(Index, idf_scores, data):
    doc_dict={}
    for i in range(len(data)):
        D=[]
        for word in Index:
            if int(Index[word][i]) ==0 or float(idf_scores[word])==0:
                D.append(0)
                continue
            D.append(int(Index[word][i])*float(idf_scores[word]))
        doc_dict[i]=D
    return doc_dict

In [27]:
train_tf_idf_docs_dict = tf_idf_docs(Training_Index, idf_scores, xtrain)
# Since we'll only use the training index(words appear in training data) and ignore the words that 
# are not in training data and will directly convert testing docs to tf-idf vectors based on training data
# test_tf_idf_docs_dict = tf_idf_docs(Training_Index, training_idf_scores, xtest)

Documents are vectorized on the basis of term-frequency-inverse-doc-frequency scoring

# Preparing Data for training

In [28]:
def to_doc_vectors(tf_idf_docs_dict):
    doc_vectors=[]
    i=0
    for doc in tf_idf_docs_dict:
        doc_vectors.append([])
        doc_vectors[i]=tf_idf_docs_dict[doc]
        i+=1
    return doc_vectors

In [29]:
train_tf_idf_doc_vectors = to_doc_vectors(train_tf_idf_docs_dict)
# test_tf_idf_doc_vectors = to_doc_vectors(test_tf_idf_docs_dict)

Data is ready to be fed to the model

# Importing Naive Bayes model and fitting the data

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB().fit(train_tf_idf_doc_vectors,ytrain)

Data has been fitted

# Preparing Testing data

In [31]:
Testing_Index = generate_index(xtest_data)

In [32]:
word_index={}
i=0
for word in idf_scores:
    word_index[word]=i
    i=i+1
test_tf_idf_doc_vectors=[]
for doc in xtest_data:
    D=[0]*(len(idf_scores))
    for word in xtest_data[doc]:
        if word in Training_Index:
            D[word_index[word]]=int(Testing_Index[word][test_doc_index[doc]])*float(idf_scores[word])
    test_tf_idf_doc_vectors.append(D)

In [33]:
predicted = clf.predict(test_tf_idf_doc_vectors)

In [34]:
print(predicted)

[0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1
 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 0 0 1 0 0]


# 1=course   0=Non-course

In [35]:
print('\n Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\n The value of Precision', metrics.precision_score(ytest,predicted))
print('\n The value of Recall', metrics.recall_score(ytest,predicted))


 Accuracy of the classifier is 0.930379746835443

 Confusion matrix
[[117   6]
 [  5  30]]

 The value of Precision 0.8333333333333334

 The value of Recall 0.8571428571428571


# Method 2: Using only Nouns from the dataset

In [36]:
# Library to identify the type of the word i.e [noun pronoun,verb....]

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Preparing Training Data

In [37]:
# Identifying tpe of each word in a doc
nouns_xtrain_data={}
for doc in xtrain_data:
    nouns_xtrain_data[doc]=nltk.pos_tag(xtrain_data[doc])
    
# Extracting only Nouns from the doc, dropping all other type of words.
for doc in xtrain_data:
    nouns_xtrain_data[doc] = [word for word,pos in nouns_xtrain_data[doc] if pos == 'NN']

In [38]:
# Generating Index
Noun_Training_Index = generate_index(nouns_xtrain_data)

In [39]:
# Generating Nouns IDF-scores
Noun_idf_scores = tf_idf_score(Noun_Training_Index, xtrain)

In [40]:
{k: Noun_idf_scores[k] for k in list(Noun_idf_scores)[:2]}

{'homepage': 1.0370376065048297, 'underconstruction': 2.9508514588885464}

In [41]:
nouns_train_tf_idf_docs_dict = tf_idf_docs(Noun_Training_Index, Noun_idf_scores, xtrain)

In [42]:
nouns_train_tf_idf_doc_vectors = to_doc_vectors(nouns_train_tf_idf_docs_dict)

# Preparing Testing Data

In [43]:
# Identifying tpe of each word in a doc
nouns_xtest_data={}
for doc in xtest_data:
    nouns_xtest_data[doc]=nltk.pos_tag(xtest_data[doc])
    
# Extracting only Nouns from the doc, dropping all other type of words.
for doc in xtest_data:
    nouns_xtest_data[doc] = [word for word,pos in nouns_xtest_data[doc] if pos == 'NN']

In [44]:
Noun_Testing_Index = generate_index(nouns_xtest_data)

In [45]:
word_index={}
i=0
for word in Noun_idf_scores:
    word_index[word]=i
    i=i+1
nouns_test_tf_idf_doc_vectors=[]
for doc in nouns_xtest_data:
    D=[0]*(len(Noun_idf_scores))
    for word in nouns_xtest_data[doc]:
        if word in Noun_Training_Index:
            D[word_index[word]]=int(Noun_Testing_Index[word][test_doc_index[doc]])*float(Noun_idf_scores[word])
    nouns_test_tf_idf_doc_vectors.append(D)

# Fitting Same Model and Testing

In [46]:
clf = MultinomialNB().fit(nouns_train_tf_idf_doc_vectors,ytrain)

In [47]:
predicted = clf.predict(nouns_test_tf_idf_doc_vectors)

In [48]:
predicted

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0])

In [49]:
print('\n Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\n The value of Precision', metrics.precision_score(ytest,predicted))
print('\n The value of Recall', metrics.recall_score(ytest,predicted))


 Accuracy of the classifier is 0.9113924050632911

 Confusion matrix
[[114   9]
 [  5  30]]

 The value of Precision 0.7692307692307693

 The value of Recall 0.8571428571428571


# Method 3: Using only Top Frequent terms in each class
Extracting top 50 nouns from each class which appears in most number of documents in each class

In [50]:
def count_words_in_class(data,class_name): 
    Index={}
    for doc in data:
        if doc in class_name:
            for word in data[doc]:
                if word not in stopword:
                    if word not in Index:
                        Index[word]=0
                    Index[word]=Index[word]+1
    return Index

In [51]:
# Seperating both classes documents to retrive most frequent words from each class
course_class_train = [doc for doc,doc_label in zip(xtrain,ytrain) if doc_label==1]
non_course_class_train = [doc for doc,doc_label in zip(xtrain,ytrain) if doc_label==0]
# course_class_test = [doc for doc,doc_label in zip(xtest,ytest) if doc_label==1]
# non_course_class_test = [doc for doc,doc_label in zip(xtest,ytest) if doc_label==0]

In [53]:
# Removing duplicate words in a doc
xtrain_data_unique={}
for doc in xtrain_data:
    xtrain_data_unique[doc] = list(set(xtrain_data[doc]))

In [54]:
# Extracting top words from course docs
course_words = count_words_in_class(xtrain_data_unique,course_class_train)
sorted_course_words = dict(sorted(course_words.items(), key=lambda item: item[1],reverse=True))
top_50_course_words = {k: sorted_course_words[k] for k in list(sorted_course_words)[:100]}

In [55]:
# Extracting top words from non-course docs
non_course_words = count_words_in_class(xtrain_data_unique,non_course_class_train)
sorted_non_course_words = dict(sorted(non_course_words.items(), key=lambda item: item[1],reverse=True))
top_50_non_course_words = {k: sorted_non_course_words[k] for k in list(sorted_non_course_words)[:100]}

In [56]:
# Combining both sets to form a corpus which will be our features
unique_course_words = [word for word in top_50_course_words if word not in top_50_non_course_words]
unique_non_course_words = [word for word in top_50_non_course_words if word not in top_50_course_words]
all_words = unique_course_words + unique_non_course_words

In [57]:
# Total number of words(features)
len(all_words)

116

# Preparing Training Data

In [58]:
def generate_noun_index(data): 
    Index={}
    i=0
    for doc in data:
        for word in data[doc]:
            if word in all_words:
                if word not in Index:
                    Index[word]=[]
                    Index[word]=[0]*(len(data))
                Index[word][i]=Index[word][i]+1
        i=i+1
    return Index

In [59]:
Top_words_training_index = generate_noun_index(xtrain_data)

In [60]:
# Generating Nouns IDF-scores
Top_words_idf_scores = tf_idf_score(Top_words_training_index, xtrain)

In [62]:
{k: Top_words_idf_scores[k] for k in list(Top_words_idf_scores)[:2]}

{'homepage': 0.9966089494492215, 'updated': 0.843641489240678}

In [63]:
top_words_train_tf_idf_docs_dict = tf_idf_docs(Top_words_training_index, Top_words_idf_scores, xtrain)

In [64]:
top_words_train_tf_idf_doc_vectors = to_doc_vectors(top_words_train_tf_idf_docs_dict)

# Preparing Testing Data

In [65]:
Top_words_testing_Index = generate_noun_index(xtest_data)

In [67]:
word_index={}
i=0
for word in Top_words_idf_scores:
    word_index[word]=i
    i=i+1
top_words_test_tf_idf_doc_vectors=[]
for doc in xtest_data:
    D=[0]*(len(Top_words_idf_scores))
    for word in xtest_data[doc]:
        if word in Top_words_training_index:
            D[word_index[word]]=int(Top_words_training_index[word][test_doc_index[doc]])*float(Top_words_idf_scores[word])
    top_words_test_tf_idf_doc_vectors.append(D)

# Fitting Same Model and Testing

In [70]:
clf = MultinomialNB().fit(top_words_train_tf_idf_doc_vectors,ytrain)
predicted = clf.predict(top_words_test_tf_idf_doc_vectors)

In [71]:
predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [72]:
print('\n Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\n The value of Precision', metrics.precision_score(ytest,predicted))
print('\n The value of Recall', metrics.recall_score(ytest,predicted))


 Accuracy of the classifier is 0.810126582278481

 Confusion matrix
[[118   5]
 [ 25  10]]

 The value of Precision 0.6666666666666666

 The value of Recall 0.2857142857142857


# Conclusion
Using all the data as features used in method 1 seems to be the best approach with this dataset using Naive Bayes Algorithm