# Text Classification using Naive Bayes and Support Vector Machine

## 0. Add Libraries and import dependencies

In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from os import getcwd, path

# Reference
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
# https://neptune.ai/blog/text-classification-tips-and-tricks-kaggle-competitions

In [2]:
# NLTK API 
# https://tedboy.github.io/nlps/api_nltk.html

# Run only the first time (downloads the required NLTK resources)
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [3]:
# Fix NumPy's global random seed so that results are reproducible
np.random.seed(seed=500)

## Parameters

In [4]:
# Tokenization mode used in Step III of the preprocessing:
# True  -> split each document into sentences (sent_tokenize)
# False -> split each document into words (word_tokenize)
sent_tokenizer = False

## Read in Input data

In [36]:
# Resolve the CSV location relative to the current working directory, then load it
data_dir = path.join(getcwd(), "data/corpus.csv")
Corpus = pd.read_csv(filepath_or_buffer=data_dir, encoding='latin-1')

In [6]:
# Preview the first 10 rows of the loaded corpus
Corpus.head(10)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2
5,an absolute masterpiece: I am quite sure any ...,__label__2
6,"Buyer beware: This is a self-published book, ...",__label__1
7,Glorious story: I loved Whisper of the wicked...,__label__2
8,A FIVE STAR BOOK: I just finished reading Whi...,__label__2
9,Whispers of the Wicked Saints: This was a eas...,__label__2


## 1. Data preprocessing

- I. Remove Blank rows in Data, if any
- II. Change all the text to lower case
- III. Word Tokenization
- IV. Remove Stop words
- V. Remove Non-alpha text
- VI. Word Lemmatization

In [7]:
# Example: split the first review into sentences to see what sent_tokenize produces
sent_tokenize(Corpus['text'][0])

[' Stuning even for the non-gamer: This sound track was beautiful!',
 'It paints the senery in your mind so well I would recomend it even to people who hate video game music!',
 'I have played the game Chrono Cross but out of all of the games I have ever played it has the best music!',
 'It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras.',
 'It would impress anyone who cares to listen!',
 '^_^']

In [8]:
# Step I : Remove blank rows, if any.
# The rows must be dropped on the DataFrame itself: calling dropna(inplace=True)
# on the Corpus['text'] column does not remove them from Corpus, and a leftover
# NaN would crash the .lower() call below. The index is renumbered so that
# positional and label-based access keep matching after rows are removed.
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)

# Step II : Change all the text to lower case, so that 'dog' and 'DOG' are
# treated as the same token.
Corpus['text'] = Corpus['text'].str.lower()

# Step III : Tokenization. Each entry in the corpus is broken into a list of
# sentences or a list of words, depending on the sent_tokenizer flag.
tokenize = sent_tokenize if sent_tokenizer else word_tokenize
Corpus['text'] = [tokenize(document) for document in Corpus['text']]

In [9]:
# Steps IV, V, VI : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a part-of-speech hint (noun, verb, adjective, ...);
# when none is given it treats the word as a noun.
# WordNet POS constants: https://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html
# defaultdict primer: https://www.geeksforgeeks.org/defaultdict-in-python/
#
# Map the first letter of a tag to a WordNet POS; any other letter falls back to NOUN.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [10]:
# Show the explicit tag-letter -> WordNet POS entries
tag_map

defaultdict(<function __main__.<lambda>()>, {'J': 'a', 'V': 'v', 'R': 'r'})

<center>
<img src="img/tagset.jpg" align="center" width="400" height="200">
</center>

### Note: NLTK word tagging is based on the Penn Treebank POS tag set
### https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

## Example of Word Tagging

In [11]:
# entry = Corpus['text'][0]
# pos_tag(entry)  

## Execute word tagging

In [12]:
# Build the loop invariants once instead of per token / per document:
# - stopwords.words('english') returns a list, so rebuilding it and scanning it
#   for every token is very slow; a set gives constant-time lookups.
#   List of stop words: https://gist.github.com/sebleier/554280, https://www.nltk.org/book/ch02.html
# - a single WordNetLemmatizer instance can be reused for the whole corpus.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

# One processed string per document, in the same order as Corpus['text']
text_final = []

for entry in Corpus['text']:

    # pos_tag yields (word, tag) pairs, where tag says whether the word is a
    # noun (N...), verb (V...), adjective (J...), adverb (R...) or something else.
    # POS tagging reference: https://www.nltk.org/book/ch05.html
    # Keep only alphabetic tokens that are not stop words, and lemmatize each one
    # using the first letter of its tag mapped through tag_map as the "pos" hint.
    # Lemmatization reference: https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
    lemma_words = [
        word_Lemmatized.lemmatize(word=word, pos=tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]

    # Store the list as its string representation, one entry per document
    text_final.append(str(lemma_words))

# Assign the whole column at once. This does not depend on the DataFrame index
# being exactly 0..n-1, unlike writing Corpus.loc[i, 'text_final'] with an
# enumerate() counter.
Corpus['text_final'] = text_final

In [13]:
# Compare the tokenized 'text' column with the cleaned, lemmatized 'text_final' column
Corpus.head()

Unnamed: 0,text,label,text_final
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2,"['stun', 'even', 'sound', 'track', 'beautiful'..."
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2,"['best', 'soundtrack', 'ever', 'anything', 're..."
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2,"['amaze', 'soundtrack', 'favorite', 'music', '..."
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2,"['excellent', 'soundtrack', 'truly', 'like', '..."
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2,"['remember', 'pull', 'jaw', 'floor', 'hear', '..."


## 2. Prepare Train and Test Data set

In [22]:
# Hold out 30% of the documents for testing; the rest is used for training
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['label'],
    test_size=0.3,
)

## 3. Label Encoder

In [23]:
# Training labels before encoding (still the raw label strings)
Train_Y

2150    __label__2 
8629    __label__2 
9048    __label__1 
7060    __label__2 
2735    __label__2 
           ...     
1531    __label__1 
8879    __label__2 
1242    __label__2 
3720    __label__1 
6341    __label__2 
Name: label, Length: 7000, dtype: object

In [24]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html?highlight=labelencoder
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [25]:
# Show the label strings known to the fitted encoder
Encoder.classes_

array(['__label__1 ', '__label__2 '], dtype=object)

In [26]:
# Training labels after encoding (now integers)
Train_Y

array([1, 1, 0, ..., 1, 0, 1])

## 4. Word Vectorization

It is a general process of turning a collection of text documents into numerical feature vectors.
The most popular method is TF-IDF, short for “Term Frequency — Inverse Document Frequency”; these are the two components of the score assigned to each word.

- Term Frequency: This summarizes how often a given word appears within a document.
- Inverse Document Frequency: This down scales words that appear a lot across documents.

Reference: https://en.wikipedia.org/wiki/Tf%E2%80%93idf

How TFIDF is calculated ? https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a

In [27]:
# Training documents: each entry is the string form of its lemmatized word list
Train_X

2150    ['read', 'discover', 'need', 'use', 'think', '...
8629    ['pocket', 'book', 'guide', 'lao', 'excellent'...
9048    ['hop', 'simple', 'game', 'work', 'exactly', '...
7060    ['difficult', 'parent', 'quality', 'text', 'go...
2735    ['threesome', 'tank', 'suppose', 'victimize', ...
                              ...                        
1531    ['guess', 'miss', 'cool', 'buy', 'thinking', '...
8879    ['essential', 'cd', 'mixer', 'double', 'volume...
1242    ['cool', 'tool', 'playtime', 'fun', 'daughter'...
3720    ['nasty', 'book', 'nasty', 'woman', 'woman', '...
6341    ['fun', 'game', 'lot', 'way', 'show', 'age', '...
Name: text_final, Length: 7000, dtype: object

In [28]:
# Create new Class TfidfVectorizer with max 5000 features
Tfidf_vect = TfidfVectorizer(max_features=5000)

# Learn vocabulary and idf from training set
Tfidf_vect.fit(Corpus['text_final'])

# Transfor both the train and the test to document-term matrix
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [30]:
# Check learned vocabulary
# print(Tfidf_vect.vocabulary_)

In [32]:
# Check vectorized data - (row number, unique integer assigned to the word)  score calculated by the TF-IDF vectorizer
#print(Train_X_Tfidf) 

<center> 
    <b>
        Confusion Matrix
    </b>
</center> 
<center> 
<img src="img/Confusion_Matrix.png" align="center" width="400" height="200">
</center>

## 4. ML Prediction: Naive Bayes

Refer to: https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/

In [33]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

# Print Confusion Matrix
Pred_Y = Naive.predict(Test_X_Tfidf)
confusion_matrix(Test_Y, Pred_Y)/len(Pred_Y)

Naive Bayes Accuracy Score ->  82.56666666666666


array([[0.42966667, 0.07733333],
       [0.097     , 0.396     ]])

## 5. ML Prediction: Support Vector Machine

Reference: https://towardsdatascience.com/support-vector-machine-simply-explained-fee28eba5496

In [34]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# Linear-kernel SVM. The degree and gamma arguments were dropped because the
# linear kernel does not use them.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict the labels of the held-out set (done once and reused below, since
# SVC prediction is comparatively expensive)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy in percent; accuracy_score expects (y_true, y_pred)
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM)*100)

# Confusion matrix as fractions of all test samples
# (rows = true class, columns = predicted class)
Pred_Y = predictions_SVM
confusion_matrix(Test_Y, Pred_Y)/len(Pred_Y)

SVM Accuracy Score ->  85.16666666666667


array([[0.436     , 0.071     ],
       [0.07733333, 0.41566667]])

## Next Steps

- Improve the data pre-processing steps and see how that affects the accuracy.
- Try other Word Vectorization techniques such as Count Vectorizer and Word2Vec.
- Try Parameter tuning with the help of GridSearchCV on these Algorithms.
- Try other classification Algorithms Like Linear Classifier, Boosting Models and even Neural Networks.