<a href="https://colab.research.google.com/github/futurexskill/ai/blob/master/Decision_Tree_and_Random_Forest_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NLP Text Classification

### Import the Libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk


### Download NLTK

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

### Read the restaurant review file

In [0]:
# quoting = 3 will ignore double quotes
dataset = pd.read_csv('https://raw.githubusercontent.com/futurexskill/ai/master/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Sample sentence
dataset['Review'][0]

'Wow... Loved this place.'

In [6]:
# Sample sentence
dataset['Review'][6]

"Honeslty it didn't taste THAT fresh.)"

### Import Stop Words 

In [0]:
from nltk.corpus import stopwords


### Import Stemmer Class

In [0]:
from nltk.stem.porter import PorterStemmer


Instantiate the Stemmer

In [0]:
ps = PorterStemmer()

### Create a Corpus of clean text

Loop through all 1000 reviews
Apply Regular expression , Stemming and Stopwords to get a corpus of clean words

In [0]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

Now you have the corpus of clean text

In [11]:
## Sample sentence after cleansing, stemming and applying stop words
corpus[0]

'wow love place'

In [12]:
## Sample sentence after cleansing, stemming and applying stop words
corpus[6]

'honeslti tast fresh'

### Create the Tf-Idf model for all reviews

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 1500, min_df = 3, max_df = 0.6)


Store the featurized TF-IDF array in X 

In [0]:
X = vectorizer.fit_transform(corpus).toarray()

In [0]:
# TF-IDF vector for sample sentences
#X[0]

Store the Last column "Liked" in y

In [0]:
y = dataset.iloc[:, 1].values

### Train Test split

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Build a KNN Classifier

In [18]:
# Training the KNN model
from sklearn.neighbors import KNeighborsClassifier
# minkowski is for ecledian distance
classifierKNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifierKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Build a Naive Bayes Classifier

In [19]:
#from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
#classifier = GaussianNB()
classifierNB = MultinomialNB()
classifierNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Build a Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
classifierDT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifierDT.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

### Build a Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
#n_estimators is the number of trees you want in the forest
classifierRF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifierRF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

### Evaluate all models

In [0]:
y_pred_knn = classifierKNN.predict(X_test)

In [0]:
y_pred_NB = classifierNB.predict(X_test)

In [0]:
y_pred_DT = classifierDT.predict(X_test)

In [0]:
y_pred_RF = classifierRF.predict(X_test)

In [0]:
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [27]:
cmknn = confusion_matrix(y_test, y_pred_knn)
cmknn


array([[87, 10],
       [74, 29]])

In [28]:
cmNB = confusion_matrix(y_test, y_pred_NB)
cmNB


array([[75, 22],
       [23, 80]])

In [29]:
cmDT = confusion_matrix(y_test, y_pred_DT)
cmDT

array([[74, 23],
       [37, 66]])

In [30]:
cmRF = confusion_matrix(y_test, y_pred_RF)
cmRF

array([[71, 26],
       [35, 68]])

In [31]:
print("KNN accuracy \n", accuracy_score(y_test,y_pred_knn))

KNN accuracy 
 0.58


In [32]:
print("Naive Bayes accuracy \n", accuracy_score(y_test,y_pred_NB))


Naive Bayes accuracy 
 0.775


In [33]:
print("Decision Tree accuracy \n", accuracy_score(y_test,y_pred_DT))

Decision Tree accuracy 
 0.7


In [34]:
print("Random Forest accuracy \n", accuracy_score(y_test,y_pred_RF))


Random Forest accuracy 
 0.695


## Naive Bayes gives higher accuracy so we will use that to predict output for new data 

In [0]:
sample = ["Good batting by England"]


In [0]:
# create the TF-IDF model of the sample sentence
sample = vectorizer.transform(sample).toarray()

In [37]:
#predict the sentiment
sentiment = classifierNB.predict(sample)
if (sentiment==1):
    print("Good Review")
else:
    print("Bad Review")

Good Review


In [38]:
sample2 = ["bad performance by India in the match"]
sample2 = vectorizer.transform(sample2).toarray()
sentiment2 = classifierNB.predict(sample2)
if (sentiment2==1):
    print("Good Review")
else:
    print("Bad Review")

Bad Review
