In [27]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle

In [2]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
np.random.seed(500)

In [8]:
Corpus = pd.read_csv(r"/content/drive/MyDrive/text classifier/Reviews.csv",encoding='latin-1')

In [9]:
Corpus['Review Text'].dropna(inplace=True)   #step a

In [10]:
Corpus=Corpus.drop(['Clothing ID','Age','Title','Positive Feedback Count','Division Name','Department Name','Class Name','Rating','No'],axis=1)

In [11]:
print(Corpus)

                                             Review Text  Recommended IND
0      Absolutely wonderful - silky and sexy and comf...                1
1      Love this dress!  it's sooo pretty.  i happene...                1
2      I had such high hopes for this dress and reall...                0
3      I love, love, love this jumpsuit. it's fun, fl...                1
4      This shirt is very flattering to all due to th...                1
...                                                  ...              ...
23481  I was very happy to snag this dress at such a ...                1
23482  It reminds me of maternity clothes. soft, stre...                1
23483  This fit well, but the top was very see throug...                0
23484  I bought this dress for a wedding i have this ...                1
23485  This dress in a lovely platinum is feminine an...                1

[23486 rows x 2 columns]


In [12]:
Corpus['Review Text'] = [str(entry).lower() for entry in Corpus['Review Text']] #step b
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['Review Text']= [word_tokenize(str(entry)) for entry in Corpus['Review Text']] # Step - c

In [13]:
tag_map = defaultdict(lambda : wn.NOUN)           # Step - d
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [14]:
for index,entry in enumerate(Corpus['Review Text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        #print('word' + word)
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            #print('Word final: '+ str(word_Final))
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'Review_final'] = str(Final_words)

In [15]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['Review_final'],Corpus['Recommended IND'],test_size=0.3)

In [16]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [17]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['Review_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [18]:
print(Tfidf_vect.vocabulary_)



In [19]:
print(Train_X_Tfidf)

  (0, 4932)	0.133924567993478
  (0, 4781)	0.10113387035434288
  (0, 4771)	0.19972202143581577
  (0, 4569)	0.12934268845672395
  (0, 4538)	0.175205791398483
  (0, 3984)	0.1326418185171406
  (0, 3504)	0.17688625757983067
  (0, 3269)	0.11392234240183459
  (0, 3117)	0.11711408684037304
  (0, 3082)	0.084901315666007
  (0, 2897)	0.17141812215460542
  (0, 2759)	0.2221104911056041
  (0, 2478)	0.06323847947693689
  (0, 2469)	0.22113219024318706
  (0, 2455)	0.18697200436480538
  (0, 2441)	0.12260123860892133
  (0, 2090)	0.2781846027303782
  (0, 1914)	0.14485727983369565
  (0, 1857)	0.11441846542406578
  (0, 1644)	0.05759219324122955
  (0, 1567)	0.11115493603144652
  (0, 1478)	0.09955386700636334
  (0, 1403)	0.3259758175591977
  (0, 1314)	0.19972202143581577
  (0, 1065)	0.21926979493184512
  :	:
  (16439, 3969)	0.09474826355178112
  (16439, 3876)	0.06893886868903336
  (16439, 3271)	0.20145988875040297
  (16439, 3173)	0.1716059826496134
  (16439, 3084)	0.12201741236172213
  (16439, 3010)	0.2518484

In [24]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  89.00085154697702


In [25]:
#Predictions of Naive Bayes Classifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Test_Y,predictions_SVM))
print(classification_report(Test_Y,predictions_SVM))
print(accuracy_score(Test_Y, predictions_SVM))

[[ 732  548]
 [ 227 5539]]
              precision    recall  f1-score   support

           0       0.76      0.57      0.65      1280
           1       0.91      0.96      0.93      5766

    accuracy                           0.89      7046
   macro avg       0.84      0.77      0.79      7046
weighted avg       0.88      0.89      0.88      7046

0.8900085154697701


In [29]:
with open('/content/drive/MyDrive/text classifier/text_classifier_SVM', 'wb') as picklefile:
    pickle.dump(SVM,picklefile)

In [21]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  83.55095089412433


In [23]:
#Predictions of Naive Bayes Classifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Test_Y,predictions_NB))
print(classification_report(Test_Y,predictions_NB))
print(accuracy_score(Test_Y, predictions_NB))

[[ 142 1138]
 [  21 5745]]
              precision    recall  f1-score   support

           0       0.87      0.11      0.20      1280
           1       0.83      1.00      0.91      5766

    accuracy                           0.84      7046
   macro avg       0.85      0.55      0.55      7046
weighted avg       0.84      0.84      0.78      7046

0.8355095089412432


In [31]:
with open('/content/drive/MyDrive/text classifier/text_classifier_NaiveBayes', 'wb') as picklefile:
    pickle.dump(Naive,picklefile)
# code to load the pickle file
# with open('text_classifier', 'rb') as training_model:
#    model = pickle.load(training_model)
