In [1]:
import spacy
import pandas as pd
import numpy as np
from spacy import displacy

In [2]:
# Read the dataset and remove rows have nan values
data = pd.read_csv("Womens Clothing E-Commerce Reviews.csv", header=0, usecols=['Review Text', 'Class Name']).dropna()

In [3]:
# EDA on class names/ product types
print(len(data["Class Name"].unique()))
product_types = list(data["Class Name"].unique())
print(product_types)

20
['Intimates', 'Dresses', 'Pants', 'Blouses', 'Knits', 'Outerwear', 'Lounge', 'Sweaters', 'Skirts', 'Fine gauge', 'Sleep', 'Jackets', 'Swim', 'Trend', 'Jeans', 'Legwear', 'Shorts', 'Layering', 'Casual bottoms', 'Chemises']


In [4]:
# EDA on data
data.iloc[0]

Review Text    Absolutely wonderful - silky and sexy and comf...
Class Name                                             Intimates
Name: 0, dtype: object

In [5]:
# Load the pretrained English dictionary
nlp = spacy.load('en')

In [8]:
# Sample code to tag sentences
doc = nlp(data.iloc[7,0])

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

I -PRON- PRON PRP nsubj X True True
ordered order VERB VBD ROOT xxxx True False
this this DET DT dobj xxxx True True
in in ADP IN prep xx True True
carbon carbon NOUN NN pobj xxxx True False
for for ADP IN prep xxx True True
store store NOUN NN compound xxxx True False
pick pick VERB VBP pobj xxxx True False
up up PART RP prt xx True True
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
had have VERB VBD conj xxx True True
a a DET DT det x True True
ton ton NOUN NN dobj xxx True False
of of ADP IN prep xx True True
stuff stuff NOUN NN pobj xxxx True False
( ( PUNCT -LRB- punct ( False False
as as ADP IN prep xx True True
always always ADV RB pcomp xxxx True True
) ) PUNCT -RRB- punct ) False False
to to PART TO aux xx True True
try try VERB VB conj xxx True False
on on PART RP prt xx True True
and and CCONJ CC cc xxx True True
used use VERB VBD conj xxxx True True
this this DET DT det xxxx True True
top top NOUN NN dobj xxx True True
to to PART TO aux xx True True
pair

In [9]:
#  Sample code for visualizing tags
displacy.render(doc, style='dep', jupyter=True, options={'distance': 140})

In [6]:
# Sample code for in-text tagging
print(data.iloc[7,1])
doc = nlp(data.iloc[7,0])
displacy.render(doc, style='ent', jupyter=True)

Knits


In [18]:
# Code that tags the class types based on the existing pretrained dictionary
# Ideally, all the product names should be Nouns, but there may be some cases where they can be adjectives
# or some other POS tags
# We will ignore them first, but we need to retrain our model and make them Nouns to imporve accuracy
for product_type in product_types:
    doc = nlp(product_type)

    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop)

Intimates intimate NOUN NNS ROOT Xxxxx True False
Dresses dress NOUN NNS ROOT Xxxxx True False
Pants pant NOUN NNS ROOT Xxxxx True False
Blouses blouse NOUN NNS ROOT Xxxxx True False
Knits knit NOUN NNS ROOT Xxxxx True False
Outerwear Outerwear PROPN NNP ROOT Xxxxx True False
Lounge Lounge PROPN NNP ROOT Xxxxx True False
Sweaters sweater NOUN NNS ROOT Xxxxx True False
Skirts skirt NOUN NNS ROOT Xxxxx True False
Fine fine ADJ JJ amod Xxxx True False
gauge gauge NOUN NN ROOT xxxx True False
Sleep sleep VERB VB ROOT Xxxxx True False
Jackets jacket NOUN NNS ROOT Xxxxx True False
Swim Swim PROPN NNP ROOT Xxxx True False
Trend trend NOUN NN ROOT Xxxxx True False
Jeans jean NOUN NNS ROOT Xxxxx True False
Legwear legwear NOUN NN ROOT Xxxxx True False
Shorts short NOUN NNS ROOT Xxxxx True False
Layering layer VERB VBG ROOT Xxxxx True False
Casual casual ADJ JJ amod Xxxxx True False
bottoms bottom NOUN NNS ROOT xxxx True False
Chemises chemise NOUN NNS ROOT Xxxxx True False


In [7]:
# Filtering the dataset to analyse only seven classes
focussed_classes = ["Sweaters", "Pants", "Jackets", "Blouses", "Skirts", "Jeans", "Shorts"]
sample_data = data[data["Class Name"].isin(focussed_classes)]
sample_data["Prediction"] = None
lemmatized_classes  = {}
for product_type in focussed_classes:
    doc = nlp(product_type)
    for token in doc:
        lemmatized_classes[token.lemma_] = token.text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
# Base testing: If the word appears, then simply classify it
sample_data.iloc[0]

Review Text    I love, love, love this jumpsuit. it's fun, fl...
Class Name                                                 Pants
Prediction                                                  None
Name: 3, dtype: object

In [9]:
# Do a simple matching and predict the labels if the word exists in the review
for index, row in sample_data.iterrows():
    doc = nlp(row["Review Text"])

    for token in doc:
        if token.lemma_ in list(lemmatized_classes.keys()):
            row["Prediction"] = lemmatized_classes[token.lemma_]
            break

In [10]:
# Compute class wise BASELINE accuracy
actual_data = sample_data.groupby("Class Name")["Prediction"]
matched_data = sample_data[sample_data["Class Name"] == sample_data["Prediction"]].groupby("Class Name")["Prediction"]
print(actual_data.count())
print(matched_data.count())
print("Class wise accuracy: ")
print(matched_data.count()*100/actual_data.count())


Class Name
Blouses     1211
Jackets      430
Jeans        825
Pants        793
Shorts       199
Skirts       780
Sweaters    1007
Name: Prediction, dtype: int64
Class Name
Blouses     475
Jackets     323
Jeans       599
Pants       642
Shorts      186
Skirts      733
Sweaters    807
Name: Prediction, dtype: int64
Class wise accuracy: 
Class Name
Blouses     39.223782
Jackets     75.116279
Jeans       72.606061
Pants       80.958386
Shorts      93.467337
Skirts      93.974359
Sweaters    80.139027
Name: Prediction, dtype: float64


In [19]:
# Compute the overall Baseline accuracy
overall_accuracy = sample_data[sample_data["Class Name"] == sample_data["Prediction"]].count() /sample_data.count()

print(overall_accuracy)

Review Text    0.432411
Class Name     0.432411
Prediction     0.717827
dtype: float64


In [89]:
# Pick each class and try to find the mistakes and retrain the model to improve the model
dress_incorrect_data = sample_data[(sample_data["Class Name"] == "Dresses") & 
                                   (sample_data["Class Name"] != sample_data["Prediction"])]

In [218]:
intimate_incorrect_data = sample_data[(sample_data["Class Name"] == "Intimates") & 
                                   (sample_data["Class Name"] != sample_data["Prediction"])]

In [219]:
pant_incorrect_data = sample_data[(sample_data["Class Name"] == "Pants") & 
                                   (sample_data["Class Name"] != sample_data["Prediction"])]

In [92]:
dress_incorrect_data.iloc[0][0]

"First of all, this is not pullover styling. there is a side zipper. i wouldn't have purchased it if i knew there was a side zipper because i have a large bust and side zippers are next to impossible for me.\n\nsecond of all, the tulle feels and looks cheap and the slip has an awkward tight shape underneath.\n\nnot at all what is looks like or is described as. sadly will be returning, but i'm sure i will find something to exchange it for!"

In [93]:
# Analyzing the incorrect text using Spacy tags 
doc = nlp(dress_incorrect_data.iloc[0][0])

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

First first ADV RB advmod Xxxxx True True
of of ADP IN prep xx True True
all all DET DT pobj xxx True True
, , PUNCT , punct , False False
this this DET DT nsubj xxxx True True
is be VERB VBZ ROOT xx True True
not not ADV RB neg xxx True True
pullover pullover NOUN NN amod xxxx True False
styling styling NOUN NN attr xxxx True False
. . PUNCT . punct . False False
there there ADV EX expl xxxx True True
is be VERB VBZ ROOT xx True True
a a DET DT det x True True
side side NOUN NN compound xxxx True True
zipper zipper NOUN NN attr xxxx True False
. . PUNCT . punct . False False
i i PRON PRP nsubj x True True
would would AUX MD aux xxxx True True
n't not ADV RB neg x'x False True
have have VERB VB aux xxxx True True
purchased purchase VERB VBN ROOT xxxx True False
it -PRON- PRON PRP dobj xx True True
if if ADP IN mark xx True True
i i PRON PRP nsubj x True True
knew know VERB VBD advcl xxxx True False
there there ADV EX expl xxxx True True
was be VERB VBD ccomp xxx True True
a a DET DT de

# Training the data to improve the classification accuracy

In [11]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [12]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [13]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [14]:
# Create a BOW vector and TFD-IDF vector
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [15]:
from sklearn.model_selection import train_test_split

X = sample_data['Review Text'] # the features we want to analyze
ylabels = sample_data['Class Name'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [16]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x118ee4048>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x118c02ae8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                         

In [17]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, average='weighted'))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, average='weighted'))

# Classification report
print("Classification Report:")
print(metrics.classification_report(y_test, predicted))

#Get the confusion matrix
cm = metrics.confusion_matrix(y_test, predicted)
print(cm)
cm = cm.astype('float') * 100/ cm.sum(axis=1)[:, np.newaxis]
print("Class wise accuracy")
print(cm.diagonal())

Logistic Regression Accuracy: 0.7757367011098354
Logistic Regression Precision: 0.7898371579896338
Logistic Regression Recall: 0.7757367011098354
Classification Report:
              precision    recall  f1-score   support

     Blouses       0.71      0.95      0.82       897
     Jackets       0.83      0.54      0.66       217
       Jeans       0.85      0.69      0.76       360
       Pants       0.72      0.69      0.71       412
      Shorts       0.78      0.39      0.52        71
      Skirts       0.94      0.80      0.87       269
    Sweaters       0.86      0.73      0.79       387

    accuracy                           0.78      2613
   macro avg       0.81      0.69      0.73      2613
weighted avg       0.79      0.78      0.77      2613

[[852   5  15  12   0   5   8]
 [ 63 118   1   5   0   2  28]
 [ 38   0 248  68   1   1   4]
 [ 95   2  20 284   5   2   4]
 [ 21   0   6  14  28   0   2]
 [ 42   1   1   7   1 216   1]
 [ 81  17   1   2   1   4 281]]
Class wise accur

In [18]:
# Accuracy on the entire dataset to compare it with the baseline accuracy
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(ylabels, predicted))
print("Logistic Regression Precision:",metrics.precision_score(ylabels, predicted, average='weighted'))
print("Logistic Regression Recall:",metrics.recall_score(ylabels, predicted, average='weighted'))

# Classification report
print("Classification Report:")
print(metrics.classification_report(ylabels, predicted))

#Get the confusion matrix
cm = metrics.confusion_matrix(ylabels, predicted)
print(cm)
cm = cm.astype('float') * 100/ cm.sum(axis=1)[:, np.newaxis]
print("Class wise accuracy")
print(cm.diagonal())

Logistic Regression Accuracy: 0.8247387159756517
Logistic Regression Precision: 0.8368022461103791
Logistic Regression Recall: 0.8247387159756517
Classification Report:
              precision    recall  f1-score   support

     Blouses       0.76      0.97      0.85      2983
     Jackets       0.86      0.60      0.71       683
       Jeans       0.90      0.77      0.83      1104
       Pants       0.80      0.78      0.79      1350
      Shorts       0.85      0.49      0.62       304
      Skirts       0.95      0.83      0.89       903
    Sweaters       0.90      0.79      0.84      1380

    accuracy                           0.82      8707
   macro avg       0.86      0.75      0.79      8707
weighted avg       0.84      0.82      0.82      8707

[[2881   13   25   22    1   23   18]
 [ 181  413    3   11    1    4   70]
 [  89    0  848  156    1    1    9]
 [ 212    3   41 1058   21    2   13]
 [  93    0   15   44  149    1    2]
 [ 120    3    4   21    1  747    7]
 [ 224

# TAG using phrase matching

In [34]:
nlp = spacy.load('en')

In [44]:
# Create topic keywords which are the review texts
# Create topic labels which are the class names of the corresponding text

topic_labels = list()
topic_keywords = list()
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Blouses"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Blouses"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Jackets"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Jackets"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Jeans"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Jeans"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Pants"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Pants"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Shorts"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Shorts"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Skirts"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Skirts"]["Class Name"].iloc[:100]))
topic_keywords+= (list(sample_data[sample_data["Review Text"] == "Sweaters"]["Review Text"].iloc[:100]))
topic_labels+= (list(sample_data[sample_data["Class Name"] == "Sweaters"]["Class Name"].iloc[:100]))

In [37]:
import itertools
import numpy as np

# Use Spacy to create vectors for the topic keywords.
# Spacy uses Word2Vec to build vectors for the given text
# We have used Spacy to build these Word2vec vectors

topic_docs = list(nlp.pipe(topic_keywords,
  batch_size=10000,
  n_threads=3))

topic_vectors = np.array([doc.vector 
  if doc.has_vector else spacy.vocab[0].vector
  for doc in topic_docs])

print('Vector for topic ', topic_labels[0])
print(topic_vectors[0])

Vector for topic  Pants
[ 1.1250247  -0.81393766 -0.2984221   0.15839224 -1.6652483   0.3881067
  1.755843   -0.18485169 -1.9444047   0.81113094  0.7508667   1.2236611
 -1.3347745  -0.8833289  -0.18041697  0.02063254 -0.81741995  0.2799068
  1.6763984   2.0720046   0.4081799  -0.9462572   1.024182    1.1441406
 -0.46420565  1.5599277  -0.6719999   0.35884774 -0.39633262  1.6408604
  0.58931094 -1.5503886  -1.2633742  -0.5627806   0.16534436  2.6096816
  1.7443753   0.00416802 -1.8797693   0.23669286 -0.38807353  0.13208136
  1.3185614   0.941125   -2.0563557  -0.8544247  -0.25165325 -0.80684394
 -0.47863632 -1.566288   -0.9362541  -0.39146248 -0.7874055  -0.5434709
 -0.46361047 -1.4611995  -0.19688761  0.17598082  0.77680564  0.15941615
  0.12461584 -0.38863635  1.2813896   0.3946074  -0.70306915  1.3341683
  0.82879204  0.89577717  0.28470796  1.2661424   0.89462024 -2.1395433
  0.39478377 -0.26936334  0.7851856   0.73599315 -0.20895465  0.7235232
 -0.34808353 -1.3365539   0.2286754  

In [54]:
# Given a new review text, compute its Word2vec vector using Spacy
# Find the most similar vector using cosine similarity in the corpus
# And return its corresponding label

keywords_list = list()
keywords_list.append(sample_data["Review Text"].iloc[101:151])
labels = list()
labels.append(sample_data["Class Name"].iloc[101:151])
count=0
for i in range(len(keywords_list)):
    keywords = keywords_list[i]
    keyword_docs = list(nlp.pipe(keywords,
      batch_size=100,
      n_threads=3))

    keyword_vectors = np.array([doc.vector
      if doc.has_vector else spacy.vocab[0].vector
      for doc in keyword_docs])
    
    from sklearn.metrics.pairwise import cosine_similarity
    # use numpy and scikit-learn vectorized implementations for performances
    simple_sim = cosine_similarity(keyword_vectors, topic_vectors)
    topic_idx = simple_sim.argmax(axis=1)
    if (topic_labels[i])
    count++
    
    print(simple_sim)

    print('Vector for keyword "%s": ' % keywords[0])
    print(keyword_vectors[0])

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
# use numpy and scikit-learn vectorized implementations for performances
simple_sim = cosine_similarity(keyword_vectors, topic_vectors)
topic_idx = simple_sim.argmax(axis=1)
print(simple_sim)

[[0.71354026 0.9246205  0.8683945  ... 0.9052511  0.8458234  0.9092554 ]
 [0.71354026 0.9246205  0.8683945  ... 0.9052511  0.8458234  0.9092554 ]
 [0.71354026 0.9246205  0.8683945  ... 0.9052511  0.8458234  0.9092554 ]]


In [41]:
for k, i in zip(keywords, topic_idx):
  print('"%s" is about %s' %(k, topic_labels[i]))

"I wanted to love this jacket. so soft and great color. unfortunately it is just too light weight. it is almost like shirt fabric. nice heft to the fabric though. nice length - accept the sleeves were short on me. i am 5'10 and 135lbs. the small was the right size. i was looking for something slightly warmer to layer." is about Jackets
"I wanted to love this jacket. so soft and great color. unfortunately it is just too light weight. it is almost like shirt fabric. nice heft to the fabric though. nice length - accept the sleeves were short on me. i am 5'10 and 135lbs. the small was the right size. i was looking for something slightly warmer to layer." is about Jackets
"I wanted to love this jacket. so soft and great color. unfortunately it is just too light weight. it is almost like shirt fabric. nice heft to the fabric though. nice length - accept the sleeves were short on me. i am 5'10 and 135lbs. the small was the right size. i was looking for something slightly warmer to layer." is 

# Extracting Nouns from sentence and finding Word2Vec similarity

In [95]:
# We have used Word2vec from Gensim for this approach
from gensim.models import Word2Vec

In [1]:
# Use Spacy to extract the nouns from the reviews
# We have appended the class name to each noun vector so that Word2vec can pick it and give similar vectors

sentence_matrix = list()
noun_matrix = list()
for index, row in sample_data.iterrows():
    sentence = list()
    nouns = []
    for token in nlp(row['Review Text']):
        if token.pos_ == 'NOUN':
            sentence.append(token.lemma_)
            info = [token.text, token.lemma_, token.idx]
            nouns.append(info)
    for token in nlp(row['Class Name']):
        sentence.append(token.lemma_)
    sentence_matrix.append(sentence)
    noun_matrix.append(nouns)

NameError: name 'sample_data' is not defined

In [117]:
print(sentence_matrix[0])
print(noun_matrix[3])

['love', 'jumpsuit', 'fun', 'flirty', 'time', 'nothing', 'compliment', 'pant']
[['Material', 'material', 0], ['color', 'color', 13], ['leg', 'leg', 33], ['opening', 'opening', 37], ['length', 'length', 85], ['ankle', 'ankle', 115], ['leg', 'leg', 130], ['size', 'size', 146], ['waist', 'waist', 157], ['line', 'line', 171], ['ankle', 'ankle', 185], ['pleats', 'pleat', 202], ['look', 'look', 280], ['height', 'height', 315]]


In [118]:
# Compute the Word2vec vectors for each word
model = Word2Vec(sentence_matrix, min_count=1)  # default value is 5

In [146]:
# Find the most similar words to the class names
# These list will serve as the named entities
res = dict()
res['sweater'] = set()
res['pant'] = set()
res['jacket'] = set()
res['blouse'] = set()
res['skirt'] = set()
res['jean'] = set()
res['short'] = set()
for j in range(len(sentence_matrix)):
    trimmed_class_labels = ['sweater', 'pant', 'jacket', 'blouse', 'skirt', 'jean', 'short']
    for i in range(len(sentence_matrix[j])-1):
        sim = np.zeros(len(trimmed_class_labels))
        for k in range(len(trimmed_class_labels)):
            label = trimmed_class_labels[k]
            try:
                sim[k] = model.wv.similarity(label, sentence_matrix[j][i])
            except:
                sim[k] = 0
        # We have set the thresold empirically by analyzing the corresponding named entities
        # This threshold may change depending on the dataset
        if np.max(sim) > 0.9985:
            res[trimmed_class_labels[np.argmax(sim)]].add(sentence_matrix[j][i])
print(res)

{'sweater': {'sweater'}, 'pant': {'ankle', 'stretch', 'pant', 'pilcro'}, 'jacket': {'coat', 'jacket'}, 'blouse': {'sleeve', 'shirt', 'top', 'bra', 'blouse'}, 'skirt': {'skirt'}, 'jean': {'jean'}, 'short': {'ag', 'thigh', 'rise', 'short', 'heel'}}


# Training Product entities on Spacy

In [20]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [147]:
# We will find the named entities and its location in the review list
# This is done as Spacy needs a specific input data format which lists the location of each named entity in 
# sentence in order to train its NER model
train_data = list()
for index, row in sample_data.iterrows():
    entity = dict()
    entity['entities'] = list()
    for token in nlp(row['Review Text']):
        if token.pos_ == 'NOUN':
            i=0
            for key in res:
                for value in res[key]:
                    if value == token.lemma_ :
                        entity['entities'].append((token.idx, token.idx + len(token.text), focussed_classes[i]))
                i+=1
    if len(entity['entities']) != 0:
        train_data.append((row['Review Text'], entity))

In [150]:
train_data[3]

("I love this shirt because when i first saw it, i wasn't sure if it was a shirt or dress. since it is see-through if you wear it like a dress you will need a slip or wear it with leggings. i bought a slip, wore the tie in the back, and rocked it with white wedges. you could also wear it as a vest. be careful with the buttons. i haven't had any fall off yet, but i feel like they will. overall it's great for any occasion and it's fun to wear!",
 {'entities': [(12, 17, 'Blouses'), (73, 78, 'Blouses')]})

In [2]:
# Reference: https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py
# This module trains the Spacy NER model on a blank English dictionary
# We can even train on pre-trained NER model if we want to keep the previous tags as well

nlp_trained = spacy.blank("en")
if "ner" not in nlp_trained.pipe_names:
    ner = nlp_trained.create_pipe("ner")
    nlp_trained.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp_trained.get_pipe("ner")

# add labels
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp_trained.pipe_names if pipe != "ner"]
with nlp_trained.disable_pipes(*other_pipes):  # only train NER
    # reset and initialize the weights randomly – but only if we're
    # training a new model
    nlp_trained.begin_training()
    for itn in range(20):
        random.shuffle(train_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp_trained.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print("Losses", losses)

NameError: name 'spacy' is not defined

In [21]:
output_dir = 'model'

# save the trained model
# nlp_trained.to_disk(output_dir)
# print("Saved model to", output_dir)

# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

Loading from model


In [22]:
# Testing results
doc = nlp(sample_data.iloc[9,0])
displacy.render(doc, style='ent', jupyter=True)

  "__main__", mod_spec)


In [24]:
doc = nlp2(sample_data.iloc[9,0])
displacy.render(doc, style='ent', jupyter=True)

In [31]:
test_doc = "The paparazzi must be pleased with the vogue for buffalo plaid — with all the acres of fabric, deep red and black thickly crisscrossing, marking their quarry clearly. Have you recently breezed through the celebrity-photo pages? Reese Witherspoon snapped, leaving her car in a parking lot, while wearing a buffalo-plaid shirt and a smart skirt. Gwen Stefani, spotted toting her toddler to church, has bundled the boy in a logger shirt. Here we have Kylie Jenner — that ‘‘Keeping Up With the Kardashians’’ fixture — in a buffalo-plaid shirtdress of shocking blue. Pictures of the dress stimulated such demand that its manufacturer rushed it back into production."


In [32]:
doc = nlp(test_doc)
displacy.render(doc, style='ent', jupyter=True)

In [33]:
doc = nlp2(test_doc)
displacy.render(doc, style='ent', jupyter=True)