In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project helpers imported from Codes.* below) are re-imported
# before each cell runs and source edits apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import selenium
import time
from selenium import webdriver
from bs4 import BeautifulSoup as BS
from selenium.webdriver.chrome.options import Options
import sys
import pickle
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from Codes.Review_scrawler import *
from Codes.feature_extraction import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/ramin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/ramin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ramin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Load the descriptions

In [3]:
# with open('product_names.txt', "r") as x:
#     all_product = x.read().splitlines()

In [4]:
# product_name_description = Scrawl_product_description('product_names.txt')

In [5]:
# with open('product_description.pickle', 'wb') as handle:
#     pickle.dump(product_name_description, handle, pickle.HIGHEST_PROTOCOL)

In [6]:
# Load the cached product descriptions written by the (commented-out) dump cell
# above. The `with` block guarantees the file handle is closed after loading;
# the bare open() used previously was never closed.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files that
# were produced locally by this project.
with open('product_description.pickle', 'rb') as handle:
    product_name_description = pickle.load(handle)

In [7]:
# Total of len(value) over every entry of the loaded mapping, with a tqdm
# progress bar over the entries.
sum_products = sum(
    len(entry) for entry in tqdm(product_name_description.values())
)

100%|██████████| 455/455 [00:00<00:00, 839968.45it/s]


### The list of POS tags is as follows, with examples of what each POS stands for.

* CC coordinating conjunction
* CD cardinal digit
* DT determiner
* EX existential there (like: “there is” … think of it like “there exists”)
* FW foreign word
* IN preposition/subordinating conjunction
* JJ adjective ‘big’
* JJR adjective, comparative ‘bigger’
* JJS adjective, superlative ‘biggest’
* LS list marker 1)
* MD modal could, will
* NN noun, singular ‘desk’
* NNS noun plural ‘desks’
* NNP proper noun, singular ‘Harrison’
* NNPS proper noun, plural ‘Americans’
* PDT predeterminer ‘all the kids’
* POS possessive ending parent’s
* PRP personal pronoun I, he, she
* PRP\$ possessive pronoun my, his, hers
* RB adverb very, silently,
* RBR adverb, comparative better
* RBS adverb, superlative best
* RP particle give up
* TO, to go ‘to’ the store.
* UH interjection, errrrrrrrm
* VB verb, base form take
* VBD verb, past tense took
* VBG verb, gerund/present participle taking
* VBN verb, past participle taken
* VBP verb, sing. present, non-3d take
* VBZ verb, 3rd person sing. present takes
* WDT wh-determiner which
* WP wh-pronoun who, what
* WP\$ possessive wh-pronoun whose
* WRB wh-adverb where, when

## Feature Extraction - Basic Implementation


* __Step 1: Finding Candidate Features:__ First approach for extracting features was based on the intuition that product features are usually nouns or noun phrases [1]. Using the POS associated with words during the data generation phase, we created a new view of data where each sentence in the review was considered  as a bag of words. The  words  chosen  to  represent  a  sentence  were  those  that  were  marked  as  nouns  (NN/NNS).  We chose  to  ignore  proper  nouns,  which  we  believe  cannot  be  features  associated  with  a  product. 

* __Step 2: Extending Candidates with Adjectives:__ Furthermore, as we observed in the data, some phrases that represent features (such as optical zoom) are made of two classes of words, nouns and adjectives. To be able to detect such features, we also included words marked as adjectives (JJ/JJR/JJS) in our bag-of-words model.

* __Step 3: Finding Frequent Features:__ The  next  step  was  to  extract  frequent  features  from  the  candidate  feature  words.  We  used  an implementation  based  on  the  APRIORI  algorithm  [10]  for  identifying  frequently  occurring word/word  pairs  from  the  bag  of  words  data  model. Using  a  support  threshold  of  0.5%,  we  got  a good  set  of  candidate  features.  

* __Step 4: Remove Opinion Words:__ However, we observed that because adjectives were included when finding the frequent item sets, many candidate features were actually opinion words (like good, best, bad). To filter out such frequent items, we kept a single-word item only if it occurred as a noun somewhere in the corpus, and kept a word phrase only if at least one of the words in the phrase occurred as a noun somewhere in the corpus.

* __Step 5: Feature Set:__ The resulting set of words was our feature set for the product trained on.

In [8]:
# Build the feature-extraction object from the loaded product descriptions.
# `Features` is not defined in this notebook; presumably it is provided by the
# star import of Codes.feature_extraction -- confirm.
model = Features(product_name_description)

### Steps 1 through 3

In [53]:
# Extract candidate attributes and the POS-tagged sentences from the model.
# With show=True the call appears to print each sentence as (token, POS tag)
# pairs (see the output below); the printed tokens look stemmed/lower-cased,
# presumably because preprocess=True -- confirm against Features.
# NOTE(review): `candidate_featres` is misspelled ("features"); the name is kept
# unchanged here because other cells may reference it.
candidate_featres, pos = model.get_candidate_attributes(show = True, preprocess = True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/ramin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/ramin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ramin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('premium', 'JJ'), ('full-grain', 'NN'), ('leather', 'NN'), ('upper', 'NN'), ('on', 'IN'), ('the', 'DT'), ('air', 'NN'), ('forc', 'VBZ'), ('1', 'CD'), ('low', 'JJ')]
[('perfor', 'NN'), ('for', 'IN'), ('enhanc', 'NN'), ('ventil', 'NN')]
[('pad', 'NN'), ('collar', 'NN'), ('for', 'IN'), ('a', 'DT'), ('snug', 'NN'), ('and', 'CC'), ('secur', 'NN')]
[('foam', 'NN'), ('midsol', 'NN'), ('with', 'IN'), ('nike', 'JJ'), ('air', 'NN'), ('unit', 'NN'), ('for', 'IN'), ('lightweight', 'JJ'), ('cushion', 'NN')]
[('pivot', 'NN'), ('point', 'NN'), ('in', 'IN'), ('the', 'DT'), ('forefoot', 'NN'), ('allow', 'NN'), ('for', 'IN'), ('smooth', 'JJ'), ('transit', 'NN'), ('in', 'IN'), ('all', 'DT'), ('direct', 'JJ')]
[('non-mark', 'JJ'), ('rubber', 'NN'), ('outsol', 'NN'), ('for', 'IN'), ('durabl', 'NN'), ('traction', 'NN')]
[('the', 'DT'), ('nike', 'JJ'), ('air', 'NN'), ('forc', 'VBD'), ('1', 'CD'), ('low', 'JJ'), ('is', 'VBZ'), ('import', 'NN')]
[('nobodi', 'JJ'), ('ever', 'RB'), ('said', 'VBD'), ('that', 'I

### Separate the data for Ramin and Jade to extract Rules from

In [3]:
import numpy as np

In [None]:
# ramin = np.random.choice(pos, 100)
# jade = np.random.choice(pos, 100)

In [None]:
# np.save('jade_pos.npy', jade)
# np.save('ramin_pos.npy', ramin)

### create candidate features

In [63]:
# Compute the frequent attributes using a support threshold of 0.05.
# NOTE(review): the "Step 3" markdown above quotes a threshold of 0.5%, while
# 0.05 reads as 5% if support is a fraction; the displayed values (all >= 0.05)
# are consistent with a fractional threshold -- confirm which is intended.
frequency_counts = model.get_frequent_attributes(support = 0.05)

In [64]:
# Display the frequent word pairs and their values (presumably each pair's
# support, given the `support` argument used to compute them).
frequency_counts

{'nike air': 0.14071146245059288,
 'air unit': 0.09189723320158102,
 'rubber outsol': 0.1359683794466403,
 'durabl traction': 0.06067193675889328,
 'is import': 0.2490118577075099,
 'casual shoe': 0.1409090909090909,
 'air max': 0.10968379446640317,
 'max air': 0.05849802371541502,
 'run shoe': 0.061462450592885375,
 'air vapormax': 0.05217391304347826}

### Create a dictionary of attributes and similar words using pretrained embedding matrix and cosine similarity

In [None]:
# Build the dictionary of attributes and their similar words.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# later cells use `words_similarity`; it must be run first on a fresh kernel.
words_similarity = model.get_similar_attributes(preprocess = False)

In [43]:
# Show the attribute names that are the keys of the similarity dictionary.
words_similarity.keys()

dict_keys(['cushioning', 'outsole', 'midsole', 'insole', 'heel', 'color', 'shape', 'upper', 'fit', 'weight', 'density', 'fixation', 'collar', 'fasteners', 'permeability', 'stability', 'flexibility', 'traction', 'durability'])

In [49]:
import json

In [50]:
# Persist the attribute -> similar-words dictionary as JSON.
# Serialize to a string first: if `words_similarity` holds a value that json
# cannot encode, the error is raised before the file is opened, so an existing
# file is never replaced by a truncated one.
similarity_json = json.dumps(words_similarity)
with open('Attributes_dictionary_with_similarities.json', 'w') as handle:
    handle.write(similarity_json)

In [5]:
# Load the pretrained GoogleNews word2vec vectors (binary format) from the
# working directory.
# NOTE(review): `gensim` is never imported explicitly in this notebook; it is
# presumably exposed by one of the `from Codes.... import *` lines -- confirm,
# or add an explicit `import gensim` to the imports cell.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [17]:
# Collect the embedding vector of each probe word, in order.
z = [word2vec.get_vector(word) for word in ['hello', 'bad', 'good']]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/ramin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/ramin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/ramin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ramin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Element-wise sum of the collected word vectors (axis 0 runs over the words).
# NOTE(review): the saved output below is `(300,)`, which looks like the result
# of `.shape` rather than the summed array itself -- the cell was probably
# edited after it last ran; confirm and re-run.
np.sum(z, axis=0)

(300,)