## Word 2 Vec
### Goals 
1. Find negative reviews 
2. Grab the negative topics for each dress 
3. Extension: recommend size to pick out

## Table of contents
- [fetch reviews from mongo](#mongo)
- [preprocessing](#preprocessing)
- [word2vec model](#model)
- [word2vec model with bigrams](#bigram)

## Negative topics
- [get negative words](#negWords)
- [fetch negative sentences](#negSentences)
- [topic analysis on negative sentences](#topicModelling)
- [negative topics for body types](#negBt)

<a id="mongo"></a>
### Fetch reviews from Mongo

In [278]:
import pandas as pd
import pickle
import numpy as np
from sklearn.metrics import pairwise_distances
from pymongo import MongoClient
import nltk
import gensim
import string
from nltk.corpus import stopwords

In [2]:
# Connect to the remote MongoDB host and pick the collection of reviews
# inside the `fletcher` database.
client = MongoClient(host='ec2-34-198-179-91.compute-1.amazonaws.com', port=27017)
db = client['fletcher']
rev_col = db['rtr_reviews']

In [406]:
# Fetch every document, projecting only the `review` field, into a DataFrame.
review_cursor = rev_col.find({}, {'review': 1, '_id': 0})
df_rev = pd.DataFrame(list(review_cursor))

In [316]:
# Fetch every document, projecting only the `url` field, into a DataFrame.
# NOTE(review): rows are matched to df_rev by position later on, which
# presumably relies on both queries returning documents in the same order.
url_cursor = rev_col.find({}, {'url': 1, '_id': 0})
df_dress = pd.DataFrame(list(url_cursor))

<a id="preprocessing"></a>
### Preprocessing

In [4]:
# English stop-word list from NLTK; it is filtered out of the tokenised reviews.
sw = stopwords.words('english')

In [5]:
# Keep 'too' in the corpus: phrases such as "too long" / "too short" carry the
# fit complaints this analysis is looking for.
# Guarded so that re-running the cell does not raise ValueError once 'too'
# has already been removed.
if 'too' in sw:
    sw.remove('too')

In [7]:
# Lower-case the reviews and turn hyphens and periods into spaces so that the
# tokenizer splits words joined by them.
tokens = (
    df_rev.review
    .str.lower()
    .str.replace('-', ' ')
    .str.replace('.', ' ')
)

In [8]:
# Split every review string into a list of word tokens with NLTK.
tokens = tokens.apply(nltk.word_tokenize)

In [9]:
# Build the training corpus: one list of tokens per review, without stop words
# and without tokens that start with a punctuation character.
# `sw` is a list, so membership is checked against sets built once here
# instead of scanning the list for every token.
_stop_words = frozenset(sw)
_punctuation = frozenset(string.punctuation)
texts = [
    [word for word in document
     if word not in _stop_words and word[0] not in _punctuation]
    for document in tokens
]

<a id="model"></a>
### Word 2 Vec Model

In [253]:
# Train a skip-gram (sg=1) Word2Vec model on the unigram corpus.
model = gensim.models.Word2Vec(
    texts,
    size=100,
    window=5,
    min_count=1,
    workers=2,
    sg=1,
)

<a id="bigram"> </a>
### Word 2 Vec with bigram

In [10]:
# Fit a phrase detector on the tokenised reviews; indexing it with a corpus
# (see the bigram model cell) yields the corpus with detected phrases merged.
bigram_transformer = gensim.models.Phrases(texts)

In [11]:
# Train a second skip-gram model, this time on the phrase-merged corpus.
bigram_corpus = bigram_transformer[texts]
bigram_model = gensim.models.Word2Vec(
    bigram_corpus,
    size=100,
    window=5,
    min_count=1,
    workers=2,
    sg=1,
)



In [12]:
# Persist the bigram model. The handle is opened in a `with` block so it is
# flushed and closed once the model has been written (the inline `open(...)`
# previously left it open).
with open('../data/bigram_word2vec.bin', 'wb') as model_file:
    bigram_model.save(model_file)

## Negative Topics Analysis
<a id="negWords"></a>
### Fetching negative words 

In [254]:
# Seed terms; their nearest Word2Vec neighbours become the problem vocabulary.
negative_words = [
    'problem',
    'con',
    'issue',
]

In [257]:
# Expand the seed terms with the 10 most similar words of each seed according
# to the unigram Word2Vec model; the seeds themselves are part of the set.
problem_words = set(negative_words)
for word in negative_words:
    neighbours = model.most_similar(word, topn=10)
    # each hit is indexed at 0 to take the neighbouring word
    problem_words.update(hit[0] for hit in neighbours)

In [258]:
# Display the expanded problem vocabulary for a manual sanity check.
problem_words

{'adjusted',
 'aspect',
 'besides',
 'complaint',
 'complaints',
 'con',
 'concern',
 'concerns',
 'doable',
 'downfall',
 'downside',
 'drawback',
 'flaw',
 'issue',
 'issues',
 'manageable',
 'picky',
 'problem',
 'problematic',
 'problems',
 'ridiculously',
 'stubborn',
 'thats',
 'tolerable',
 'trouble'}

<a id="negSentences"></a>
## Filter out the negative sentences 
- Count vectorizer 
- Then TF-IDF followed by matrix factorization (NMF) to find the topic words

In [410]:
# Split every review into sentences and stack them into one long frame.
# The `index` column stores the position of the source review so sentences
# can be joined back to per-review data later.
sentences = df_rev.review.apply(nltk.sent_tokenize)
sentence_frames = [
    pd.DataFrame({'review': review_sentences, 'index': review_pos})
    for review_pos, review_sentences in enumerate(sentences)
]
df_sent = pd.concat(sentence_frames, ignore_index=True)

### Cleaning

In [416]:
# Replace sentence punctuation and hyphens with spaces, one character at a time.
for separator in ('.', ',', '-', '!'):
    df_sent.review = df_sent.review.str.replace(separator, ' ')

In [281]:
from sklearn.feature_extraction.text import CountVectorizer

In [636]:
# Blank out negated problem phrases ("no issue", "no problem", ...) so they are
# not counted as complaints by the vocabulary counter.
# NOTE(review): the match is case-sensitive and the sentences are not
# lower-cased here, so a sentence-initial "No issue" stays in place - confirm
# whether that is intended.
negated_terms = ['issue', 'problem', 'complaint', 'trouble']
for word in negated_terms:
    negated_phrase = 'no ' + word
    df_sent.review = df_sent.review.str.replace(negated_phrase, ' ')

In [639]:
# put the problem words as CountVectorizer's vocabulary
# With a fixed vocabulary only the problem words are counted; the result has
# one row per sentence and one column per problem word.
cv = CountVectorizer(vocabulary=problem_words)
count_vector = cv.fit_transform(df_sent.review)

In [640]:
# Densify the count matrix into a DataFrame whose columns are the problem words.
problem_terms = cv.get_feature_names()
df_problem = pd.DataFrame(count_vector.toarray(), columns=problem_terms)

In [641]:
# Total number of problem-word hits per sentence.
problem_hits = df_problem.sum(axis=1)
df_problem['sum'] = problem_hits
df_sent['sum'] = df_problem['sum']
# Keep only sentences mentioning at least one problem word, then attach the
# dress URL by looking up the sentence's source-review position in df_dress.
mentions_problem = df_sent['sum'] > 0
df_problem_sent = df_sent[mentions_problem]
df_problem_sent = df_problem_sent.join(df_dress, on='index')

<a id="topicModelling"></a>
## Topic modelling on negative sentences

In [426]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [646]:
# Stop words for topic modelling: the regular stop words, the problem
# vocabulary (present in every selected sentence, so it would dominate the
# topics) and the word 'dress'.
problem_sw = set(sw) | set(problem_words) | {'dress'}

In [647]:
# TF-IDF over unigrams and bigrams of the problem sentences.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=problem_sw,
    ngram_range=(1, 2),
)
problem_sentences = df_problem_sent.review
tfidf = tfidf_vectorizer.fit_transform(problem_sentences)

In [456]:
def print_top_words(model, feature_names, n):
    """Print the `n` highest-weighted feature names of every topic.

    `model.components_` is iterated row by row; each row is paired with
    `feature_names` and ranked by weight, largest first. For every topic a
    "Topic #<idx>:" header is printed followed by the list of top names.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        ranked = sorted(zip(feature_names, weights),
                        key=lambda pair: pair[1],
                        reverse=True)
        top_names = [name for name, _ in ranked[:n]]
        print(top_names)
# NOTE(review): this call sits outside the function, so it emits one blank
# line when the cell is run rather than after each call - confirm intent.
print()




In [698]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 20 topics; `topic_vector` holds one row of
# topic weights per problem sentence.
nmf = NMF(
    n_components=20,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
topic_vector = nmf.fit_transform(tfidf)

In [699]:
# Print the 20 highest-weighted terms of each NMF topic for manual labelling.
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 20)

Topic #0:
['length', 'biggest length', 'biggest', 'perfect', 'heels length', 'length heels', 'short length', 'heels', 'length long', 'gown', 'length perfect', 'regular length', 'think length', 'size length', 'length too', 'one length', 'minor length', 'would', 'regular', 'would length']
Topic #1:
['zipper', 'zipper little', 'sticky', 'get', 'tough', 'sticky zipper', 'stuck', 'also zipper', 'mentioned', 'also', 'zipper tough', 'getting', 'getting zipper', 'zipper worked', 'side', 'way', 'lot zipper', 'otherwise', 'got', 'zipped']
Topic #2:
['long', 'too long', 'too', 'little long', 'sleeves', 'long enough', 'bit long', 'bit too', 'even', 'way too', 'long frame', 'frame', 'way', 'enough', 'tad', 'straps', 'heels', 'quite long', 'perfect', 'sleeves long']
Topic #3:
['sequins', 'arm', 'scratchy', 'itchy', 'sequins arms', 'underarms', 'sequins little', 'rub', 'skin', 'uncomfortable', 'scratching', 'sequins rub', 'sequins rubbing', 'hair', 'itchy sequins', 'rubbing', 'would', 'little scratch

- Topic #1:
['zipper', 'zipper little', 'sticky', 'get', 'tough', 'sticky zipper', 'stuck', 'also zipper', 'mentioned', 'also', 'zipper tough', 'getting', 'getting zipper', 'zipper worked', 'side', 'way', 'lot zipper', 'otherwise', 'got', 'zipped']
- Topic #2:
['long', 'too long', 'too', 'little long', 'sleeves', 'long enough', 'bit long', 'bit too', 'even', 'way too', 'long frame', 'frame', 'way', 'enough', 'tad', 'straps', 'heels', 'quite long', 'perfect', 'sleeves long']
- Topic #4:
['bra', 'back', 'wear bra', 'low', 'strapless', 'strapless bra', 'low back', 'back low', 'regular bra', 'wore', 'normal bra', 'without bra', 'normal', 'open', 'open back', 'bra back', 'regular', 'show', 'back bra', 'lower']
- Topic #5:
['short', 'too short', 'too', 'little too', 'little short', 'bit short', 'pretty short', 'short heels', 'short front', 'pretty', 'short length', 'length too', 'heels too', 'heels', 'too much', 'front', 'tall', 'much', 'short riding', 'regular']', 'night', 'really stretched', 'cut really', 'really comfortable', 'one', 'great really', 'material', 'run really']
- Topic #8:
['getting', 'kept', 'kept getting', 'hair', 'caught', 'getting caught', 'stuck', 'hair kept', 'getting stuck', 'caught sequins', 'wore hair', 'stuck sequins', 'zipper kept', 'lace', 'wore', 'sequins getting', 'back', 'get', 'get caught', 'night']
- Topic #18:
['low', 'cut', 'low cut', 'little low', 'front', 'pin', 'safety', 'cut front', 'safety pin', 'cut chest', 'chest', 'back low', 'cut really', 'neck', 'pretty low', 'taste', 'fixed', 'cleavage', 'cut low', 'front cut']

In [702]:
# Keep only the topics that correspond to a recognisable dress problem and
# give them understandable column names.
# The names are tied to topic ids explicitly. Renaming by position labelled
# topic 5 (short / too short) as 'sequins_prob' and topic 8 (hair getting
# caught in sequins) as 'too_short', i.e. the two labels were swapped.
topic_labels = {
    1: 'zipper',
    2: 'too_long',
    4: 'bra_prob',
    5: 'too_short',
    8: 'sequins_prob',
    18: 'low_cut',
}
df_topics = pd.DataFrame(topic_vector)
good_topics = set(topic_labels)
# remove bad topics
bad_topics = set(range(df_topics.shape[1])) - good_topics
df_topics = df_topics[sorted(good_topics)]
# rename to understandable column names
df_topics = df_topics.rename(columns=topic_labels)

In [None]:
# associate the topic probabilities to the sentences
# drop=True discards the old row labels: the frame already has an 'index'
# column (position of the source review), so a plain reset_index() would add a
# stray 'level_0' column that then leaks into the per-dress averages.
df_problem_sent = df_problem_sent.reset_index(drop=True).join(df_topics)
# The dress URL is already attached when the problem sentences are first
# selected; joining it a second time raises on the overlapping 'url' column,
# so only join when it is missing.
if 'url' not in df_problem_sent.columns:
    df_problem_sent = df_problem_sent.join(df_dress, on='index')

In [663]:
# Average every column per dress URL, then discard the bookkeeping columns
# (source-review position and problem-word count) so only topic scores remain.
per_dress_mean = df_problem_sent.groupby('url').mean()
df_problems = per_dress_mean.drop(['index', 'sum'], axis=1)

In [713]:
# Scale every topic column by its maximum, keep the dress URLs as the index,
# and write the result to disk.
from sklearn.preprocessing import normalize

scaled_problems = normalize(df_problems, axis=0, norm='max')
df_norm = pd.DataFrame(
    scaled_problems,
    columns=df_problems.columns,
    index=df_problems.index,
)
df_norm.to_csv('../data/dress_problems.csv')

<a id="negBt"></a>
### Split negative topics per body type

In [502]:
# topics that are body type related
body_topics = ['too_long', 'too_short', 'low_cut']
# Attach the measurement data to each problem sentence via the source-review
# position and drop rows with any missing value.
# NOTE(review): presumably measurement_data.csv is indexed by the same review
# position and provides the `kmean_label_2` cluster column - confirm.
df_body = pd.read_csv('../data/measurement_data.csv', index_col=0)
with_measurements = df_problem_sent.join(df_body, on='index')
df_body_problems = with_measurements.dropna()

In [744]:
# Keep the grouping keys plus the body-related topic scores, then average the
# scores for every (dress, cluster) pair.
group_keys = ['url', 'kmean_label_2']
df_body_problems = df_body_problems[group_keys + body_topics]
# group by dress and cluster
df_body_problems = df_body_problems.groupby(group_keys, as_index=False).mean()

In [749]:
# Sum the topic scores (every column after the two grouping keys) into a
# single problem score per (dress, cluster) row.
df_body_problems['total_prob'] = df_body_problems.iloc[:, 2:].sum(axis=1)
# Reshape to one row per cluster and one column per dress. The arguments are
# passed by keyword because newer pandas releases no longer accept them
# positionally; the keyword form works on older releases as well.
df_body_problems = df_body_problems.pivot(
    index='kmean_label_2',
    columns='url',
    values='total_prob',
)

In [751]:
# fill nans with 0s
df_body_problems = df_body_problems.fillna(0)
# Scale every dress column by its maximum. The cluster labels are passed back
# as the index: `normalize` returns a bare array, and without `index=` the
# rows would be relabelled 0..k-1, losing the cluster labels in the saved CSV
# (the per-dress normalisation earlier restores its index the same way).
df_body_problems = pd.DataFrame(
    normalize(df_body_problems, axis=0, norm='max'),
    index=df_body_problems.index,
    columns=df_body_problems.columns,
)

In [753]:
# Write the cluster-by-dress problem scores to disk.
df_body_problems.to_csv('../data/dress_features_bt.csv')