In [67]:
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import scale
from sklearn.datasets import fetch_mldata
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
import pandas as pd
import numpy as np
from seaborn import plt
import matplotlib.pyplot as mplt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import time

In [2]:
# Open a MongoDB connection and keep handles to the "fletcher" database and
# its two collections used below.
# NOTE(review): the host name is hard-coded; consider moving it to configuration.
client = MongoClient(host='ec2-34-198-179-91.compute-1.amazonaws.com', port=27017)
db = client['fletcher']
dress_col = db['rtr_dresses']
rev_col = db['rtr_reviews']

In [9]:
# Pull every document from the reviews collection, projecting only the
# "review" and "title" fields (the Mongo "_id" is excluded).
cur = rev_col.find(
    {},
    {"review": 1, "title": 1, "_id": 0},
)
rev_df = pd.DataFrame([doc for doc in cur])

In [11]:
rev_df.columns

Index(['review', 'title'], dtype='object')

In [14]:
rev_df.head(5)

Unnamed: 0,review,title
0,"Fits true to size. I'm 145 lb, 5'1"" and the 10...",Stunning dress
1,I wish I could have gotten the 16L. I am 5'9 a...,Stunning dress! Super comfortable and I got lo...
2,I loved this dress so much! I got lots of comp...,Lots of compliments all night!
3,Great dress to highlight your waist with more ...,Comfortable and beautiful back detail
4,So many compliments on this dress. Would wear ...,GORGEOUS


In [48]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-term weights.
    feature_names : sequence
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic.

    Prints a ``Topic #k:`` header and a comma-separated term list for each
    topic, followed by one blank line after the last topic.
    """
    topic_no = 0
    for weights in model.components_:
        # argsort is ascending, so walk it backwards to get heaviest terms first.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[j] for j in ranked[:n_top_words]]
        print("Topic #%d:" % topic_no)
        print(", ".join(top_terms))
        topic_no += 1
    print()

In [187]:
def get_tfidf_and_tf(text, stopwords, max_df=0.90, min_df=0.001, ngram=(2,2), vocab=None):
    """Vectorize ``text`` twice: as TF-IDF weights and as raw term counts.

    Parameters
    ----------
    text
        Raw documents handed to each vectorizer's ``fit_transform``.
    stopwords
        Forwarded as ``stop_words`` to both vectorizers.
    max_df, min_df
        Document-frequency bounds forwarded to both vectorizers.
    ngram : tuple
        Forwarded as ``ngram_range`` to both vectorizers.
    vocab : optional
        Fixed vocabulary, forwarded to the TF-IDF vectorizer only.

    Returns
    -------
    tuple
        ``(tfidf, tfidf_vectorizer, tf, tf_vectorizer)``: the two fitted
        vectorizers and the matrices they produced.

    Elapsed time of each vectorization is printed.
    """
    # Use the caller-supplied stop words. The previous body ignored this
    # argument and read a global named `sw`, which fails on a fresh kernel.
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                       ngram_range=ngram,
                                       stop_words=stopwords, vocabulary=vocab)
    t0 = time.time()
    tfidf = tfidf_vectorizer.fit_transform(text)
    print("done in %0.3fs." % (time.time() - t0))

    # Use tf (raw term count) features for LDA.
    # NOTE(review): `vocab` is not passed to this vectorizer, so its feature
    # set can differ from the TF-IDF one; confirm that this is intended.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    ngram_range=ngram,
                                    stop_words=stopwords)
    t0 = time.time()
    tf = tf_vectorizer.fit_transform(text)
    print("done in %0.3fs." % (time.time() - t0))
    return tfidf, tfidf_vectorizer, tf, tf_vectorizer

In [163]:
import nltk
from nltk import word_tokenize

In [73]:
# Split each review into a list of sentences.
sentences = rev_df['review'].apply(sent_tokenize)

In [105]:
# One row per sentence; "index" is the position of the review the sentence
# came from. Both columns are built in a single pass instead of creating one
# temporary DataFrame per review, which is slow for many reviews and makes
# pd.concat raise when there are no reviews at all.
df_sent = pd.DataFrame({
    'review': [sent for sents in sentences for sent in sents],
    'index': [i for i, sents in enumerate(sentences) for _ in sents],
})

## max df = 0.05
### 5 Topics

In [149]:
# Unigrams and bigrams; max_df=0.05 drops terms that occur in more than 5% of
# the sentences.
# NOTE(review): `sw` is not defined in any cell shown in this notebook; it
# must already exist in the kernel before this cell runs.
tfidf, tfidf_vectorizer, tf, tf_vectorizer = get_tfidf_and_tf(
    df_sent.review,
    sw,
    min_df=0,
    max_df=0.05,
    ngram=(1, 2),
)

done in 2.248s.
Extracting tf features for LDA...
done in 1.778s.


In [150]:
# Factorize the TF-IDF matrix into 5 NMF topics, timing the fit.
n_topics = 5
n_top_words = 20
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# List the 20 highest-weighted terms of each topic.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

done in 6.894s.

Topics in NMF model:
Topic #0:
loved, absolutely loved, absolutely, loved fit, loved pockets, pockets, overall loved, loved everything, overall, loved much, everything, everyone loved, loved wearing, everyone, much, loved would, loved loved, loved got, really loved, wearing
Topic #1:
many, many compliments, received, got, received many, got many, night, compliments night, received compliments, got compliments, night long, compliments throughout, throughout, felt, long, tons, compliments felt, tons compliments, throughout night, lots compliments
Topic #2:
true size, true, fit true, fits true, fits, runs true, runs, pretty true, size comfortable, pretty, ran true, ran, comfortable true, flattering, definitely true, perfect true, size flattering, perfectly, size little, size fit
Topic #3:
rent, definitely, definitely rent, would definitely, recommend, definitely recommend, highly, highly recommend, would rent, runway, rent runway, would recommend, would highly, rtr, renti

## Topic breakdown
1. Topic 0 = Loved the dress
2. Topic 1 = Received a lot of compliments
3. Topic 2 = Good fit, true to size.
4. Topic 3 = Would definitely rent again or recommend.
5. Topic 4 = Dress was beautiful.


### 9 Topics

In [134]:
# Fit the NMF model with 9 topics on the TF-IDF matrix, timing the fit.
n_topics = 9
n_top_words = 20
t0 = time.time()
# Pass the matrix itself: the previous `.fit({tfidf})` wrapped it in a set
# literal, which is not a valid input for NMF.fit.
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# List the 20 highest-weighted terms of each topic.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

done in 11.397s.

Topics in NMF model:
Topic #0:
loved, absolutely loved, absolutely, loved fit, loved pockets, pockets, overall loved, loved everything, overall, loved much, everything, everyone loved, loved wearing, everyone, much, loved would, loved loved, really loved, loved got, wearing
Topic #1:
many, many compliments, received many, got many, received, got, compliments throughout, throughout, compliments felt, throughout night, many complements, never received, never, complements, felt, loved got, wedding, comfortable received, loved received, throughout evening
Topic #2:
true size, true, fit true, fits true, fits, runs true, runs, pretty true, size comfortable, pretty, ran true, ran, comfortable true, flattering, definitely true, perfect true, size flattering, size little, great true, size perfect
Topic #3:
rent, definitely, definitely rent, would definitely, would rent, definitely recommend, runway, rent runway, recommend, wait rent, rtr, wait, rent one, first, loved would, de

## Topic breakdown
1. Topic 0 = Loved the dress. Pockets make people happy.
2. Topic 1 = Received a lot of compliments
3. Topic 2 = Good fit, true to size. Flattering fit.
4. Topic 3 = Would definitely rent again or recommend.
5. Topic 4 = Dress was beautiful.
6. Topic 5 = Perfect fit
7. Topic 6 = Highly recommend
8. Topic 7 = Got compliments all night long.
9. Topic 8 = Great fit, and comfortable

**5 is the better topic number. Topics 5–8 seem to repeat the earlier topics.**



## max df = 0.02
### 15 Topics

In [152]:
# Unigrams only; max_df=0.02 drops terms that occur in more than 2% of the
# sentences.
# NOTE(review): `sw` is not defined in any cell shown in this notebook; it
# must already exist in the kernel before this cell runs.
tfidf, tfidf_vectorizer, tf, tf_vectorizer = get_tfidf_and_tf(
    df_sent.review,
    sw,
    min_df=0,
    max_df=0.02,
    ngram=(1, 1),
)

done in 0.911s.
Extracting tf features for LDA...
done in 1.169s.


In [153]:
# Factorize the TF-IDF matrix into 15 NMF topics, timing the fit.
n_topics = 15
n_top_words = 20
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# List the 20 highest-weighted terms of each topic.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

done in 5.774s.

Topics in NMF model:
Topic #0:
overall, happy, though, rental, wonderful, experience, good, liked, awesome, occasion, choice, snug, ton, tons, look, nice, pleased, totally, fantastic, lots
Topic #1:
absolutely, stunning, renting, everything, fabulous, adored, person, tons, birthday, bachelorette, everyone, formal, gown, look, otherwise, anyone, million, throughout, cons, ton
Topic #2:
amazing, looked, looks, look, feel, truly, pictures, photos, service, sparkle, everyone, experience, quality, simply, otherwise, customer, tons, makes, worth, everything
Topic #3:
love, fell, sparkle, everything, sparkles, wanted, want, however, runway, style, buy, longer, fall, handles, another, though, put, thing, dresses, way
Topic #4:
runs, large, small, ran, chest, run, bust, reviews, normally, medium, say, top, said, area, smaller, slightly, usually, read, busted, typically
Topic #5:
gorgeous, sparkly, gown, elegant, person, tons, fits, otherwise, feel, look, classy, photographs, ne

In [121]:
# Column labels "topic0" ... "topic<n_topics-1>" for the document-topic matrix.
topics = ['topic%d' % i for i in range(n_topics)]

In [122]:
# Per-sentence topic weights, one column per topic.
# NOTE(review): fit_transform re-fits `nmf` on `tfidf` rather than reusing the
# fit from the earlier cell.
df_topics = pd.DataFrame(
    data=nmf.fit_transform(tfidf),
    columns=topics,
)

In [123]:
# Show each sentence next to its topic weights (rows are aligned on the index).
df_sent.join(df_topics, how='left')

Unnamed: 0,index,review,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,...,topic10,topic11,topic12,topic13,topic14,topic15,topic16,topic17,topic18,topic19
0,0,Fits true to size.,0.000000,0.000000,0.000000,0.085321,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,"I'm 145 lb, 5'1"" and the 10R fit nice except i...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000583,...,0.001056,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.017189,0.008399
2,0,Got compliments from wedding guests I didn't e...,0.000000,0.013094,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.018549,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.003800,0.000000
3,1,I wish I could have gotten the 16L.,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.003182,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,1,I am 5'9 and the 16R was a tad short with my h...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.015263,0.000000
5,1,Other than that I would recommend this dress f...,0.000000,0.000000,0.000000,0.000000,0.007487,0.000000,0.000000,0.000000,...,0.000000,0.003343,0.0,0.0,0.0,0.000000,0.000000,0.007110,0.000000,0.000205
6,2,I loved this dress so much!,0.046014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000507,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
7,2,I got lots of compliments and it was very comf...,0.000000,0.022825,0.000000,0.000000,0.000000,0.000000,0.030004,0.000000,...,0.032353,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
8,2,I wore a 2R but next time I would definitely g...,0.000000,0.000000,0.000000,0.000000,0.031354,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.012508,0.000000
9,2,Even with 3 inch heels I had to carry it every...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.021571,0.000000


In [196]:
from textblob import TextBlob
def get_nouns(text):
    """Return the ``noun_phrases`` that TextBlob extracts from ``text``."""
    blob = TextBlob(text)
    return blob.noun_phrases

In [198]:
# Noun phrases of every review (one collection of phrases per review).
df_nouns = rev_df.review.apply(get_nouns)

In [201]:
# Collect every distinct noun phrase found across all reviews.
# A plain loop replaces the former `df_nouns.apply(noun_phrases.update)`,
# which used Series.apply only for its side effect and bound a throwaway
# Series of None values to `null`.
noun_phrases = set()
for phrases in df_nouns:
    noun_phrases.update(phrases)

In [218]:
# 2- to 4-grams, with the TF-IDF vocabulary restricted to the noun phrases
# collected above.
# NOTE(review): per the scikit-learn docs, max_df/min_df are ignored when a
# vocabulary is given, so they only affect the count (tf) vectorizer here.
# NOTE(review): `sw` is not defined in any cell shown in this notebook; it
# must already exist in the kernel before this cell runs.
tfidf, tfidf_vectorizer, tf, tf_vectorizer = get_tfidf_and_tf(
    df_sent.review,
    sw,
    min_df=0.00001,
    max_df=0.01,
    ngram=(2, 4),
    vocab=noun_phrases,
)

done in 1.664s.
Extracting tf features for LDA...
done in 3.920s.


In [220]:
# Factorize the noun-phrase TF-IDF matrix into 8 NMF topics, timing the fit.
n_topics = 8
n_top_words = 20
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# List the 20 highest-weighted terms of each topic.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

done in 3.375s.

Topics in NMF model:
Topic #0:
true size, size smaller, rib cage, size runs, tiny bit, right places, normal bra, fit runs, chest area, show stopper, normal size, backup size, high heels, beautiful person, short side, regular size, fit right, wiggle room, size petite, bit snug
Topic #1:
fit perfectly, right places, normal size, reading reviews, comfortable night, inch heel, usual size, dance night, regular size, short heels, holiday party, normal bra, backless bra, bit snug, badgley mischka, size 2r, safety pin, black tie, breast area, broad shoulders
Topic #2:
highly recommend, formal event, special occasion, black tie event, black tie, overall loved, formal affair, formal occasion, body type, backup size, backless bra, comfortable night, normal size, body types, formal wedding, great choice, badgley mischka, hourglass shape, size two, black dresses
Topic #3:
inch heels, regular length, long length, floor length, good length, right length, heels taller, perfect height,

## Noun phrases vocabulary 
I like these words!
1. Topic 0 = Size 
true size, size smaller, rib cage, size runs, tiny bit, right places, normal bra, fit runs, chest area, show stopper, normal size, backup size, high heels, beautiful person, short side, regular size, fit right, wiggle room, size petite, bit snug
2. Topic 1 = Fit
fit perfectly, right places, normal size, reading reviews, comfortable night, inch heel, usual size, dance night, regular size, short heels, holiday party, normal bra, backless bra, bit snug, badgley mischka, size 2r, safety pin, black tie, breast area, broad shoulders
3. Topic 2 = Event, special occasion?
highly recommend, formal event, special occasion, black tie event, black tie, overall loved, formal affair, formal occasion, body type, backup size, backless bra, comfortable night, normal size, body types, formal wedding, great choice, badgley mischka, hourglass shape, size two, black dresses
4. Topic 3 = Fit
inch heels, regular length, long length, floor length, good length, right length, heels taller, perfect height, long dresses, small train, bit shorter, size petite, inch loner, rtr experience, great length, comfortable night, normal size, short heels, perfect amount, perfect size
5. Topic 4 = Gold?
rose gold, gold color, gold gold, yellow gold, beautiful rose, true gold, gold jewelry, gold sequins, skin tone, gold heels, great rose, gorgeous rose, gold accessories, gold earrings, pretty rose, regular gold, gold shoes, champagne color, gold tint, true rose

6. Topic 5 = Bra problems
regular bra, back strap, bra straps, bra extender, regular bra straps, fashion tape, backless bra, bra strap, low bra, bra issues, wide straps, major concern, wardrobe tape, large chest, back high, safety pin, great length, low neckline, bra show, plunge bra

7. Topic 6 = Length, heels
perfect length, inch heel, regular length, right amount, high heels, shorter side, petite size, tall heels, tall girl, flat shoes, 4in heels, length size, right areas, body type, strapless bra, dance night, christmas party, right places, mid thigh, small waist
8. Topic 7 = Overall feelings
absolutely loved, got tons, rtr experience, bachelorette party, christmas party, birthday party, formal wedding, wedding reception, countless compliments, absolutely amazing, slight train, sure rent, bit stretchy, formal gowns, loved rent, minor complaint, hotel wedding, formal function, crowd pleaser, bachorlette party

In [228]:
# Unigrams and bigrams restricted to very rare terms: document frequency
# between 0.001% and 0.01% of the sentences.
# NOTE(review): `sw` is not defined in any cell shown in this notebook; it
# must already exist in the kernel before this cell runs.
tfidf, tfidf_vectorizer, tf, tf_vectorizer = get_tfidf_and_tf(
    df_sent.review,
    sw,
    min_df=0.00001,
    max_df=0.0001,
    ngram=(1, 2),
)

done in 1.960s.
Extracting tf features for LDA...
done in 1.754s.


In [229]:
# Factorize the rare-term TF-IDF matrix into 8 NMF topics, timing the fit.
n_topics = 8
n_top_words = 20
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# List the 20 highest-weighted terms of each topic.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

done in 1.625s.

Topics in NMF model:
Topic #0:
pretty runs, handy bust, zoom display, flows hide, flows nicely, flows perfectly, flows runs, flows walk, flowy bottom, flowy enough, flowy flattering, flowy fun, flowy gave, flowy huge, flowy order, flowy part, flowy poofy, flowy rarely, flowy self, flowy silhouette
Topic #1:
comfy flattering, followed recommendations, flows hide, flows hips, flows nicely, flows perfectly, flows runs, flows walk, flowy bottom, flowy enough, flowy flattering, flowy fun, flowy gave, flowy huge, flowy order, flowy part, flowy poofy, flowy rarely, flowy self, flowy silhouette
Topic #2:
back pockets, flattering true, zoom display, flowy huge, flows perfectly, flows runs, flows walk, flowy bottom, flowy enough, flowy flattering, flowy fun, flowy gave, flowy order, flows hips, flowy part, flowy poofy, flowy rarely, flowy self, flowy silhouette, flowy skirt
Topic #3:
definitely rtr, little downsize, downsize sequins, zoom display, flowy gave, flows nicely, flows

## Thoughts 
- Select which topics are dress related vs people related.
- Cluster on similar dresses by the dress features.
- Rate people on how much they love the dress. If they love the dress, they will love similar dresses too.
- If they input body data, recommend what people with the same body cluster love. 