## Topic Modelling 
- [helper functions](#helperFunctions)

### [Sentence Tokenizing](#sentTokenizing)

### [Cleaning](#cleaning)

### [NMF](#nmf)
- [5 topics](#nmf-5topics)
    - [topic breakdown](#topicBreakdown1)
- [4 topics](#nmf-4topics)
    - [topic breakdown](#topicBreakdown2)
    
### [NMF With Nouns](#nmfNouns)
- [model 1](#nmfNouns1)
    - [topic analysis](#nouns-ta)

### [Polarized topics](#polarized)

### [Unpolarized topics](#unpolarized)

In [10]:
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import scale
from sklearn.datasets import fetch_mldata
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
import pandas as pd
import numpy as np
from seaborn import plt
import matplotlib.pyplot as mplt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import time
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [2]:
client = MongoClient('ec2-34-198-179-91.compute-1.amazonaws.com', 27017)
db = client.fletcher
dress_col = db.rtr_dresses
rev_col = db.rtr_reviews

In [3]:
cur = rev_col.find({}, {"review":1, "title":1,"_id":0})
rev_df = pd.DataFrame(list(cur))

In [4]:
rev_df.columns

Index(['review', 'title'], dtype='object')

<a id="helperFunctions"></a>
## Helper Functions

In [46]:
def print_top_words(model, feature_names, n):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print ([word[0] for word in (sorted(list(zip(feature_names, topic)), key=lambda x: x[1], reverse=True)[:n])])
print()




In [60]:
def get_tfidf(text, stopwords, max_df=0.90, min_df=0.001, ngram=(2,2), vocab=None):
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                       ngram_range=ngram,
                                       stop_words=sw, vocabulary = vocab)
    t0 = time.time()
    tfidf = tfidf_vectorizer.fit_transform(text)
    return tfidf, tfidf_vectorizer

<a id="sentTokenizing"></a>
## Sentence Tokenization

In [7]:
sentences = rev_df.review.apply(sent_tokenize)

In [8]:
df_sent = pd.concat([pd.DataFrame({'review': x, 'index': i}) for i,x in enumerate(sentences)], ignore_index=True)

In [22]:
stemmer = SnowballStemmer('english')

In [16]:
df_sent.review = df_sent.review.str.lower()
df_sent.review = df_sent.review.str.replace(',', ' ')
df_sent.review = df_sent.review.str.replace('.', ' ')
df_sent.review = df_sent.review.str.replace('!', ' ')

In [31]:
def stem_sent(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [32]:
stemmed = df_sent.review.apply(stem_sent)

In [35]:
df_sent.review = stemmed

In [58]:
# df_sent.to_csv('../data/stemmed_reviews.csv')

<a id="cleaning"></a>
## Cleaning

<a id="nmf"></a>
## NMF

In [59]:
sw = stopwords.words('english')
tfidf, tfidf_vectorizer, tf, tf_vectorizer = get_tfidf_and_tf(df_sent.review, sw, min_df=0, max_df=0.5, ngram=(1,2))

done in 5.659s.
Extracting tf features for LDA...
done in 6.693s.


<a id="nmf-5topics"></a>
### 5 Topics

In [None]:
n_topics = 5
n_top_words = 20
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [48]:
print_top_words(nmf, tfidf_feature_names, 10)

Topic #0:
['love', 'love dress', 'dress', 'absolut love', 'absolut', 'love love', 'overal love', 'overal', 'love wear', 'beauti']
Topic #1:
['perfect', 'fit', 'fit perfect', 'dress fit', 'dress', 'length', 'great', 'dress perfect', 'fit great', 'length perfect']
Topic #2:
['compliment', 'mani compliment', 'mani', 'got', 'receiv', 'receiv mani', 'got mani', 'night', 'compliment night', 'compliment dress']
Topic #3:
['size', 'true size', 'true', 'fit true', 'fit', 'veri', 'comfort', 'dress fit', 'dress', 'veri comfort']
Topic #4:
['rent', 'would', 'definit', 'dress', 'recommend', 'definit rent', 'would definit', 'rent dress', 'high recommend', 'would rent']


<a id="topicBreakdown1"></a>
### Topic breakdown
1. Topic 0 = Loved the dress
2. Topic 1 = Great fit
3. Topic 2 = Receive a lot of compliments
4. Topic 3 = True to size
5. Topic 4 = Would definitely recommend


<a id="nmfNouns"></a>
## NMF With Nouns

In [49]:
def noun(s):
    return ' '.join([word[0] for word in nltk.pos_tag(s) if word[1] == 'NN' or word[1] == 'NNS'])

In [50]:
word_tokens = df_sent.review.apply(nltk.word_tokenize)
nouns = word_tokens.apply(noun)

In [51]:
nouns.to_csv('../data/stemmed_nouns_per_sentence.csv')

In [52]:
df_sent['nouns'] = nouns

<a id="nmfNouns1"></a>
### Max df = 0.5

In [62]:
tfidf, tfidf_vectorizer = get_tfidf(df_sent.nouns, sw, min_df=0, max_df=0.5, ngram=(1, 3))

In [63]:
# Fit the NMF model
n_topics = 30
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

done in 211.225s.

Topics in NMF model:


In [64]:
print_top_words(nmf, tfidf_feature_names, 10)

Topic #0:
['dress', 'dress fit', 'dress size', 'dress comfort', 'rent dress', 'dress dress', 'dress compliment', 'size dress', 'heel dress', 'beauti dress']
Topic #1:
['size', 'dress size', 'size size', 'order size', 'size fit', 'size dress', 'backup', 'size backup', 'order size size', 'size order']
Topic #2:
['fit', 'dress fit', 'size fit', 'fit dress', 'fit perfect', 'fit comfort', 'fit well', 'well', 'fit veri', 'fit flatter']
Topic #3:
['compliment', 'compliment night', 'dress compliment', 'receiv', 'receiv compliment', 'mani compliment', 'mani', 'lot compliment', 'dress compliment night', 'comfort compliment']
Topic #4:
['heel', 'length heel', 'heel length', 'heel dress', 'floor', 'dress heel', 'heel floor', 'ground', 'order heel', 'size heel']
Topic #5:
['comfort', 'dress comfort', 'comfort night', 'comfort dress', 'veri comfort', 'comfort easi', 'easi', 'fit comfort', 'size comfort', 'comfort flatter']
Topic #6:
['order', 'order size', 'order dress', 'size order', 'dress order',

<a id="nouns-ta"></a>
### Topic Analysis
- Topic #0: Dress 
- Topic #1: Fit
- Topic #2: Fit
- Topic #3: Compliments
- Topic #4: Length (+)
- Topic #5: Night
- Topic #6: Length 
- Topic #7: Bra (+)
- Topic #8: Bit
- Topic #9: RTR
- Topic #10: Material (+)
- Topic #11: Fit like a glove
- Topic #12: Wedding (+)
- Topic #13: Color (+)
- Topic #14: Event (+)
- Topic #15: Sequins (+)
- Topic #16: Pockets (+)
- Topic #17: Back (+)
- Topic #18: Heels (+) (kind of related to length)
- Topic #19: Perfect
- Topic #20: Fit
- Topic #21: Stretch (+)
- Topic #22: Compliments 
- Topic #23: Compliments
- Topic #24: RTR Experience
- Topic #25: Dress size
- Topic #26: Way
- Topic #27: Bust area (+)
- Topic #28: Lots
- Topic #29: Reviews

The ones with + are significant, and will be used as features of the dress

In [324]:
topic_prob = pd.DataFrame(nmf.transform(tfidf), columns=['topic_{}'.format(i) for i in range(30)])

In [325]:
good_cols = [1, 4, 7, 10, 12, 13, 14, 15, 16, 17, 18, 21, 27]
bad_cols = set(range(30)) - set(good_cols)

In [326]:
for col in bad_cols:
    del topic_prob['topic_{}'.format(col)]

In [327]:
topic_prob.columns = ['fit', 'length', 'bra', 'material', 'wedding', 'color', 'event', 'sequins', 'pockets', 'back', 'heels', 'stretch', 'bust_area']

I have recognized 3 different group of categories.
1. Body type related
    - length
    - stretch
    - bust area
2. General 
    - bra
    - material
    - wedding 
    - color 
    - event
    - sequins 
    - pockets 
    - back
    
For the categories that are body type related, it will be scored per body type per dress. For general features, it will be scored per dress.


For each category, there's also 3 ways we can "score" them
1. Polarity (Good/bad)
    - Body type related:
        - length
        - stretch 
        - bust area
    - General
        - material 
        - back 
        - sequins (itchy or not)
        - bra
        - color
2. Sum (How much it's mentioned)
    - sequins
    - wedding 
    - pockets
3. Categorical
    - bra 
    - event 
    - color
    


    

### Linking sentences to dress

In [149]:
url_cur = rev_col.find({}, {"url":1,"_id":0})

In [150]:
url_list = pd.DataFrame(list(url_cur))

### Linking body types to comments

In [223]:
df_body = pd.read_csv('../data/measurement_data.csv', index_col=0)

<a id="polarized"></a>
### Polarized topics

<a id="polarizedGeneral"></a>
#### General

In [144]:
from textblob import TextBlob
def calc_polarity(s):
    return TextBlob(s).sentiment[0]

In [145]:
polarity = df_sent.review.apply(calc_polarity)

In [328]:
topic_prob['topic'] = topic_prob.idxmax(axis=1)

In [329]:
p_topics = ['bra', 'material', 'color', 'sequins', 'back']

In [330]:
topic_prob = topic_prob.join(polarity)
topic_prob = topic_prob.join(df_sent['index'])
topic_prob = topic_prob.join(url_list, on='index')

In [331]:
topic_prob = topic_prob.rename_axis({'review': 'polarity'}, axis=1)

In [205]:
polar_general = topic_prob[topic_prob.topic.isin(p_topics)]

In [206]:
polar_general = pd.DataFrame(polar_general.groupby(['url', 'topic'], as_index=False)['review'].mean())

In [215]:
polar_general = polar_general.pivot('url', 'topic', 'review')

In [420]:
# topic_prob.to_csv('../data/topic_prob.csv')

<a id="polarizedBody"></a>
#### Body Type Specific


In [387]:
bt_topics = ['fit', 'length', 'stretch', 'bust_area']

In [405]:
polar_bt = topic_prob.join(df_body['kmean_label_2'], on='index')

In [406]:
polar_bt = polar_bt.dropna()

In [409]:
polar_bt = polar_bt[polar_bt[bt_topics].sum(axis=1) > 0]

In [411]:
polar_bt['topic'] = polar_bt[bt_topics].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [412]:
polar_bt = polar_bt.groupby(['url','kmean_label_2', 'topic', 'index'], as_index=False)['polarity'].mean() # make sure 1 entry per user

In [413]:
polar_bt = polar_bt.groupby(['url','kmean_label_2', 'topic'], as_index=False)['polarity'].mean()

In [414]:
polar_bt = polar_bt.groupby(['url','kmean_label_2'], as_index=False)['polarity'].sum()

In [415]:
polar_bt = polar_bt.pivot('kmean_label_2', 'url', 'polarity')

<a id="unpolarized"></a>
#### Unpolarized topics

In [250]:
up_topics = ['sequins', 'wedding', 'pockets']


In [258]:
up_general = topic_prob.groupby('index', as_index=False)[up_topics].sum()

In [261]:
up_general = up_general.join(url_list)

In [264]:
up_general = up_general.groupby('url', as_index=False)[up_topics].mean()

<a id="combining"></a>
### Combining Everything

In [305]:
df_general = polar_general.join(up_general.set_index('url'), lsuffix='_polar', rsuffix='_unpolar')

In [307]:
df_general = df_general.replace(np.nan, 0)

#### Saving to CSV

In [308]:
df_general.to_csv('../data/dress_features.csv')

In [418]:
polar_bt.to_csv('../data/dress_features_bt.csv')

## Thoughts 
- Select which topics are dress related vs people related.
- Cluster on similar dresses by the dress features.
- Rate people on how much they love the dress. If they love the dress, they will love similar dresses too.
- If they input body data, recommend what people with the same body cluster love. 