## Goals 
1. Perform topic modelling
        a. perhaps create additional features for make up
        b. create a useful comment search for users
2. Word 2 Vec
        a. To enhance performance of mapping questions/queries to comments

In [153]:
import pandas as pd
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import nltk

import gensim

### Cleaning

In [3]:
dfc = pd.read_csv('../data/comments_data.csv')
text= dfc[['product_id', 'username', 'title', 'body', 'helpful' ,'not_helpful', 'star_count']]
text = text.reset_index()
text.columns = ['columnid'] + list(text.columns.values[1:])

In [30]:
dfp = pd.read_csv('../app/products.csv')
dfu = pd.read_csv('../app/users.csv')

In [37]:
text = text.join(dfu.set_index('username'), on='username')
text = text.join(dfp.set_index('product_id'), on='product_id')
text = text.drop(['username', 'product_id'], axis=1)

### Tokenizing and Text Cleaning
- split to sentences
- remove punctuation
- lowercase
- tokenize to words 

In [51]:
text['text'] = text.body.str.lower()
text = text[text.body.notnull()]

In [65]:
sentences = text.text.apply(sent_tokenize)
df_sent = pd.concat([pd.DataFrame({'commentid': i, 'comment': x}) for i,x in enumerate(sentences)], ignore_index=True)

In [66]:
df_sent.comment = df_sent.comment.str.replace('([\:\-\.\,\;\(\)\!])', ' ')
df_sent.comment = df_sent.comment.str.strip()
df_sent = df_sent[df_sent.comment.str.len() > 5]

In [99]:
df_sent.loc[:, 'comment'] = df_sent.comment.str.replace(' (\d+\w+) ', ' ') # words that has numbers and letters
df_sent.loc[:, 'comment'] = df_sent.comment.str.replace('(\w*\d+\w*)', ' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [116]:
tagged = nltk.pos_tag(tf_v2.get_feature_names()) # from a previous tf vectorizer's feature names
adjectives = [x[0] for x in tagged if x[1] in ['JJ', 'VBD', 'VBP', 'RB']] 

In [132]:
df_sent.to_pickle('df_sent.pkl')

In [134]:
sample = df_sent.sample(30000) # size of df to play around with

## Topic Modelling

#### TF Vectorizer

In [182]:
sw = stopwords.words('english')  + adjectives +['love', 'recommend', 'amazing', 'foundation', 'lip', 'product','mascara', 'concealer', 'eyeshadow', 'stuff']

In [183]:
tf_v = TfidfVectorizer(stop_words=sw, ngram_range=(1, 2), max_df=0.5)
tf_v = tf_v.fit(sample.comment)
tf = tf_v.transform(sample.comment)

### NMF Model

In [107]:
def print_words(nmf, tf_vectorizer):
    for topic_no, topic_vector in enumerate(nmf.components_):
        print('Topic {}'.format(topic_no))
        topic_weights = list(zip(tf_vectorizer.get_feature_names(), topic_vector))
        sorted_topics = sorted(topic_weights, key=lambda x:x[1], reverse=True)
        words = [x[0] for x in sorted_topics[:n_top_words]]
        print(", ".join(words))

In [184]:
n_topics = 30
n_top_words = 20
nmf = NMF(n_components=n_topics).fit(tf)
for topic_no, topic_vector in enumerate(nmf.components_):
    print('Topic {}'.format(topic_no))
    topic_weights = list(zip(tf_v.get_feature_names(), topic_vector))
    sorted_topics = sorted(topic_weights, key=lambda x:x[1], reverse=True)
    words = [x[0] for x in sorted_topics[:n_top_words]]
    print(", ".join(words))

Topic 0
color, match, color match, every color, color goes, pink, like color, dark, color payoff, every, payoff, color lasts, color lips, texture, color color, color texture, color light, color selection, lasting, selection
Topic 1
like, feel, feel like, look like, looks like, like wearing, wearing, feels, like color, wanted, feels like, wanted like, like matte, like look, face, better, would like, like one, like better, like powder
Topic 2
use, every, use every, every day, moisturizer, use moisturizer, use brush, eyes, eye, use use, use eyes, use anything, like use, face, would use, day use, liner, use face, use time, dark
Topic 3
coverage, gives, medium, medium coverage, gives coverage, light coverage, sheer, provides, coverage look, provides coverage, coverage without, sheer coverage, blends, cakey, build, feel, coverage light, amount, without, amount coverage
Topic 4
best, best used, one best, eyeliner, best eyeliner, hands, hands best, best part, best one, part, works best, best t

In [248]:
# some of the sentences doesn't have topics
remove_sentences = topic_probs.sum(axis=1) == 0
# noting down the sentences that doesn't have topics because the cosine distance will always be zero for these.
df_sent['no_topic'] = remove_sentences

In [185]:
# transform on the whole corpus
tfidf = tf_v.transform(df_sent.comment)
topic_probs = nmf.transform(tfidf)

In [326]:
# save all of the data needed for comment search
text.to_pickle('../data/text/comment_table.pkl')
pickle.dump(tf_v, open('../data/text/tf_vectorizer.pkl', 'wb'))
pickle.dump(nmf, open('../data/text/nmf.pkl', 'wb'))
df_sent.to_pickle('../data/text/df_sent.pkl')
pickle.dump(topic_probs, open('../data/text/topic_probs.pkl', 'wb'))

In [333]:
from ...libs import comment_finder

SystemError: Parent module '' not loaded, cannot perform relative import

## Topic analysis

In [334]:
nltk.pos_tag(['how', 'where', 'when', 'why'])

[('how', 'WRB'), ('where', 'WRB'), ('when', 'WRB'), ('why', 'WRB')]

In [341]:
nltk.pos_tag('is it good for formal events'.split())

[('is', 'VBZ'),
 ('it', 'PRP'),
 ('good', 'JJ'),
 ('for', 'IN'),
 ('formal', 'JJ'),
 ('events', 'NNS')]

In [171]:
tokens = df_sent.comment.apply(word_tokenize)

In [172]:
model = gensim.models.Word2Vec(tokens, size=100, window=5, min_count=1, workers=2,sg=1)

In [174]:
model.save(open('../data/word2vec.bin', 'wb'))

In [175]:
model.delete_temporary_training_data()

In [344]:
model.most_similar(negative='good')

[('mmuuuaaa', 0.26842302083969116),
 ('aarrgg', 0.24087241291999817),
 ('~sweetglamourmakeup', 0.2264493703842163),
 ('bummmmer', 0.22513119876384735),
 ('aagounthink', 0.22098058462142944),
 ('confounding', 0.2151275873184204),
 ('wronggggg', 0.20891711115837097),
 ('haaaaaaa', 0.2064480185508728),
 ('hibazmakeup', 0.20511563122272491),
 ('prrrrrrrrrrrrfection', 0.20327091217041016)]

# NEXT STEPS 
- do a tfidf with adjectives
- get the JJ and with word2vec, find words that are close by
- add this to the equation

## For questions 
- do a pos tag, and take only JJ, NN and NNS
- with NN and NNS, find comments with relevant topics
- with JJ, find the words that are close and far away from the JJ with the word2vec model.
- find a cosine similarity on the tfidf.