In [10]:
import io
import os
from collections import defaultdict

import jieba
import jieba.posseg as pseg
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_columns', 500)

# 1. Data Preprocessing

In [6]:
def short_comments_df(short_ratings_df):
    '''Concatenate all short comments of each perfume into one string.

    Parameter:
    ---------
    short_ratings_df: dataframe with at least the columns 'perfume_id' and
        'short_comment' (other columns are ignored).

    Return:
    -------
    dataframe indexed by perfume id (sorted, index left unnamed) with a single
    column 'short_comments' holding that perfume's comments joined together
    without a separator. Missing comments are skipped; a perfume whose
    comments are all missing does not appear in the result.
    '''
    # Missing comments are dropped up front so that joining only sees strings.
    comments = short_ratings_df.loc[:, ['perfume_id', 'short_comment']]
    comments = comments.dropna(subset=['short_comment'])
    # A single groupby replaces the per-id boolean filtering and the
    # stack().sum(level=0) idiom, which current pandas no longer supports.
    joined = comments.groupby('perfume_id')['short_comment'].agg(''.join)
    result = joined.to_frame(name='short_comments')
    # Callers reset this index and rename the resulting 'index' column, so the
    # index must stay unnamed.
    result.index.name = None
    return result

In [7]:
def combine_comments(short_comments_df, long_comments_df):
    '''
    Join short comments df and long comments df, combine comments of each perfume id
    to a document.

    Parameter:
    ---------
    short_comments_df: dataframe indexed by perfume id with a 'short_comments' column
    long_comments_df: dataframe with columns 'perfume_id', 'comments' (an iterable
        of strings per row) and 'url'. It is NOT modified by this function.

    Return:
    -------
    joined df, two columns, perfume id and all comments. Perfumes without long
    comments get '.' appended to their short comments.
    '''
    # set_index/assign return new frames, so the caller's dataframe is left
    # untouched and the function can be called repeatedly on the same input.
    long_by_id = long_comments_df.set_index('perfume_id')
    long_by_id = long_by_id.assign(
        long_comments=long_by_id['comments'].apply(','.join))
    merged = pd.merge(short_comments_df, long_by_id, how='left',
                      left_index=True, right_index=True)
    # Rows with no long comments are NaN after the left join; '.' is the filler.
    merged = merged.fillna('.')
    merged['all_comments'] = merged['short_comments'] + merged['long_comments']
    merged = merged.drop(['comments', 'short_comments', 'long_comments', 'url'], axis=1)
    # Name the index explicitly so the id column is always called 'perfume_id',
    # whatever name the merged index happens to carry.
    return merged.rename_axis('perfume_id').reset_index()

In [8]:
# The connection string is taken from the FRAGRANCE_MONGO_URI environment
# variable when it is set.
# NOTE(review): the literal fallback still embeds a username and password in
# the notebook; remove it once the environment variable is in use everywhere.
MONGO_URI = os.environ.get(
    "FRAGRANCE_MONGO_URI",
    "mongodb://fragrance:fragrance@35.164.86.3:27017/fragrance")
client = MongoClient(MONGO_URI)
try:
    db = client.fragrance
    # Pull both collections fully into memory, leaving out Mongo's _id field.
    short_ratings = pd.DataFrame(list(db.short_ratings.find({}, {'_id': 0})))
    perfume_comments = db.perfume_comments
    long_comments = pd.DataFrame(list(perfume_comments.find({}, {'_id': 0})))
finally:
    # Close the connection even when one of the queries fails.
    client.close()
# Data preprocessing
# The result gets its own name: assigning it to `short_comments_df` would
# overwrite the function and make this cell fail when run a second time.
short_comments = short_comments_df(short_ratings)
all_comments_df = combine_comments(short_comments, long_comments)

# 2. Process data for TFIDF

In [30]:
def get_corpus(df):
    '''Return the values of the 'all_comments' column as a plain list, in row order'''
    return list(df['all_comments'])

In [10]:
def split_to_words(corpus):
    '''Segment every document with jieba.cut and return one string per
    document, its tokens separated by single spaces'''
    return [" ".join(jieba.cut(document)) for document in corpus]

In [23]:
def get_perfume_stopwords(path='../models/perfume_cn_stopwords.txt'):
    '''Get stopwords file customized for perfume reviews, return a list of words

    Parameter:
    ---------
    path: location of the UTF-8 encoded stopword file; defaults to the
        project's perfume stopword list.

    Return:
    -------
    list of the whitespace-separated words found in the file
    '''
    with io.open(path, 'r', encoding='utf8') as stopword_file:
        return stopword_file.read().split()

In [20]:
def get_vectorized_mat(seg_list, use_tfidf, stop_words, max_features=1000):
    '''Vectorize a tokenized corpus and return (fitted vectorizer, matrix).

    If use_tfidf is True --> TfidfVectorizer
    If use_tfidf is False --> CountVectorizer

    The vectorizer is built with the given stop_words, analyzer='word' and
    max_features, then fitted on seg_list.'''
    if use_tfidf:
        vectorizer_cls = TfidfVectorizer
    else:
        vectorizer_cls = CountVectorizer
    options = dict(stop_words=stop_words,
                   analyzer='word',
                   max_features=max_features)
    fitted_vectorizer = vectorizer_cls(**options)
    # fit_transform learns the vocabulary and returns the document-term matrix
    doc_term_matrix = fitted_vectorizer.fit_transform(seg_list)
    return fitted_vectorizer, doc_term_matrix

# 1. Using NMF and LDA in sklearn

In [21]:
def display_topics(model, feature_names, no_top_words):
    '''Print, for every row of model.components_, a "Topic N:" header followed
    by the no_top_words highest-weighted feature names, strongest first'''
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        strongest_first = topic.argsort()[::-1]
        top_terms = [feature_names[pos] for pos in strongest_first[:no_top_words]]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

In [31]:
# Tokenize corpus
# Load the stopword list, pull the per-perfume documents out of the dataframe,
# then segment each document into a space-separated string of words.
stpwdlst = get_perfume_stopwords()
corpus = get_corpus(all_comments_df)
seg_list = split_to_words(corpus)

In [34]:
# NMF is able to use tf-idf, thus fit documents to TFIDF
tfidf_vectorizer, tfidf_docs = get_vectorized_mat(seg_list,
                                                  use_tfidf=True,
                                                  stop_words=stpwdlst,
                                                  max_features=1000)
# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever this installation has. Either
# way the result is a plain list of terms.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [38]:
# LDA is a probabilistic graphical model and can only use raw term counts, thus fit to CountVectorizer
countvectorizer, tf_docs = get_vectorized_mat(seg_list,
                                              use_tfidf=False,
                                              stop_words=stpwdlst,
                                              max_features=1000)
# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever this installation has. Either
# way the result is a plain list of terms.
if hasattr(countvectorizer, 'get_feature_names_out'):
    tf_feature_names = countvectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = countvectorizer.get_feature_names()

In [60]:
no_topics = 12  # number of topics requested from both the NMF and the LDA model
no_top_words = 20  # number of words printed per topic by display_topics

In [61]:
# Run NMF
# Factorise the TF-IDF matrix into no_topics components.
# NOTE(review): recent scikit-learn releases appear to have replaced the
# `alpha` keyword with alpha_W / alpha_H, whose scaling differs — confirm
# against the installed version before upgrading.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf_docs)

In [62]:
# Run LDA
# The topic count is passed as `n_components`: the older `n_topics` keyword is
# rejected by current scikit-learn releases.
lda = LatentDirichletAllocation(n_components=no_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf_docs)

In [63]:
# Show the top words of every NMF topic, using the TF-IDF vocabulary.
print("Topics found by NMF: ")
display_topics(nmf, tfidf_feature_names, no_top_words)

Topics found by NMF: 
Topic 0:
木质 广藿香 香根草 辛辣 气息 胡椒 琥珀 男香 焚香 雪松 檀香 东方 麝香 烟草 温暖 树脂 鸢尾 干燥 香气 肉桂
Topic 1:
玫瑰 玫瑰花 牡丹 沉香 荔枝 广藿香 温柔 乌木 红玫瑰 胡椒 丝带 留香 麝香 精油 藏红花 白玫瑰 花瓣 馥郁 柏林 chloe
Topic 2:
香草 美食 琥珀 巧克力 杏仁 焦糖 奶油 温暖 广藿香 性感 甜腻 甜甜的 檀香 东方 蛋糕 牛奶 甜味 脂粉 浓郁 香甜
Topic 3:
花香 温柔 少女 脂粉 香味 甜美 女人 果香 麝香 特别 花果 香气 优雅 成熟 浓郁 甜甜的 好闻 水果 气息 花香调
Topic 4:
茉莉 茉莉花 橙花 白花 吲哚 铃兰 依兰 白色 茶香 麝香 浓郁 皂感 琥珀 中国 百合 清香 香气 柔和 茶叶 花香
Topic 5:
皮革 烟草 烟熏 动物 气息 乌木 男人 沉香 树脂 藏红花 男性 鸢尾 巴黎 茶香 李子 甜度 质感 鼠尾草 鸢尾花 不错
Topic 6:
柠檬 柑橘 橘子 橙花 清爽 薄荷 古龙水 马鞭草 橙子 罗勒 干净 生姜 皂感 青柠 古龙 简单 清新 茶香 柚子 绿茶
Topic 7:
薰衣草 男香 咖啡 鼠尾草 男士 绅士 海利 辛辣 雪松 豆蔻 柠檬 香调 气息 男人 馥奇 精油 杏仁 薄荷 少年 皂感
Topic 8:
小样 冬日 女性化 成熟 藏红花 中规中矩 迷恋 用过 正装 舒服 香气 重口 浓郁 无感 圆润 失望 第一支 无花果 特别 肥皂水
Topic 9:
清新 夏天 水生 海洋 干净 清爽 绿叶 黄瓜 好闻 西瓜 男香 青草 薄荷 海水 淡淡的 莲花 夏日 雏菊 清淡 海风
Topic 10:
晚香玉 栀子花 白花 栀子 椰子 花香 青绿 依兰 老香 馥郁 浓郁 橙花 质感 女人 吲哚 百合 大气 年代 白色 华丽
Topic 11:
留香 时间 香味 好闻 不错 持久 扩散 特别 夏天 舒服 冬天 很长 清淡 男香 太短 甜甜的 衣服 浓郁 推荐 超级


In [None]:
# W: document-by-topic weights, H: topic-by-term weights of the NMF model.
W = nmf.fit_transform(tfidf_docs)
H = nmf.components_
# Single-argument print call so the cell runs on Python 3 as well as Python 2,
# matching the print(...) style used in the rest of the notebook.
print('reconstruction error: %s' % nmf.reconstruction_err_)

In [64]:
# Show the top words of every LDA topic, using the raw-count vocabulary.
print("Topics found by LDA: ")
display_topics(lda, tf_feature_names, no_top_words)

Topics found by LDA: 
Topic 0:
香草 紫罗兰 广藿香 杏仁 巧克力 美食 兰花 红毒 奶油 咖啡 粉末 天使 质感 午夜 整体 甜味 柔滑 蛋糕 组合 甜美
Topic 1:
花香 温柔 女人 留香 香味 茉莉 优雅 脂粉 特别 好闻 浓郁 成熟 晚香玉 麝香 清新 不错 白花 栀子花 性感 气息
Topic 2:
柑橘 柠檬 橘子 橙花 琥珀 留香 清新 娇兰 橙子 花香 香味 香草 经典 罗勒 老香 东方 温暖 辛辣 混合 香气
Topic 3:
茉莉 桂花 绿茶 百合 白花 茶香 花香 清新 香气 茶叶 红茶 气息 香味 茉莉花 橙花 留香 茶味 中国 名字 淡雅
Topic 4:
少女 果香 甜美 清新 花果 水果 留香 甜甜的 酸甜 桃子 荔枝 可爱 好闻 醋栗 清甜 活泼 甜蜜 时间 酸酸甜甜 香味
Topic 5:
清新 留香 柠檬 夏天 薄荷 干净 好闻 水生 柑橘 清爽 香味 特别 时间 不错 淡淡的 温柔 花香 麝香 舒服 气息
Topic 6:
玫瑰 红玫瑰 沉香 白玫瑰 少女 广藿香 乌木 香气 温柔 玫瑰花 留香 牡丹 荔枝 清新 香味 气息 柏林 胡椒 陛下 木质
Topic 7:
木质 薰衣草 皮革 男香 香根草 辛辣 胡椒 烟草 温暖 男人 气息 留香 香味 好闻 大地 男士 雪松 特别 不错 温柔
Topic 8:
气息 木质 花香 麝香 鸢尾 焚香 香气 名字 东方 树脂 整体 干燥 香料 粉感 香辛 辛辣 感受 柔和 动物 卤蛋
Topic 9:
香味 檀香 美食 甜腻 留香 温暖 蜂蜜 甜味 甜甜的 甜香 浓郁 水果 焦糖 香草 奶香 冬天 奶油 特别 木质 牛奶
Topic 10:
无花果 清新 花园 青草 气息 花香 绿叶 绿色 绿意 植物 香气 阳光 青绿 留香 地中海 海盐 铃兰 香味 夏天 椰子
Topic 11:
香奈儿 邂逅 小姐 五号 迪奥 香精 甜心 可可 经典 广藿香 街香 花香 花露水 清新 商业 留香 年轻 好闻 新版 广告


In [68]:
# Document-by-topic weights from the LDA model fitted earlier. transform()
# reuses that fit, so the rows are guaranteed to refer to the same topics that
# were printed and hand-labelled, and the model is not trained a second time.
lda_left = lda.transform(tf_docs)

In [110]:
# Manually chosen labels for the 12 topics generated by LDA:
# topic index -> tuple of six keywords describing that topic.
topic_dict = {0: (u'甜美', u'甜蜜', u'甜味', u'美食', u'香草', u'柔滑'),
              1: (u'温柔', u'优雅', u'成熟', u'女人', u'脂粉', u'性感'),
              2: (u'清新', u'柑橘', u'经典', u'琥珀', u'老香', u'东方调'),
              3: (u'白花系', u'清新', u'淡雅', u'茶香', u'平易近人', u'邻家女孩'),
              4: (u'少女', u'果香', u'甜美', u'可爱', u'活泼', u'甜蜜'),
              5: (u'清新', u'干净', u'夏天', u'清爽', u'舒服', u'清凉'),
              6: (u'玫瑰', u'温柔', u'少女', u'牡丹', u'女人味', u'清新'),
              7: (u'辛辣', u'温暖', u'男人味', u'温柔', u'稳重', u'成熟'),
              8: (u'东方调', u'焚香', u'神秘', u'辛辣', u'深沉', u'柔和'),
              9: (u'美食', u'甜蜜', u'温暖', u'甜味', u'浓郁', u'冬天'),
              10: (u'无花果', u'清新', u'青草', u'绿叶调', u'植物', u'夏天'),
              11: (u'经典', u'大牌', u'奢华', u'广为人知', u'商业香', u'广告多见')}

In [111]:
# Row position of each document -> keyword tuple of its highest-weighted LDA topic.
perfume_kw_dict = {
    row_no: topic_dict[np.argmax(topic_weights)]
    for row_no, topic_weights in enumerate(lda_left)
}

In [112]:
# One row per document and one column per keyword slot, built from the
# dictionary so it can be joined later; missing cells are filled with ' '.
perfume_topic_df = (
    pd.DataFrame.from_dict(perfume_kw_dict, orient='index')
    .fillna(' ')
)

In [114]:
# One indicator column per keyword ('keywords_<word>'), one row per document.
# groupby(level=0).sum() collapses the stacked (document, slot) rows back to
# one row per document; it replaces sum(level=0), which current pandas no
# longer accepts. The former .apply(pd.Series) step returned the frame
# unchanged and has been dropped.
keywords_matrix = (
    pd.get_dummies(perfume_topic_df.stack())
    .groupby(level=0)
    .sum()
    .rename(columns=lambda keyword: 'keywords_' + keyword)
)

In [123]:
# Attach the keyword indicators to the perfumes by row position, discard the
# raw comment text and index the result by perfume id.
perfume_keywords_df = (
    all_comments_df
    .join(keywords_matrix)
    .drop('all_comments', axis=1)
    .set_index('perfume_id')
)

In [125]:
# Persist the keyword matrix as UTF-8 CSV; the perfume_id index is written as the first column.
perfume_keywords_df.to_csv('../data/perfume_keywords_matrix.csv', encoding='utf-8')

In [3]:
# Reload the saved matrix; no index column is specified, so perfume_id comes back as an ordinary column.
# NOTE(review): the output recorded below shows an extra 'Unnamed: 0' column, which
# suggests the file on disk was written by a different version of the export
# cell — TODO confirm the CSV is regenerated when the notebook is run top to bottom.
perfume_keywords_df = pd.read_csv('../data/perfume_keywords_matrix.csv', encoding='utf-8')

In [12]:
# Spot check: show the keyword flags stored for perfume id 176879.
perfume_keywords_df[perfume_keywords_df['perfume_id']==176879]

Unnamed: 0,perfume_id,keywords_东方调,keywords_优雅,keywords_冬天,keywords_可爱,keywords_商业香,keywords_夏天,keywords_大牌,keywords_奢华,keywords_女人,keywords_女人味,keywords_少女,keywords_干净,keywords_平易近人,keywords_广为人知,keywords_广告多见,keywords_性感,keywords_成熟,keywords_无花果,keywords_果香,keywords_柑橘,keywords_柔和,keywords_柔滑,keywords_植物,keywords_活泼,keywords_浓郁,keywords_淡雅,keywords_深沉,keywords_清凉,keywords_清新,keywords_清爽,keywords_温暖,keywords_温柔,keywords_焚香,keywords_牡丹,keywords_玫瑰,keywords_琥珀,keywords_甜味,keywords_甜美,keywords_甜蜜,keywords_男人味,keywords_白花系,keywords_神秘,keywords_稳重,keywords_经典,keywords_绿叶调,keywords_美食,keywords_老香,keywords_脂粉,keywords_舒服,keywords_茶香,keywords_辛辣,keywords_邻家女孩,keywords_青草,keywords_香草
333,176879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0


# 2. Using LDA Model in Gensim

In [17]:
# Importing Gensim
import gensim
from gensim import corpora

Using Theano backend.


In [18]:
# seg_list holds one space-separated string per document; split each one into a list of tokens.
doc_clean = [doc.split() for doc in seg_list]  

In [19]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

In [None]:
# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above:
# each document becomes its bag-of-words representation from dictionary.doc2bow.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document term matrix.
# NOTE(review): this model asks for 10 topics while the sklearn models above use
# no_topics (12), and no random seed is passed here, so results may vary between
# runs — confirm whether both are intended.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

In [None]:
# print(ldamodel.print_topics(num_topics=10, num_words=10))

# It seems that LDA gives better topics, go with LDA with 12 topics

In [None]:
def hand_label_topics(H, vocabulary, n_top_words=20):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    Parameter:
    ---------
    H: iterable of topic rows, one weight per vocabulary entry
    vocabulary: numpy array of terms, indexable by an array of positions
    n_top_words: how many of the highest-weighted words to show per topic

    Return:
    -------
    list with the label typed in for each topic, in topic order
    '''
    # Python 2 calls the line reader raw_input, Python 3 calls it input.
    try:
        ask = raw_input
    except NameError:
        ask = input
    hand_labels = []
    for i, row in enumerate(H):
        # argsort is ascending: reverse it and keep the strongest entries
        top_positions = np.argsort(row)[::-1][:n_top_words]
        # Single-argument print calls produce the same text on Python 2 and 3.
        print('topic %d' % i)
        print('--> ' + ' '.join(vocabulary[top_positions]))
        hand_labels.append(ask('please label this topic: '))
        print('')
    return hand_labels

In [None]:
# hand_label_topics indexes the vocabulary with an array of positions, so the
# feature names are converted to a numpy array first. H is the NMF components matrix.
vocabulary = np.array(tfidf_feature_names)
hand_labels = hand_label_topics(H, vocabulary)

In [None]:
# Topic index -> hand-written label as text. Labels that arrive as byte
# strings (Python 2 raw_input) are decoded from UTF-8; labels that are already
# text (Python 3 input) are kept as they are, since they have no decode().
topic_dict = {
    i: (topic.decode('utf-8') if isinstance(topic, bytes) else topic)
    for i, topic in enumerate(hand_labels)
}

In [None]:
# Row position of each document -> label of the topic with the largest weight
# in that row of W (the last position of the ascending argsort).
perfume_topic = {
    doc_no: topic_dict[np.argsort(weights)[-1]]
    for doc_no, weights in enumerate(W)
}

In [None]:
# Build a one-column dataframe from the dictionary so it can be joined, and
# call that column 'keywords' instead of the default 0.
perfume_topic_df = (
    pd.DataFrame.from_dict(perfume_topic, orient='index')
    .rename(columns={0: 'keywords'})
)

In [None]:
# Preview only the first rows instead of rendering the whole dataframe.
perfume_topic_df.head()

In [None]:
# NOTE(review): `raw_df` is not defined anywhere in the visible notebook — it
# has to be created before this cell can run; confirm where it comes from.
keywords_df = raw_df.join(perfume_topic_df, how='left')
# 'url' is a column, so it has to be dropped along axis=1; without the axis
# argument drop() looks for a row labelled 'url'.
keywords_df = keywords_df.drop(['url'], axis=1)

In [None]:
# Preview only the first rows instead of rendering the whole dataframe.
keywords_df.head()