In [266]:
import io
import os

import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
from pandas import DataFrame
from pymongo import MongoClient
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Let pandas print up to 2000 items of a long sequence before eliding the rest.
pd.options.display.max_seq_items = 2000

In [4]:
# Load every perfume comment document from MongoDB into a DataFrame.
# The connection string can be overridden through FRAGRANCE_MONGO_URI so that
# credentials do not have to live in the notebook.
# FIXME(security): the fallback below still embeds a username/password and a
# public host; rotate that password and drop the fallback once the environment
# variable is set everywhere this notebook runs.
mongo_uri = os.environ.get(
    "FRAGRANCE_MONGO_URI",
    "mongodb://fragrance:fragrance@35.164.86.3:27017/fragrance",
)
client = MongoClient(mongo_uri)
try:
    db = client.fragrance
    collection = db.perfume_comments
    # The {'_id': 0} projection leaves Mongo's _id field out of the result.
    raw_df = pd.DataFrame(list(collection.find({}, {'_id': 0})))
finally:
    # Release the connection even if the query or the DataFrame build fails.
    client.close()

In [5]:
# Peek at the first rows to check which columns came back from MongoDB.
raw_df.head()

Unnamed: 0,comments,perfume_id,url
0,[想谈香水这个话题，是因为两件事情。一件是前段时间去Sephora无意间看中一款Hermes...,251428,
1,[使用香水画面： 在一间专卖老旧精品小饰品、小玩具的小卖店，屋内挂满各种项链饰品吊具，一位...,590187,/xiangshui/590187-yishibeige-eisenberg-diaboli...
2,[官方提供小样试闻报告：开篇就是摩卡咖啡的甜却不腻，一种凉意贯穿前后，薄荷竟然处理的这样好（...,506342,/xiangshui/506342-yishibeige-eisenberg-j-ose.html
3,[官方提供小样试闻报告：官方提供的五款试闻小样里，这个可以排在我第二喜欢（第一喜欢居然是男香...,482456,/xiangshui/482456-yishibeige-eisenberg-i-am.html
4,[开头香柠檬的酸涩和粉红胡椒的辛辣搭配的比较和谐，构成了一种很特别的清新感，闻着令人精神为之...,331859,/xiangshui/331859-yishibeige-eisenberg-tentati...


In [53]:
# Build the corpus: one text per row of raw_df.
# NOTE(review): only the first element of each `comments` entry is used; if an
# entry holds several comments the rest are ignored -- confirm this is intended.
documents = [doc[0] for doc in raw_df['comments']]

# Show a few samples instead of dumping the entire corpus into the output.
for sample_doc in documents[:3]:
    print(sample_doc)



In [64]:
# Segment each Chinese document with jieba and re-join the tokens with single
# spaces, producing whitespace-delimited text for the vectorizer.
seg_list = [" ".join(jieba.cut(doc)) for doc in documents]

In [149]:
# Load the stop-word list. The file content is split on any whitespace, so
# every whitespace-separated token becomes one stop word.
with io.open('models/chinese_stopwords.txt', 'r', encoding='utf8') as stopword_file:
    stopword_text = stopword_file.read()
stpwdlst = stopword_text.split()

In [142]:
# TF-IDF over word tokens, filtering the stop words and keeping at most 300
# vocabulary terms.
# NOTE(review): token_pattern is left at its default, which (per the
# scikit-learn docs) only keeps tokens of two or more characters -- confirm
# that dropping single-character Chinese words is acceptable.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words=stpwdlst,
    max_features=300,
)

In [143]:
# Fit the vectorizer on the segmented comments and compute their TF-IDF matrix.
tfidf_docs = tfidf_vectorizer.fit_transform(seg_list)

In [144]:
# Vocabulary terms as an array (find_top_features indexes into it).
feature_names = np.array(tfidf_vectorizer.get_feature_names())

print("词语序列：")
# Python 2: repr() escapes non-ASCII text, so the escapes are decoded again to
# make the terms readable in the output.
print(repr(feature_names).decode('unicode_escape'))

print("TF IDF Vector：")
print(tfidf_docs.toarray())

词语序列：
array([u'angel', u'de', u'edp', u'edt', u'the', u'一下',
       u'一丝', u'一个', u'一只', u'一定',
       u'一支', u'一次', u'一款', u'一点',
       u'一点点', u'一瓶', u'一直',
       u'一种', u'一股', u'一起', u'不会',
       u'不到', u'不同', u'不少', u'不算',
       u'不能', u'不要', u'不错', u'东方',
       u'东西', u'个人', u'个性', u'中性',
       u'中调', u'中间', u'为主', u'主题',
       u'之前', u'之后', u'也许', u'令人',
       u'以后', u'价格', u'仿佛', u'优雅',
       u'似乎', u'作品', u'使用', u'依旧',
       u'依然', u'值得', u'像是', u'元素',
       u'充满', u'其实', u'具有', u'几乎',
       u'出现', u'刺激', u'前调', u'加上',
       u'加入', u'动物', u'十分', u'印象',
       u'厚重', u'原版', u'发现', u'变化',
       u'变得', u'变成', u'古龙水',
       u'只能', u'可能', u'同名', u'同样',
       u'名字', u'后来', u'后调', u'味儿',
       u'味道', u'品牌', u'喜欢', u'基本',
       u'基调', u'夏天', u'大概', u'女人',
       u'女性', u'好像', u'好闻', u'始终',
       u'存在', u'完全', u'完美', u'实在',
       u'容易', u'小时', u'少女', u'尝试',
       u'尤其', u'尾调', u'属于', u'左右',
       u'巧克力', u'已经', u'带有',
       u'带来', u'干净', u'干燥', u'年轻',
       u'广藿香

In [140]:
def find_top_features(k_features, tfidf_mat, vocabulary=None):
    '''
    Find the k highest-weighted features of each perfume (row).

    Parameters:
    -----------
    k_features : int
        Number of features to return per row. Must be between 0 and the
        number of columns of ``tfidf_mat``.
    tfidf_mat : 2d numpy array, or a sparse matrix exposing ``toarray()``
        TF-IDF weights, one row per perfume and one column per feature.
    vocabulary : sequence, optional
        Feature name for each column. When omitted, the notebook-level
        ``feature_names`` array is used.

    Returns:
    --------
    top_features : object array of shape (n_rows, k_features)
        Feature names ordered from highest to lowest weight.
    top_features_idx : int array of shape (n_rows, k_features)
        Column indices matching ``top_features``.

    Raises:
    -------
    ValueError
        If ``k_features`` is negative or larger than the number of columns.

    Notes:
    ------
    The top k columns are taken regardless of their value, so a row with
    fewer than ``k_features`` non-zero weights is padded with zero-weight
    features, in whatever order argsort leaves the ties.
    '''
    if vocabulary is None:
        # Fall back to the module-level vocabulary built from the vectorizer.
        vocabulary = feature_names
    vocabulary = np.asarray(vocabulary)

    # Accept the sparse matrix returned by the vectorizer as well.
    if hasattr(tfidf_mat, 'toarray'):
        tfidf_mat = tfidf_mat.toarray()
    tfidf_mat = np.asarray(tfidf_mat)

    n_features = tfidf_mat.shape[1]
    if k_features < 0 or k_features > n_features:
        raise ValueError(
            'k_features must be between 0 and %d, got %r'
            % (n_features, k_features))

    # Sort each row ascending, reverse to descending, keep the first k columns.
    descending = np.argsort(tfidf_mat, axis=1)[:, ::-1]
    top_features_idx = descending[:, :k_features].astype(int)

    # Keep the names in an object array regardless of the vocabulary's dtype.
    top_features = np.empty(top_features_idx.shape, dtype=object)
    top_features[...] = vocabulary[top_features_idx]
    return top_features, top_features_idx

In [145]:
# Top 20 features per perfume; the TF-IDF matrix is converted to a dense array
# first because find_top_features documents a 2d numpy array as its input.
top_features, top_features_idx = find_top_features(20, tfidf_docs.toarray())

scipy.sparse.csr.csr_matrix

# I extracted the key features from comments! YEAH!!!

In [258]:
# Load the per-perfume key features from CSV.
key_features = pd.read_csv('data/perfume_key_features.csv')

In [259]:
# Discard the 'Unnamed: 0' column and index the frame by perfume_id.
key_features = (
    key_features
    .drop('Unnamed: 0', axis=1)
    .set_index('perfume_id')
)

In [260]:
# Perfume metadata from CSV, indexed by perfume_id.
perfume_info = pd.read_csv('data/perfume_features.csv').set_index('perfume_id')

In [261]:
# Left-join the perfume metadata onto the key features by index, then drop the
# 'Unnamed: 0' and 'url' columns.
joined_df = (
    key_features
    .join(perfume_info, how='left')
    .drop(['Unnamed: 0', 'url'], axis=1)
)

In [262]:
# Collapse the first 15 columns into one comma-separated 'keywords' string per
# row, then remove the columns named '0'..'14'.
# NOTE(review): the join selects columns by position while the drop selects
# them by name; presumably both refer to the same 15 columns -- confirm.
keyword_source = joined_df[joined_df.columns[0:15]]
joined_df['keywords'] = keyword_source.apply(','.join, axis=1)
joined_df = joined_df.drop([str(i) for i in range(15)], axis=1)

In [263]:
# Keep only the descriptive columns plus the joined keywords for the keyword
# analysis in the following cells.
kw_df = joined_df[['brand', 'item_name', 'gender', 'note', 'keywords']]

In [264]:
# Inspect the comment keywords of a single brand: 蒂普提克 (Diptyque).
is_diptyque = kw_df['brand'] == '蒂普提克'
kw_df[is_diptyque]

Unnamed: 0_level_0,brand,item_name,gender,note,keywords
perfume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
133767,蒂普提克,"蒂普提克 水中倒影 Diptyque L'Eau, 1968",中性香,"[肉桂, 玫瑰, 老鹳草, 檀香木, 公丁香]","香气,余味,隐约,安静,奇怪,淡雅,浓烈,名字,花香,微微,微妙,微弱,龙涎香,很长,微甜"
142570,蒂普提克,"蒂普提克 清晨百合 Diptyque Olene, 1988",女香,"[水仙花, 忍冬, 紫藤, 茉莉, 绿叶, 白色花系]","水仙,吲哚,茉莉,百合,白花,忍冬,气息,调香,简单,香气,敏感,这一,与众不同,清冽,隐藏"
157932,蒂普提克,"蒂普提克 番红花玫瑰 Diptyque Ôponé, 2001",中性香,"[玫瑰, 藏红花, 辛香料, 愈创木, 孜然]","玫瑰,沙龙,香辛料,出色,多年,沉静,沉香,牡丹,停产,馥郁,干燥,东方,胡椒,香调,木质"
203283,蒂普提克,"蒂普提克 薰衣草之水 Diptyque Eau de Lavande, 2014",中性香,"[薰衣草, 芫荽, 肉桂, 肉豆蔻]","薰衣草,严肃,沉重,木质,肉桂,檀木,芳香,深沉,搭配,瞬间,渐渐,皮革,辛辣,混合,香气"
205427,蒂普提克,"蒂普提克 香醋微醺 Diptyque Vinaigre de Toilette, 1975",中性香,"[绿叶, 橙花油, 辛香料, 焚香, 木质香]","微醺,香气,小众,宜人,意大利,纯正,清雅,暖意,缓缓,辛香料,葡萄柚,醇厚,蜂蜜,淡淡的,甜味"
206507,蒂普提克,"蒂普提克 34号之水 Diptyque Eau de 34, 2013",中性香,"[苦橙, 柠檬, 柠檬马鞭草, 葡萄柚, 薰衣草, 杜松子, 肉豆蔻, 肉桂, 桦木, 老鹳...","脂粉气,自信,干净,清爽,龙涎香,快乐,忧郁,忍冬,微风,微醺,微酸,微苦,微甜,微微,微弱"
218467,蒂普提克,"蒂普提克 杜耶尔 Diptyque Eau Duelle, 2010",中性香,"[小豆蔻, 榄香脂, 乳香, 香柠檬, 粉红胡椒, 杜松, 藏红花, 茶叶, 麝香, 龙涎香...","开心,简单,绿意,檀香,白花,温柔,温暖,香草,by,灰色,暗黑,迷恋,轻快,少年,动人"
237755,蒂普提克,"蒂普提克 青藤玫瑰 Diptyque Eau Plurielle, 2015",中性香,"[柑橘, 木质香, 麝香]","青涩,年代,玫瑰,美好,成熟,酸涩,荔枝,诱惑,绽放,妩媚,青绿,完美,充满,柑橘,微微"
240362,蒂普提克,"蒂普提克 檀道（檀香） Diptyque Tam Dao, 2003",中性香,"[玫瑰, 桃金娘, 柏树, 檀香木, 雪松, 辛香料, 琥珀, 麝香, 巴西红木]","这瓶香,高级,多年,扩散性,檀香,白天,试用,开心,自信,试香,奶油,沙龙,作用,烟熏,柔滑"
263965,蒂普提克,"蒂普提克 天竺葵之水 Diptyque Geranium odorata, 2014",中性香,"[香柠檬, 老鹳草, 粉红胡椒, 零陵香豆, 香根草, 雪松]","天竺葵,清新,香味,辛辣,温柔,很小,取代,幻想,玫瑰,叶子,略显,巴黎,粉色,柑橘类,妹子"


In [265]:
# Summarize column dtypes and non-null counts to spot missing values.
kw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3324 entries, 100077 to 999313
Data columns (total 5 columns):
brand        3324 non-null object
item_name    3324 non-null object
gender       3323 non-null object
note         3296 non-null object
keywords     3324 non-null object
dtypes: object(5)
memory usage: 155.8+ KB


In [279]:
# check comment keywords by gender category (here: '中性香')
kw_df[kw_df['gender'] == '中性香']

Unnamed: 0_level_0,brand,item_name,gender,note,keywords
perfume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100230,潘海利根,"潘海利根 古龙水 Penhaligon`s Eau de Cologne, 1927",中性香,"[橙子, 柠檬, 香柠檬, 迷迭香, 橙花油]","延续,橙花,上海,前味,沐浴,柑橘类,优秀,白麝香,小时候,酸味,花香,明亮,淡雅,橘子,皂感"
102509,克利安,"克利安 冰镇伏加特 By Kilian Vodka on the Rocks, 2014",中性香,"[小豆蔻, 芫荽, 铃兰, 大黄, 玫瑰, 橡木苔, 降龙涎香醚, 檀香木]","质感,by,高端,酒香,奢华,铃兰,常见,淡淡的,自然,清爽,微醺,微妙,微酸,微苦,微甜"
105452,鲁宾,"鲁宾 艾多淡香精 Lubin Idole Eau de Parfum, 2011",中性香,"[藏红花, 苦橙, 葛缕子, 朗姆酒, 木质香, 黑檀, 乳香, 焚香, 糖, 棕榈叶, 劳...","香辛,甘甜,木质,柔和,调香,皮革,孜然,麝香,轻度,靠前,熏感,硬朗,余味,收尾,平和"
106032,芦丹氏,"芦丹氏 琥珀君王（橙色苏丹） Serge Lutens Ambre Sultan, 2000",中性香,"[芫荽, 檀香木, 月桂叶, 广藿香, 当归, 树脂, 没药, 琥珀, 牛至叶, 桃金娘, ...","中药,琥珀,性感,恶心,迷恋,药味,越发,情怀,著名,灵魂,霸气,诞生,深刻,液体,没药"
106257,芦丹氏,"芦丹氏 清色麝香 Serge Lutens Clair de Musc, 2003",中性香,"[鸢尾花, 麝香, 橙花油, 香柠檬]","花园里,橙花油,隐藏,独立,清冷,鸢尾,少女,柠檬,麝香,花香,微酸,微苦,很长,微微,微醺"
106394,欧梦德·杰尼,"欧梦德·杰尼 黄兰花 Ormonde Jayne Champaca, 2002",中性香,"[粉红胡椒, 竹子, 橙花油, 黄兰, 小苍兰, 大米, 没药, 麝香, 茶叶]","甜美,小花,动人,奶油,层次,美妙,搭配,橙花,甜腻,渐渐,简单,质感,温暖,清新,很小"
107273,气味图书馆,"气味图书馆 野生红罂粟花 Demeter Fragrance Red Poppies, 2009",中性香,[罂粟花],"新鲜,中规中矩,当初,酸酸甜甜,水汽,人生,花瓣,可爱,微微,微甜,微苦,微酸,微妙,微醺,微风"
107280,圣玛利亚修道院,"圣玛利亚修道院 岩滩海风 Santa Maria Novella Cala Rossa, 2014",中性香,"[薰衣草, 薄荷, 尤加利, 不凋花, 小茴香, 劳丹脂]","海风,白麝香,水汽,甜润,温和,薄荷,皂感,树脂,木质,清新,微醺,微酸,微风,微苦,微甜"
107376,卡地亚,卡地亚 橙香精粹淡香水喷雾 Cartier Eau de Cartier Essence d...,中性香,"[香柠檬, 橙子, 紫罗兰, 广藿香, 雪松]","橙子,酸味,散发,水果,浓郁,柠檬,清新,留香,微风,忍冬,微醺,忧郁,微妙,微酸,微苦"
107945,奥丁,"奥丁 07塔诺克 Odin 07 Tanoke, 2011",中性香,"[生姜, 苦橙, 胡椒, 愈创木, 焚香, 肉豆蔻, 桃花心木, 广藿香, 麝香]","冬天,焚香,干燥,胡椒,广藿香,辛辣,温暖,麝香,木质,微风,忍冬,忧郁,快乐,微醺,微酸"
