In [1]:
import numpy as np
import pandas as pd
import io
import jieba
import jieba.posseg as pseg 
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
from pandas import DataFrame
from scipy.spatial.distance import pdist, squareform 
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans

# Let pandas print up to 2000 items of a long sequence before truncating it.
pd.set_option('display.max_seq_items', 2000)

In [2]:
# Read the perfume key-feature table from its CSV file.
key_features = pd.read_csv(filepath_or_buffer='data/perfume_key_features.csv')

In [3]:
# Discard the 'Unnamed: 0' column and index the table by perfume_id.
key_features = (
    key_features
    .drop('Unnamed: 0', axis=1)
    .set_index('perfume_id')
)

In [4]:
# Read the perfume feature table and index it by perfume_id in one step.
perfume_info = pd.read_csv('data/perfume_features.csv').set_index('perfume_id')

In [5]:
# Left-join the perfume info onto the key features (both indexed by perfume_id),
# then discard the 'Unnamed: 0' and 'url' columns.
joined_df = (
    key_features
    .join(perfume_info, how='left')
    .drop(['Unnamed: 0', 'url'], axis=1)
)

In [6]:
# Concatenate the first 15 columns of each row into one comma-separated
# string stored in a new 'keywords' column ...
joined_df['keywords'] = joined_df[joined_df.columns[:15]].apply(','.join, axis=1)
# ... then remove the columns labelled '0' through '14'.
joined_df.drop(list(map(str, range(15))), axis=1, inplace=True)

In [7]:
# Separate frame for keyword analysis: descriptive columns plus 'keywords'.
kw_df = joined_df.loc[:, ['brand', 'item_name', 'gender', 'note', 'keywords']]

In [8]:
# Inspect the comment keywords of every perfume from one brand.
kw_df.loc[kw_df['brand'].eq('蒂普提克')]

Unnamed: 0_level_0,brand,item_name,gender,note,keywords
perfume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
133767,蒂普提克,"蒂普提克 水中倒影 Diptyque L'Eau, 1968",中性香,"[肉桂, 玫瑰, 老鹳草, 檀香木, 公丁香]","香气,余味,隐约,安静,奇怪,淡雅,浓烈,名字,花香,微微,微妙,微弱,龙涎香,很长,微甜"
142570,蒂普提克,"蒂普提克 清晨百合 Diptyque Olene, 1988",女香,"[水仙花, 忍冬, 紫藤, 茉莉, 绿叶, 白色花系]","水仙,吲哚,茉莉,百合,白花,忍冬,气息,调香,简单,香气,敏感,这一,与众不同,清冽,隐藏"
157932,蒂普提克,"蒂普提克 番红花玫瑰 Diptyque Ôponé, 2001",中性香,"[玫瑰, 藏红花, 辛香料, 愈创木, 孜然]","玫瑰,沙龙,香辛料,出色,多年,沉静,沉香,牡丹,停产,馥郁,干燥,东方,胡椒,香调,木质"
203283,蒂普提克,"蒂普提克 薰衣草之水 Diptyque Eau de Lavande, 2014",中性香,"[薰衣草, 芫荽, 肉桂, 肉豆蔻]","薰衣草,严肃,沉重,木质,肉桂,檀木,芳香,深沉,搭配,瞬间,渐渐,皮革,辛辣,混合,香气"
205427,蒂普提克,"蒂普提克 香醋微醺 Diptyque Vinaigre de Toilette, 1975",中性香,"[绿叶, 橙花油, 辛香料, 焚香, 木质香]","微醺,香气,小众,宜人,意大利,纯正,清雅,暖意,缓缓,辛香料,葡萄柚,醇厚,蜂蜜,淡淡的,甜味"
206507,蒂普提克,"蒂普提克 34号之水 Diptyque Eau de 34, 2013",中性香,"[苦橙, 柠檬, 柠檬马鞭草, 葡萄柚, 薰衣草, 杜松子, 肉豆蔻, 肉桂, 桦木, 老鹳...","脂粉气,自信,干净,清爽,龙涎香,快乐,忧郁,忍冬,微风,微醺,微酸,微苦,微甜,微微,微弱"
218467,蒂普提克,"蒂普提克 杜耶尔 Diptyque Eau Duelle, 2010",中性香,"[小豆蔻, 榄香脂, 乳香, 香柠檬, 粉红胡椒, 杜松, 藏红花, 茶叶, 麝香, 龙涎香...","开心,简单,绿意,檀香,白花,温柔,温暖,香草,by,灰色,暗黑,迷恋,轻快,少年,动人"
237755,蒂普提克,"蒂普提克 青藤玫瑰 Diptyque Eau Plurielle, 2015",中性香,"[柑橘, 木质香, 麝香]","青涩,年代,玫瑰,美好,成熟,酸涩,荔枝,诱惑,绽放,妩媚,青绿,完美,充满,柑橘,微微"
240362,蒂普提克,"蒂普提克 檀道（檀香） Diptyque Tam Dao, 2003",中性香,"[玫瑰, 桃金娘, 柏树, 檀香木, 雪松, 辛香料, 琥珀, 麝香, 巴西红木]","这瓶香,高级,多年,扩散性,檀香,白天,试用,开心,自信,试香,奶油,沙龙,作用,烟熏,柔滑"
263965,蒂普提克,"蒂普提克 天竺葵之水 Diptyque Geranium odorata, 2014",中性香,"[香柠檬, 老鹳草, 粉红胡椒, 零陵香豆, 香根草, 雪松]","天竺葵,清新,香味,辛辣,温柔,很小,取代,幻想,玫瑰,叶子,略显,巴黎,粉色,柑橘类,妹子"


# Hierarchical Clustering

In [9]:
# Convert the whole frame (all five columns, not only 'keywords') to a NumPy
# array. `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` works on old and new
# pandas alike and yields the same array.
keywords = kw_df.values

In [10]:
# Show the converted array as the cell output.
keywords

array([['纳茜素', '纳茜素 同名女士淡香水 Narciso Rodriguez For Her, 2003', '女香',
        '[桂花, 橙花, 香柠檬, 麝香, 琥珀, 香根草, 香草, 广藿香]',
        '香味,干净,麝香,简约,女性化,肥皂,桂花,隐约,散发出,很长,愉悦,魅力,尖锐,舒服,选择'],
       ['潘海利根', '潘海利根 古龙水 Penhaligon`s Eau de Cologne, 1927', '中性香',
        '[橙子, 柠檬, 香柠檬, 迷迭香, 橙花油]',
        '延续,橙花,上海,前味,沐浴,柑橘类,优秀,白麝香,小时候,酸味,花香,明亮,淡雅,橘子,皂感'],
       ['菲拉格慕',
        '菲拉格慕 芭蕾女伶神秘版 Salvatore Ferragamo Signorina Misteriosa, 2016',
        '女香', '[黑莓, 橙花油, 晚香玉, 橙花, 广藿香, 牛奶慕斯, 香草]',
        '龙涎香,得体,性别,性价比,快乐,忧郁,忍冬,微风,微醺,微酸,微苦,微甜,微微,微弱,微妙'],
       ..., 
       ['雨果波士', '雨果波士 光彩女人 Hugo Boss Femme, 2006', '女香',
        '[黑加仑, 小苍兰, 蜜橘, 百合, 茉莉, 玫瑰, 杏, 琥珀, 柠檬树]',
        '柔美,霸气,舒服,气质,女人,名字,特别,时间,留香,龙涎香,微弱,微妙,微甜,微微,微苦'],
       ['意大利之水', "意大利之水 同名 Eau D`Italie Eau D'Italie", '中性香',
        '[焚香, 香柠檬, 黑加仑, 粘土, 木兰, 晚香玉, 琥珀, 广藿香, 三叶草, 麝香]',
        '焚香,厉害,宜人,女士,年龄,沉静,细腻,男士,活泼,晚香玉,经典,成熟,白花,微酸,得体'],
       ['桃丝熊', '桃丝熊 同名女士 Tous, 2002', '女香',
        '[芫荽, 醋栗叶, 香柠檬, 紫罗兰叶, 牡丹, 茉莉, 栀子花, 玫瑰, 鸢尾花, 雪松, 麝香]',
    

In [11]:
def get_words_idx(tfidf, top_n=10):
    """Return, for each row of ``tfidf``, the column indices of its largest values.

    Parameters
    ----------
    tfidf : 2-D array
        Must expose ``shape`` and row indexing, and each row ``tfidf[i]`` must
        support ``argsort()``.
        NOTE(review): presumably a dense TF-IDF matrix -- confirm at the call site.
    top_n : int, optional
        How many indices to keep per row (default 10, the previously
        hard-coded value). Rows with fewer than ``top_n`` columns yield all
        of their column indices.

    Returns
    -------
    numpy.ndarray
        One row of indices per input row, ordered from the highest value to
        the lowest.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # A slice of [-0:] would silently return the *whole* row.
        raise ValueError("top_n must be a positive integer")

    # `range` instead of `xrange`: the latter does not exist on Python 3,
    # while `range` works on both Python 2 and Python 3.
    most_present_index = np.array(
        [tfidf[i].argsort()[-top_n:][::-1] for i in range(tfidf.shape[0])]
    )
    return most_present_index

In [None]:
def get_words(indices, vocabulary=None):
    """Translate rows of feature indices into rows of feature names.

    Parameters
    ----------
    indices : sequence of sequences of int
        Each inner sequence holds positions into the vocabulary.
    vocabulary : sequence, optional
        Names looked up by position. When omitted, the notebook-level
        ``feature_names`` variable is used, as before.
        NOTE(review): ``feature_names`` is defined outside this cell --
        presumably the vectorizer's feature names; confirm.

    Returns
    -------
    list of list
        ``result[r][k]`` is the name at position ``indices[r][k]``.
    """
    # Resolve the lookup table once; the global is only touched when no
    # explicit vocabulary was supplied.
    names = feature_names if vocabulary is None else vocabulary
    # Iterating the rows directly avoids `xrange`, which is Python-2-only.
    return [[names[i] for i in row] for row in indices]

In [12]:
# Load the stop-word list: read the UTF-8 file in full and split it on whitespace.
with io.open('models/chinese_stopwords.txt', mode='r', encoding='utf8') as stopword_file:
    stpwdlst = stopword_file.read().split()

In [13]:
# Word-level TF-IDF vectorizer that skips the loaded stop words and keeps
# at most 300 features.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words=stpwdlst,
    max_features=300,
)