In [1]:
import numpy as np
import pandas as pd
import io
import jieba
import jieba.posseg as pseg 
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
from pandas import DataFrame
from scipy.spatial.distance import pdist, squareform 
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans

pd.options.display.max_seq_items = 2000
pd.set_option('display.max_colwidth', 30)

# Short comments

In [2]:
client = MongoClient("mongodb://fragrance:fragrance@35.164.86.3:27017/fragrance")
db = client.fragrance
collection = db.short_ratings
short_ratings = pd.DataFrame(list(collection.find({}, {'_id': 0}))) # not including _id column
client.close()

In [3]:
short_ratings.groupby('perfume_id')['user_rating'].count().sort_values(ascending=False)[0:10]
# 1. Elizabeth Arden Green Tea; 
# 2. Jo Malone Wood Sage & Sea Salt; 
# 3. Jo Malone English Pear & Freesia; 
# 4. Hermes Un Jardin Sur Le Nil, 2005
# 5. Serge Lutens L`orpheline, 2014

perfume_id
370823    174
642827    168
526205    166
251428    166
319024    166
237782    166
591443    164
666243    164
315972    164
766253    161
Name: user_rating, dtype: int64

In [4]:
scomments = defaultdict(list)
for pid in short_ratings['perfume_id'].unique():
    df = short_ratings[(short_ratings['perfume_id'] == pid)]
    for c in df['short_comment']:
        scomments[pid].append(c)

In [5]:
scomments_df = pd.DataFrame.from_dict(scomments, orient='index').stack().sum(level=0)

In [6]:
short_comments = pd.DataFrame(scomments_df).rename(columns={0:'short_comments'})

In [7]:
short_comments.head()

Unnamed: 0,short_comments
886259,意外很fuwafuwa~淡淡的甜，像美梦的枕头。
912336,感觉更适用作女香薰衣草 类似桀骜有点像香奈儿的All...
912337,求购我姐同学送她的 然后她一直寄存在我这儿（学霸不用...
249751,真的是粉色的感觉，浪漫甜美，不错
467545,在途到手写，两瓶黄金买到吐血


# Long Comment Articles

In [8]:
client = MongoClient("mongodb://fragrance:fragrance@35.164.86.3:27017/fragrance")
db = client.fragrance
collection = db.perfume_comments
raw_df = pd.DataFrame(list(collection.find({}, {'_id': 0}))) # not including _id column
client.close()

In [9]:
raw_df.head()

Unnamed: 0,comments,perfume_id,url
0,[想谈香水这个话题，是因为两件事情。一件是前段时间去...,251428,
1,[使用香水画面： 在一间专卖老旧精品小饰品、小玩具...,590187,/xiangshui/590187-yishibei...
2,[官方提供小样试闻报告：开篇就是摩卡咖啡的甜却不腻，...,506342,/xiangshui/506342-yishibei...
3,[官方提供小样试闻报告：官方提供的五款试闻小样里，这...,482456,/xiangshui/482456-yishibei...
4,[开头香柠檬的酸涩和粉红胡椒的辛辣搭配的比较和谐，构...,331859,/xiangshui/331859-yishibei...


In [10]:
raw_df.set_index('perfume_id', inplace=True)

In [11]:
raw_df.head()

Unnamed: 0_level_0,comments,url
perfume_id,Unnamed: 1_level_1,Unnamed: 2_level_1
251428,[想谈香水这个话题，是因为两件事情。一件是前段时间去...,
590187,[使用香水画面： 在一间专卖老旧精品小饰品、小玩具...,/xiangshui/590187-yishibei...
506342,[官方提供小样试闻报告：开篇就是摩卡咖啡的甜却不腻，...,/xiangshui/506342-yishibei...
482456,[官方提供小样试闻报告：官方提供的五款试闻小样里，这...,/xiangshui/482456-yishibei...
331859,[开头香柠檬的酸涩和粉红胡椒的辛辣搭配的比较和谐，构...,/xiangshui/331859-yishibei...


In [12]:
raw_df['long_comments'] = raw_df['comments'].apply(','.join)

In [13]:
raw_df.head()

Unnamed: 0_level_0,comments,url,long_comments
perfume_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
251428,[想谈香水这个话题，是因为两件事情。一件是前段时间去...,,想谈香水这个话题，是因为两件事情。一件是前段时间去S...
590187,[使用香水画面： 在一间专卖老旧精品小饰品、小玩具...,/xiangshui/590187-yishibei...,使用香水画面： 在一间专卖老旧精品小饰品、小玩具的...
506342,[官方提供小样试闻报告：开篇就是摩卡咖啡的甜却不腻，...,/xiangshui/506342-yishibei...,官方提供小样试闻报告：开篇就是摩卡咖啡的甜却不腻，一...
482456,[官方提供小样试闻报告：官方提供的五款试闻小样里，这...,/xiangshui/482456-yishibei...,官方提供小样试闻报告：官方提供的五款试闻小样里，这个...
331859,[开头香柠檬的酸涩和粉红胡椒的辛辣搭配的比较和谐，构...,/xiangshui/331859-yishibei...,开头香柠檬的酸涩和粉红胡椒的辛辣搭配的比较和谐，构成...


In [14]:
all_comments = pd.merge(short_comments, raw_df, how='left', left_index=True, right_index=True)

In [15]:
all_comments['all_comments'] = all_comments['short_comments'] + all_comments['long_comments']

In [16]:
all_comments = all_comments.fillna('.')

In [17]:
all_comments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3842 entries, 100077 to 999313
Data columns (total 5 columns):
short_comments    3842 non-null object
comments          3842 non-null object
url               3842 non-null object
long_comments     3842 non-null object
all_comments      3842 non-null object
dtypes: object(5)
memory usage: 180.1+ KB


In [18]:
all_comments.head()

Unnamed: 0,short_comments,comments,url,long_comments,all_comments
100077,“她让我可以走在北京地铁换乘站的人群里。。不因为渺小...,[Narciso Rodriguez for Her...,/xiangshui/100077-naqiansu...,Narciso Rodriguez for Her ...,“她让我可以走在北京地铁换乘站的人群里。。不因为渺小...
100230,就是一颗大橙子，味道蛮简单的，有点像橙子味的芬达,[中味的橙花只是标志性的提下，感觉完全可以一并扔进前...,/xiangshui/100230-panhaili...,中味的橙花只是标志性的提下，感觉完全可以一并扔进前味...,就是一颗大橙子，味道蛮简单的，有点像橙子味的芬达中味...
100401,让人想起Dior 红毒…油腻腻甜得整个人飞起来了！想...,[头几秒和JM家的黑莓月桂好相似，中后调的味道很像去...,/xiangshui/100401-salvator...,头几秒和JM家的黑莓月桂好相似，中后调的味道很像去宜...,让人想起Dior 红毒…油腻腻甜得整个人飞起来了！想...
101105,很浓郁 留香不错 但是缺少温柔 ，但是不是感觉中的晚...,[粉粉的，有点清甜的脂粉花香。没有同名那么与众不同，...,/xiangshui/101105-stella-m...,粉粉的，有点清甜的脂粉花香。没有同名那么与众不同，但...,很浓郁 留香不错 但是缺少温柔 ，但是不是感觉中的晚...
101481,甜甜的想试试,.,.,.,.


In [23]:
# all_comments.reset_index().rename(columns={'index':'perfume_id'})

In [20]:
# all_comments.drop(all_comments[['short_comments', 'long_comments', 'url']], axis=1, inplace=True)

In [21]:
all_comments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3842 entries, 100077 to 999313
Data columns (total 5 columns):
short_comments    3842 non-null object
comments          3842 non-null object
url               3842 non-null object
long_comments     3842 non-null object
all_comments      3842 non-null object
dtypes: object(5)
memory usage: 180.1+ KB


In [24]:
# all_comments = pd.read_csv('../data/all_comments.csv', encoding='utf-8', index_col=0)
# all_comments.rename(columns={' ': 'perfume_id'})
# all_comments.head()

In [None]:
# build corpus
documents = []
for doc in raw_df['comments']:
    documents.append(doc[0])

In [None]:
all_comments.head()

In [None]:
df = raw_df[(raw_df['perfume_id'] == '642827')]
df

In [None]:
# try jieba to split Chinese text
seg_list = []
for doc in documents:
    words = jieba.cut(doc)
    string = " ".join(words)
    seg_list.append(string)

In [None]:
with io.open('models/chinese_stopwords.txt', 'r', encoding='utf8') as f:
     stpwdlst = f.read().split()

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words= stpwdlst,
                                   analyzer= 'word',
                                   max_features=300)

In [None]:
tfidf_docs = tfidf_vectorizer.fit_transform(seg_list)

In [None]:
feature_names = np.array(tfidf_vectorizer.get_feature_names())
print("词语序列：")
print repr(feature_names).decode('unicode_escape')
print("TF IDF Vector：")
print(tfidf_docs.toarray())

In [None]:
def find_top_features(k_features, tfidf_mat):
    '''
    Find top k features in each perfume

    Parameters:
    -----------
    1. number of features for each perfume
    2. TFIDF matrix converted from sparse matrix to 2d numpy array
    '''
    top_features_idx = np.empty([tfidf_mat.shape[0], k_features], dtype=int)
    top_features = np.empty([tfidf_mat.shape[0], k_features], dtype=object)
    for i, row in enumerate(tfidf_mat):
        top_features_idx[i] = np.argsort(row)[::-1][:k_features]
        top_features[i] = feature_names[top_features_idx[i]]
    return top_features, top_features_idx

In [None]:
top_features, top_features_idx = find_top_features(20, tfidf_docs.toarray())

# I extracted the key features from comments! YEAH!

In [None]:
key_features = pd.read_csv('data/perfume_key_features.csv')

In [None]:
key_features.drop('Unnamed: 0', axis=1, inplace=True)
key_features.set_index('perfume_id', inplace=True)

In [None]:
perfume_info = pd.read_csv('data/perfume_features.csv')
perfume_info.set_index('perfume_id', inplace=True)

In [None]:
joined_df = key_features.join(perfume_info, how='left')
joined_df.drop(['Unnamed: 0', 'url'], axis=1, inplace=True)

In [None]:
joined_df['keywords'] = joined_df[joined_df.columns[0:15]].apply(lambda x: ','.join(x),axis=1)
joined_df.drop(['0','1','2','3', '4','5','6','7','8','9','10','11',
                '12','13','14'], axis=1, inplace=True)

In [None]:
# create another dataframe used for keyword analysis
kw_df = joined_df[['brand', 'item_name', 'gender', 'note', 'keywords']]

In [None]:
# check comment keywords by brand
kw_df[kw_df['brand'] == '蒂普提克']

In [None]:
kw_df.info()

In [None]:
# check comment keywords by brand
kw_df[kw_df['gender'] == '中性香']