## In this notebook, I try to group similar aspects. Similarity is measured using word embeddings. The grouping is done through clustering.

In [1]:
import pandas as pd
import gensim
import re
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch



In [2]:
# Load the aspect/opinion pairs produced by the earlier extraction step,
# then report the row count and preview the first rows.
extract_path = r"extract1.csv"
df_extract = pd.read_csv(extract_path)
print(len(df_extract))
df_extract.head()

80942


Unnamed: 0,doc_id,aspect,opinion,rel
0,1,odor,soft,amod
1,1,absorption,large,amod
2,1,brands,trustworthy,nsubj
3,1,brands,big,amod
4,1,work,fast,advmod


In [3]:
# Load the pretrained word2vec embeddings from their binary file.
# NOTE(review): this is an absolute local path — confirm it exists on the machine running the notebook.
embeddings_path = r'D:\nlp_resources\word2vec\GoogleNews-vectors-negative300.bin'
en_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

In [4]:
# Report the size of the embedding vocabulary and the vector dimensionality.
# gensim >= 4 exposes the word -> index mapping as `key_to_index` and raises on
# `.vocab`; gensim 3.x only has `.vocab`. Use whichever attribute is available.
embedding_keys = getattr(en_model, "key_to_index", None)
if embedding_keys is None:
    embedding_keys = en_model.vocab
print("Vocabulary size:", len(embedding_keys))
print("Embedding dimension:", en_model.vector_size)

Vocabulary size: 3000000
Embedding dimension: 300


In [5]:
# Sanity-check the embeddings: list the nearest neighbours of one aspect word
probe_word = "odor"
en_model.most_similar(probe_word)

[('odors', 0.8496967554092407),
 ('smell', 0.7634782791137695),
 ('stench', 0.6854438781738281),
 ('odor_emanating', 0.6836669445037842),
 ('unpleasant_odor', 0.6794102191925049),
 ('noxious_odor', 0.6737198829650879),
 ('foul_odor', 0.6714236736297607),
 ('smells', 0.6644900441169739),
 ('pungent_odor', 0.6629747152328491),
 ('unpleasant_odors', 0.6477565765380859)]

In [6]:
# Show the extracted pairs whose aspect is exactly "odor"
is_odor = df_extract["aspect"] == "odor"
df_extract[is_odor]

Unnamed: 0,doc_id,aspect,opinion,rel
0,1,odor,soft,amod
3244,1329,odor,strong,amod
3245,1329,odor,pungent,amod
6027,2434,odor,pungent,amod
7569,3127,odor,severe,amod
7570,3127,odor,pungent,amod
7571,3127,odor,similar,amod
12630,5188,odor,great,amod
18444,7704,odor,skin-friendly,advmod
23937,10087,odor,breathable,nsubj


In [7]:
# Show the extracted pairs whose aspect is exactly "odors" (the plural is a separate aspect string)
is_odors = df_extract["aspect"] == "odors"
df_extract[is_odors]

Unnamed: 0,doc_id,aspect,opinion,rel
25319,10728,odors,serious,amod
37900,15997,odors,few,amod
72008,30987,odors,pungent,amod


In [8]:
# Show the extracted pairs whose aspect is exactly "smell"
is_smell = df_extract["aspect"] == "smell"
df_extract[is_smell]

Unnamed: 0,doc_id,aspect,opinion,rel
1101,434,smell,good,amod
1197,466,smell,little,amod
1373,527,smell,bad,amod
1657,666,smell,red,nsubj
1658,666,smell,pungent,amod
2504,1021,smell,wet,nsubj
2821,1166,smell,great,amod
2839,1175,smell,big,amod
2840,1175,smell,pungent,amod
2879,1188,smell,just,advmod


In [9]:
# Set of every word that has a pretrained embedding, used for membership tests.
# gensim >= 4 exposes the word -> index mapping as `key_to_index` and raises on
# `.vocab`; gensim 3.x only has `.vocab`. Use whichever attribute is available.
embedding_keys = getattr(en_model, "key_to_index", None)
if embedding_keys is None:
    embedding_keys = en_model.vocab
emb_vocab = set(embedding_keys)

In [10]:
# Construct vocabulary: keep each distinct aspect that consists only of three or
# more lowercase a-z letters and that has a pretrained embedding, numbering the
# kept aspects 0, 1, 2, ... in order of first appearance.
word_pattern = re.compile("^[a-z]{3,}$")
kept_aspects = [
    aspect
    for aspect in df_extract["aspect"].unique()
    if word_pattern.match(aspect) and aspect in emb_vocab
]
index2word = dict(enumerate(kept_aspects))
word2index = {aspect: position for position, aspect in index2word.items()}
print(len(word2index))

2021


In [11]:
# Construct feature matrix: row i holds the embedding of index2word[i]
word_vectors = np.array(
    [en_model.get_vector(index2word[row]) for row in range(len(index2word))]
)
word_vectors.shape

(2021, 300)

In [12]:
# Export for visualization at https://projector.tensorflow.org/
# vocab.tsv gets one word per line, in the same order as the rows written to vectors.tsv.
with open(r"vocab.tsv", "w") as out_f:
    out_f.writelines(index2word[row] + "\n" for row in range(len(index2word)))
np.savetxt(r"vectors.tsv", word_vectors, delimiter="\t")

#### Clustering

In [13]:
# Cluster the aspect embeddings with BIRCH, requesting 500 final clusters.
# `labels` is aligned with the rows of `word_vectors`.
n_aspect_clusters = 500
birch = Birch(n_clusters=n_aspect_clusters)
labels = birch.fit_predict(word_vectors)

In [14]:
# Pair every vocabulary word with its cluster label, both as a table
# (df_word_clusters) and as a word -> label lookup (cluster_dict).
word_clusters = [[labels[row], index2word[row]] for row in range(len(index2word))]
cluster_dict = {member: cluster_label for cluster_label, member in word_clusters}
df_word_clusters = pd.DataFrame(word_clusters, columns=["cluster", "word"])
df_word_clusters.head()

Unnamed: 0,cluster,word
0,368,odor
1,46,absorption
2,0,brands
3,98,work
4,56,price


In [15]:
# Count the words in each cluster, then print how many clusters hold
# more than one word and how many hold more than two.
df_cluster_sizes = df_word_clusters.groupby("cluster").count()
words_per_cluster = df_cluster_sizes["word"]
for size_floor in (1, 2):
    print(len(df_cluster_sizes[words_per_cluster > size_floor]))

449
320


In [16]:
# Spot-check the clustering: for a few hand-picked aspects, print the cluster id
# and every word that landed in the same cluster.
eval_words = ["price", "absorption", "odor", "package", "brand",
              "shipping", "elasticity", "padding", "lining", "material"]
for query in eval_words:
    cluster_id = cluster_dict[query]  # KeyError if the query word was not clustered
    same_cluster = df_word_clusters["cluster"] == cluster_id
    print(query, cluster_id)
    print(df_word_clusters[same_cluster]["word"].unique())
    print("-"*100)

price 56
['price' 'prices' 'cost' 'expense' 'spending' 'fee' 'costs']
----------------------------------------------------------------------------------------------------
absorption 46
['absorption' 'enlargement' 'accumulation' 'agglomeration' 'shrinkage']
----------------------------------------------------------------------------------------------------
odor 368
['odor' 'smell' 'scent' 'odors']
----------------------------------------------------------------------------------------------------
package 253
['packs' 'package' 'packages' 'pack' 'kit' 'bundle']
----------------------------------------------------------------------------------------------------
brand 0
['brands' 'product' 'brand' 'products' 'promotion' 'branding' 'marketing'
 'promotions']
----------------------------------------------------------------------------------------------------
shipping 27
['shipment' 'shipments' 'shipping' 'ship' 'port']
-------------------------------------------------------------------------

## Classify sentiment of opinion pairs

In [42]:
from pycorenlp import StanfordCoreNLP
# The CoreNLP server has to be set up beforehand; the client connects to this address.
corenlp_url = 'http://localhost:9000'
stanford = StanfordCoreNLP(corenlp_url)

In [45]:
def get_sentiment(text):
    """Return the sentiment label CoreNLP assigns to the first sentence of `text`.

    The text is sent through the module-level `stanford` client with only the
    sentiment annotator enabled and JSON output requested. Only the first
    sentence of the response is read; any further sentences are ignored.
    """
    request_properties = {'annotators': 'sentiment',
                          'outputFormat': 'json',
                          'timeout': '5000'}
    annotation = stanford.annotate(text, properties=request_properties)
    first_sentence = annotation["sentences"][0]
    return first_sentence["sentiment"]

In [48]:
# Attach the aspect's cluster id to every extracted pair (-1 when the aspect was not clustered).
df_extract["cluster"] = df_extract["aspect"].apply(lambda x: cluster_dict.get(x, -1))
# The text to classify is the opinion word followed by the aspect, e.g. "soft odor".
df_extract["pair"] = df_extract["opinion"] + " " + df_extract["aspect"]
# Every get_sentiment call is a request to the CoreNLP server, and the same pair text
# occurs in many rows, so classify each distinct pair once and map the result back.
sentiment_by_pair = {pair: get_sentiment(pair) for pair in df_extract["pair"].unique()}
df_extract["sentiment"] = df_extract["pair"].map(sentiment_by_pair)

In [49]:
# How often each sentiment label was assigned to an opinion pair
sentiment_labels = df_extract["sentiment"]
sentiment_labels.value_counts()

Neutral         49283
Positive        26108
Negative         5463
Verypositive       54
Verynegative       34
Name: sentiment, dtype: int64

In [50]:
def replace_sentiment(s):
    """Collapse the two extreme sentiment labels into their plain counterparts.

    "Verypositive" becomes "Positive" and "Verynegative" becomes "Negative";
    any other value is returned unchanged.
    """
    for extreme_label, plain_label in (("Verypositive", "Positive"),
                                       ("Verynegative", "Negative")):
        if s == extreme_label:
            return plain_label
    return s
# Store the three-way (Positive / Neutral / Negative) label next to the raw one and count it
collapsed_labels = df_extract["sentiment"].apply(replace_sentiment)
df_extract["sentiment2"] = collapsed_labels
df_extract["sentiment2"].value_counts()

Neutral     49283
Positive    26162
Negative     5497
Name: sentiment2, dtype: int64

## Summarize opinion by brand

In [51]:
# Load the raw reviews and give each one an id equal to its 0-based row position.
# NOTE(review): this is an absolute local path — confirm it exists on the machine running the notebook.
reviews_path = r"C:\Dropbox\_projects\PG\ds-nlp-interview-question_v2.csv"
df_reviews = pd.read_csv(reviews_path)
df_reviews["id"] = [position for position in range(len(df_reviews))]
df_reviews.head(2)

Unnamed: 0,REVIEW_TEXT_CN,REVIEW_TEXT_EN,ONLINE_STORE,BRAND,YEAR,MONTH,id
0,花王的确实不错，一直用这个牌子,"Kao is really good, always use this brand",tmall,Merries,2016,11,0
1,哈哈哈哈，真心不错的啊。没有异味，非常非常的柔软，吸收量很大，并且不起球，大品牌值得信赖，物...,"Hahahaha, really good. No odor, very very soft...",suning,Huggies,2017,1,1


In [52]:
# Number of reviews per brand
review_brands = df_reviews["BRAND"]
review_brands.value_counts()

Huggies    13339
Pampers    11351
Merries    11310
Name: BRAND, dtype: int64

In [53]:
# Map each review id to its brand. The mapping is built directly from the two
# columns instead of looping with iterrows(), which constructs a Series for
# every row just to read two values from it.
brand_dict = dict(zip(df_reviews["id"], df_reviews["BRAND"]))

In [54]:
def _brand_for(doc_id):
    # Direct indexing on purpose: an unknown doc_id raises KeyError instead of yielding a missing brand
    return brand_dict[doc_id]

# Tag every extracted pair with the brand recorded for its doc_id
df_extract["brand"] = df_extract["doc_id"].apply(_brand_for)
df_extract.head()

Unnamed: 0,doc_id,aspect,opinion,rel,brand,cluster,pair,sentiment,sentiment2
0,1,odor,soft,amod,Huggies,368,soft odor,Neutral,Neutral
1,1,absorption,large,amod,Huggies,46,large absorption,Neutral,Neutral
2,1,brands,trustworthy,nsubj,Huggies,0,trustworthy brands,Neutral,Neutral
3,1,brands,big,amod,Huggies,0,big brands,Neutral,Neutral
4,1,work,fast,advmod,Huggies,98,fast work,Neutral,Neutral


In [55]:
# Distinct brand names present among the extracted pairs
brand_column = df_extract["brand"]
brands = brand_column.unique()

In [63]:
# For each hand-picked aspect: print its cluster and the cluster's member words, then,
# per sentiment, the five most frequent opinion pairs in that cluster with their
# counts per brand and in total.
eval_words = ["price", "absorption", "odor", "package", "brand",
              "shipping", "elasticity", "padding", "lining", "material"]
for query in eval_words:
    cluster_id = cluster_dict[query]  # KeyError if the query word was not clustered
    cluster_members = df_word_clusters[df_word_clusters["cluster"] == cluster_id]["word"]
    print("Query word:", query, "\tWord cluster:", cluster_id)
    print("Aspects:", cluster_members.unique())
    rows_in_cluster = df_extract["cluster"] == cluster_id
    for polarity in ["Positive", "Neutral", "Negative"]:
        rows_with_polarity = df_extract["sentiment2"] == polarity
        selected = df_extract[rows_in_cluster & rows_with_polarity]
        # Rows are pair texts, columns are brands, cells are occurrence counts
        pair_by_brand = pd.crosstab(selected["pair"], selected["brand"])
        pair_by_brand["Total"] = pair_by_brand.sum(axis=1)
        ranked = pair_by_brand.sort_values("Total", ascending=False)
        print("\n*"+polarity+"*"+"-"*40)
        print(ranked[:5])
    print("="*100)

Query word: price 	Word cluster: 56
Aspects: ['price' 'prices' 'cost' 'expense' 'spending' 'fee' 'costs']

*Positive*----------------------------------------
brand             Huggies  Merries  Pampers  Total
pair                                              
affordable price       92       51       71    214
good price             64       66       73    203
cheaper price          59       51       54    164
expensive price        58       47       48    153
high price             61       31       55    147

*Neutral*----------------------------------------
brand              Huggies  Merries  Pampers  Total
pair                                               
high cost                4        5        7     16
better price             5        4        2     11
affordable prices        2        3        0      5
high prices              3        2        0      5
curious price            3        0        0      3

*Negative*----------------------------------------
brand         Hugg

Judging from the per-brand counts printed above for the odor cluster, it seems that Pampers has a stronger smell.

In [64]:
# Save the extracted pairs together with the columns added in this notebook (row index omitted)
output_path = r"extract1_clustered.csv"
df_extract.to_csv(output_path, index=False)