In [26]:
import pandas as pd
import ast

# 无监督聚类

In [27]:
def get_df(input_file):
    """Flatten the ``label`` column of a CSV file into one row per labelled item.

    Each cell of the ``label`` column is parsed with ``ast.literal_eval`` and
    iterated; every item found is expected to be indexable, with element 0
    used as the text and element 1 as its category.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of a CSV file
            that has a ``label`` column.

    Returns:
        A DataFrame with the columns ``content`` and ``cate``, holding the
        items of all rows in file order.
    """
    raw = pd.read_csv(input_file)

    # Collect the parsed items of every row into one flat list.
    pairs = []
    for label_text in raw['label']:
        pairs.extend(ast.literal_eval(label_text))

    data = pd.DataFrame({
        'content': [pair[0] for pair in pairs],
        'cate': [pair[1] for pair in pairs],
    })
    return data

In [28]:
# Locations of the two aspect/category label files.
input_file = '../data/good/aspect_category.csv'
input_file_bad = '../data/bad/aspect_category.csv'

# Flatten both files with get_df.
df_good, df_bad = get_df(input_file), get_df(input_file_bad)

# Stack the two frames. reset_index() (without drop) renumbers the rows and
# keeps the previous row labels as an extra 'index' column.
data = pd.concat([df_good, df_bad])
data = data.reset_index()

In [29]:
# Report (rows, columns) of the combined frame.
print("data.shape: ", data.shape)

data.shape:  (5304, 3)


# kmeans

In [30]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn import metrics
import os
import torch
from tqdm import tqdm

In [31]:
%%time


# Use the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder from the same checkpoint and move the
# encoder to the selected device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: user 2.24 s, sys: 461 ms, total: 2.7 s
Wall time: 2.24 s


In [32]:
%%time
# Encode every sentence on its own and collect the model's pooled output.
pooled_ls = []

# Inference only: make evaluation mode explicit and disable gradient tracking
# so no autograd graph is built (and kept in memory) for each forward pass.
model.eval()
with torch.no_grad():
    for i in tqdm(range(len(data))):
        inputs = tokenizer(data['content'][i], padding="max_length", truncation=True, max_length=56, return_tensors="pt").to(device)
        pooled = model(**inputs).pooler_output.cpu()
        # Stored as a nested list with a leading batch dimension of 1; the
        # reshape in the following cell relies on this layout.
        pooled_ls.append(pooled.numpy().tolist())

# Release cached GPU blocks once at the end instead of after every sentence.
torch.cuda.empty_cache()

100%|██████████| 5304/5304 [01:58<00:00, 44.85it/s]

CPU times: user 1min 37s, sys: 21.8 s, total: 1min 58s
Wall time: 1min 58s





In [33]:
%%time

#pooled = bert_output.pooler_output.to(torch.device('cpu'))
# Pair each sentence with its embedding and append both columns to `data`.
bert_df = pd.DataFrame(
    {
        "sentence": data['content'].tolist(),
        "embedding": pooled_ls,
    }
)
df = pd.concat([data, bert_df], axis=1)

# Built from one entry per sentence; only its length is used in this cell.
bert_features = pd.DataFrame(pooled_ls)

# Flatten the nested embeddings into a (n_sentences, 768) matrix for clustering.
x = np.array(pooled_ls).reshape((len(bert_features), 768))
print(f"the data shape is {len(bert_features)}")

the data shape is 5304
CPU times: user 264 ms, sys: 353 µs, total: 264 ms
Wall time: 261 ms


In [34]:
# Number of clusters to fit.
Best_K = 100
# Fixed seed so that cluster ids (and the files written from them further
# down) are the same on every Restart & Run All.
KMEANS_SEED = 42

mb_kmeans = MiniBatchKMeans(n_clusters=Best_K, random_state=KMEANS_SEED)
# y_pred[j] is the cluster id assigned to row j of the feature matrix x.
y_pred = mb_kmeans.fit_predict(x)

In [35]:
# Store the clustering result: the cluster id becomes the first column,
# followed by all columns of df.
feat_names_Kmeans = f"Kmeans_{Best_K}"
train_kmeans = pd.concat([pd.Series(y_pred, name=feat_names_Kmeans), df], axis=1)

In [36]:
# Preview the first five rows of the clustered frame.
train_kmeans.head(5)

Unnamed: 0,Kmeans_100,index,content,cate,sentence,embedding
0,74,0,comfortable,feelings,comfortable,"[[-0.7401604056358337, -0.07353582978248596, 0..."
1,24,1,being able to put your phone in your pocket,expectancy,being able to put your phone in your pocket,"[[-0.9051737785339355, -0.4930358827114105, -0..."
2,84,2,green color was perfect,color,green color was perfect,"[[-0.8602860569953918, -0.16006290912628174, -..."
3,37,3,running,scene,running,"[[-0.6941463947296143, -0.09636484086513519, 0..."
4,70,4,have the black ones,purchase_behavior,have the black ones,"[[-0.8123378753662109, -0.17152729630470276, 0..."


In [37]:
# Build one summary row per cluster: its id, a representative phrase (the
# first distinct sentence), all distinct sentences, and the category of the
# cluster's first row.
res_ls = []
num = []
cate_ls = []
normed_key = []
for cluster_id in range(Best_K):
    # Select the cluster's rows once and reuse the selection below.
    members = train_kmeans[train_kmeans[feat_names_Kmeans] == cluster_id]
    # Named `sentences` rather than `x` so the feature matrix `x` used for
    # clustering is not overwritten by this cell.
    sentences = members['sentence'].unique().tolist()

    num.append(cluster_id)
    res_ls.append(sentences)
    if sentences:
        normed_key.append(sentences[0])
        cate_ls.append(members['cate'].tolist()[0])
    else:
        # A cluster can end up without members; mark it with the "null"
        # sentinel in both columns so it is removed by the filter below
        # instead of raising IndexError on the category lookup.
        normed_key.append("null")
        cate_ls.append("null")

df_res = pd.DataFrame({'number': num,'normed_word': normed_key,'sentences': res_ls, 'cate':cate_ls})
# Drop the sentinel rows of empty clusters.
df_res = df_res[df_res['normed_word']!='null']

In [38]:
# Persist the per-cluster summary; the DataFrame index is not written.
df_res.to_csv(path_or_buf="raw_data_clustered.csv", index=False)

In [40]:
# Key table: each distinct (representative phrase, category) pair from the
# cluster summary, written to key.csv without the index.
df_key_dict = df_res.loc[:, ['normed_word', 'cate']].drop_duplicates()
df_key_dict.to_csv('key.csv', index=False)

In [42]:
# Show the keys whose category is 'size'.
df_key_dict.loc[df_key_dict['cate'] == 'size']

Unnamed: 0,normed_word,cate
0,squeeze my waist so tight,size
1,fit perfectly!,size
10,fit well,size
17,They are longer in the leg and higher in the w...,size
19,fit true to size,size
21,great for my legs,size
29,length is really good,size
31,The fit is really flattering too,size
32,is tight in a good way and very comfy,size
34,fit fine,size


## func to normalize & extract feelings

In [47]:
# Example query as a (phrase, category) tuple.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# because other cells may refer to it - consider renaming it everywhere.
input = ("fit well", "size")

# Reload the saved key table and keep the keys of the query's category.
df_keys = pd.read_csv('key.csv')
key_ls = df_keys.loc[df_keys['cate'] == input[1], 'normed_word'].tolist()

In [48]:
# Display the keys selected for the query's category.
key_ls

['squeeze my waist so tight',
 'fit perfectly!',
 'fit well',
 'They are longer in the leg and higher in the waist',
 'fit true to size',
 'great for my legs',
 'length is really good',
 'The fit is really flattering too',
 'is tight in a good way and very comfy',
 'fit fine',
 're not too short',
 'be a little short',
 'perfect fit',
 'fit']