In [3]:
#Training semi-supervised Anchored COREX model
import numpy as np
import scipy.sparse as ss
import pickle
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from ast import literal_eval

# Load the preprocessed reviews and the per-topic anchor keywords.
data_anchored_corex = pd.read_csv("Preprocessed_Scraped_reviews.csv")
train_topics = pd.read_csv("Training_topics.csv")

# Each 'Clean' cell is parsed with literal_eval and its items are joined with
# spaces, giving one document string per review for the vectorizer.
data_words_anchored_corex = [
    ' '.join(literal_eval(tokens)) for tokens in data_anchored_corex['Clean']
]
# Each 'Keys' cell is parsed the same way; the parsed values are handed to
# CorEx as anchors (presumably one list of anchor words per topic -- confirm
# against Training_topics.csv).
keywords = [literal_eval(anchor_words) for anchor_words in train_topics['Keys']]

# Build a binary document-word matrix restricted to at most 20000 features,
# with English stop words removed.
vectorizer_anchored_corex = CountVectorizer(stop_words='english', max_features=20000, binary=True)

doc_word_anchored_corex = vectorizer_anchored_corex.fit_transform(data_words_anchored_corex)
doc_word_anchored_corex = ss.csr_matrix(doc_word_anchored_corex)

# Column labels of the matrix, passed to CorEx so topics and anchors can be
# expressed as words.
words = list(np.asarray(vectorizer_anchored_corex.get_feature_names_out()))

# Train the CorEx topic model with 6 topics (fixed seed for repeatable runs).
topic_model_anchored_corex = ct.Corex(n_hidden=6, words=words, max_iter=1000, verbose=False, seed=2022)
topic_model_anchored_corex.fit(doc_word_anchored_corex, words=words, anchors=keywords, anchor_strength=3);

# Save the model. The context manager guarantees the file is flushed and closed
# before any later cell reads it back.
with open('CorEx_Train_model_3May23.sav', 'wb') as model_file:
    pickle.dump(topic_model_anchored_corex, model_file)

# Export the topics, one row per topic, to a spreadsheet.
topic_list_anchored_corex = topic_model_anchored_corex.get_topics()

df_anchored_corex = pd.DataFrame()
df_anchored_corex['Topics'] = topic_list_anchored_corex
df_anchored_corex.to_excel('CorEx_topics_final_3May23.xlsx')



In [4]:
# Show every topic of the trained CorEx model as "<index>: word, word, ..."
anchored_corex_topics = topic_model_anchored_corex.get_topics()
for topic_index, topic_entries in enumerate(anchored_corex_topics):
    # Each entry is a 3-tuple; only its first field (the word) is printed.
    topic_terms, _, _ = zip(*topic_entries)
    print(f"{topic_index}: " + ', '.join(topic_terms))

0: processor, storage, memory, ram, gb, video, web, core, cpu, task
1: laptop, great, purchase, product, recommend, happy, wonderful, highly, definitely, good
2: size, display, lightweight, keyboard, big, large, type, screen, touchscreen, slim
3: work, school, game, gaming, business, software, homework, productivity, schoolwork, play
4: great, product, store, wonderful, price, apple, service, value, quality, deal
5: battery, life, charge, port, long, hold, usb, fingerprint, hour, print


In [8]:
# Label each review with the topics whose top words it contains
import pandas as pd
import pickle
from ast import literal_eval

data = pd.read_csv("Preprocessed_Scraped_reviews.csv")

# One set of tokens per review, so overlap with a topic's words is a cheap
# set intersection.
words_set = [set(literal_eval(tokens)) for tokens in data['Clean']]

# NOTE(review): unpickling can execute arbitrary code; only load model files
# produced by this project's own training step.
with open("CorEx_Train_model_3May23.sav", 'rb') as model_file:
    model = pickle.load(model_file)

# For every topic, keep the words whose second tuple field is positive
# (presumably the word's mutual information with the topic -- confirm against
# the corextopic get_topics documentation).
topic_list = [
    {entry[0] for entry in topic_entries if entry[1] > 0}
    for topic_entries in model.get_topics()
]

# aspect_list[i][j] is 1 when review j shares at least one word with topic i,
# else 0. The number of topics comes from the loaded model rather than being
# hard-coded, and the loop variable no longer rebinds the global `words`.
aspect_list = [
    [1 if review_words & topic_words else 0 for review_words in words_set]
    for topic_words in topic_list
]

# Add one 0/1 indicator column per topic and save the labelled reviews.
for i, topic_labels in enumerate(aspect_list):
    data['Topic ' + str(i)] = topic_labels
data.to_csv('labelled_aspect_3May23.csv')