In [1]:
import pandas as pd
import numpy as np
import re
import jieba
import matplotlib.pyplot as plt
from opencc import OpenCC

In [2]:
df = pd.read_excel('../data/sentiment_analysis_latest.xlsx')

In [3]:
df['sentiment_label'].value_counts()

neutral     3267
negative    1943
positive     959
Name: sentiment_label, dtype: int64

In [40]:
df[(df['sentiment_label'] == 2)].sample(6)

Unnamed: 0,conversationId,datetime,from_whom,msg_replace,pp,max_pp,sentiment_label
4288,543101a7-3c19-4fa5-9c9c-70f5a1b86c18,2021-11-27 21:02:05,c:,其實我感覺到你對課堂嘅重視..會好用心準備..校長嘅說話都確實難免令人耿耿於懷 幾個月其實都...,24,59,2
5792,734dfb95-747a-4c3e-bd6a-d90bab176628,2023-03-31 01:10:15,c:,我聽到嘅[***name***]對於4年入面自己嘅表現都好後悔，覺得係分手嘅主因 但係我見到...,12,73,2
581,50f95550-8d13-4973-b576-8f38fe5e3d08,2022-01-09 03:28:11,c:,"好多謝你今日肯花時間同我分享咁多，雖然我地未必可以諗到一個完美嘅解決方法,不過希望你今日傾過...",82,85,2
2323,17886c87-016c-480a-85fe-99362878a940,2020-10-06 03:40:24,c:,你響屋企感覺到背叛同敵意，其實都對你造成好大傷害，咁呢一刻離開呢個家，都係一個機會比你可以抖...,16,77,2
2045,e7cb758c-97c2-4596-a230-4e78adb9fc72,2022-11-22 08:42:47,c:,嗯嗯 可能[***name***]都習慣左去諗多d~ 都係出於好意~ 為未來負責任~ 平時如...,38,49,2
6154,5347794a-d26c-4a62-9629-343fe80d6ecd,2023-01-12 23:08:16.400000,c:,唔緊要呢～我地好free,90,104,2


In [4]:
replace_tag = {'positive':2, 'neutral':1, 'negative':0}
df['sentiment_label']= df['sentiment_label'].replace(replace_tag)

In [5]:
df['sentiment_label'].value_counts()

1    3267
0    1943
2     959
Name: sentiment_label, dtype: int64

In [6]:
df_msg = df.copy()

# tokenization

In [7]:
import jieba
from pycantonese.word_segmentation import Segmenter
import pycantonese

In [8]:
# load self-defined tokenization dictionary
jieba.set_dictionary('../data/tokenization_dict/dict.txt')
jieba.load_userdict('../data/tokenization_dict/hk_dict.txt')

Building prefix dict from C:\Users\Harrison\Documents\HKU\paper\GPT_sentiment_analysis\supplementary_material\data\tokenization_dict\dict.txt ...
Loading model from cache C:\Users\Harrison\AppData\Local\Temp\jieba.uca32d7fd4479c948cd6f929a868b2453.cache
Loading model cost 0.648 seconds.
Prefix dict has been built successfully.


In [9]:
s2hk=[]
converter = OpenCC('s2hk.json')
for i in range(len(df_msg)):
    s2hk.append(converter.convert(df_msg.msg_replace[i]))
df_msg['msg_replace']=s2hk

In [10]:
# stopwords
stop_words=['乜','個','嗰','吖','啦','又','呀','咗','咁','呢','咩','哦','哩','啲','啊','喎','嗯','噢','喔',
            '果','係','左','先','架','㗎','噶','嘎','嘅','既','嘢','噉','即','同','都','有','冇','的','黎','嚟',
            '啦', '啵', '喺', '嗱', '嘅', '噃', '咁', '噉', '噓', '唔', '嘛', '咩', '嘢', '啊', '嗚', '嘻', '啫',
            '啱', '添', '喇', '甘', '咯', '啊']

cn_stopwords=list(pd.read_csv("../data/stopwords/cn_stopwords.txt").to_numpy().squeeze())
scu_stopwords=list(pd.read_csv("../data/stopwords/scu_stopwords.txt").to_numpy().squeeze())
traditional_cn_stopwords = []
traditional_scu_stopwords = []
converter = OpenCC('s2hk.json')
for i in range(len(cn_stopwords)):
    traditional_cn_stopwords.append(converter.convert(cn_stopwords[i]))
for i in range(len(scu_stopwords)):
    traditional_scu_stopwords.append(converter.convert(scu_stopwords[i]))
    
pycantonese_stopwords=list(pycantonese.stop_words())

stop_words_total = stop_words + traditional_cn_stopwords + traditional_scu_stopwords + pycantonese_stopwords

In [11]:
# tokenization
speech_list = list(df_msg['msg_replace'])
rule = re.compile(r"[^\u4e00-\u9fa5]")
speech_list = [list(jieba.cut(rule.sub('', speech))) for speech in speech_list]

In [12]:
# remove stopwords
for idx, speech in enumerate(speech_list):
    speech_list[idx] = ' '.join([word for word in speech if word.strip() not in stop_words_total])

df_msg['msg_token'] = speech_list

In [13]:
df_msg['msg_token'] = [i.split(' ') for i in df_msg['msg_token']]

# construct embedding layer with word2vec

In [14]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from gensim.models import Word2Vec

In [15]:
MAX_NB_WORDS = 4096

df_msg['msg_token'] = df_msg['msg_token'].fillna("")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_msg['msg_token'].values)
word_index = tokenizer.word_index
print('There are totally %s different tokens.' % len(word_index))

There are totally 8914 different tokens.


In [16]:
w2v_model = Word2Vec.load('../word2vec_embedding/openup_word2vec_20231001.model')

In [17]:
def get_message_vector(msg, model):
    word_vectors = [model.wv[word] for word in msg if word in model.wv.key_to_index]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    message_vector = np.mean(word_vectors, axis=0)
    return message_vector

# Get the vector representation of each message
message_vectors = np.array([get_message_vector(msg, w2v_model) for msg in df_msg['msg_token']])

In [18]:
labels = df_msg['sentiment_label']

In [19]:
# split training and test set
X_train, X_test, Y_train, Y_test = train_test_split(message_vectors, labels, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(4935, 100) (4935,)
(1234, 100) (1234,)


# Logistic regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [21]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)
lr_predictions = lr_model.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(Y_test, lr_predictions))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       380
           1       0.68      0.81      0.74       666
           2       0.55      0.27      0.36       188

    accuracy                           0.66      1234
   macro avg       0.63      0.56      0.57      1234
weighted avg       0.65      0.66      0.64      1234



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
model_lg_acc = cross_val_score(estimator=lr_model, X=X_train, y=Y_train, cv=5, n_jobs=-1)
model_lg_acc

array([0.65957447, 0.65248227, 0.63424519, 0.68490375, 0.64944276])

In [23]:
print(lr_model.score(X_test, Y_test))

0.660453808752026


In [24]:
# F1
from sklearn.metrics import f1_score

Y_pred = lr_model.predict(X_test)

f1_score(Y_test, Y_pred, average='micro')

0.660453808752026

In [25]:
f1_score(Y_test, Y_pred, average='macro')

0.5725007300768382

In [26]:
f1_score(Y_test, Y_pred, average='weighted')

0.6440150834186715

# SVM

In [27]:
from sklearn import svm

In [28]:
# SVM
svm_model = svm.SVC(C=0.1, kernel='rbf')
svm_model.fit(X_train, Y_train)
svm_predictions = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(Y_test, svm_predictions))

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.57      0.61       380
           1       0.65      0.87      0.75       666
           2       0.82      0.07      0.14       188

    accuracy                           0.66      1234
   macro avg       0.71      0.50      0.50      1234
weighted avg       0.68      0.66      0.61      1234



In [29]:
model_svm_acc = cross_val_score(estimator=svm_model, X=X_train, y=Y_train, cv=5, n_jobs=-1)
model_svm_acc

array([0.67173252, 0.67071935, 0.65552178, 0.67882472, 0.66160081])

In [30]:
print(svm_model.score(X_test, Y_test))

0.6564019448946515


In [31]:
# F1
from sklearn.metrics import f1_score

y_pred = svm_model.predict(X_test)

f1_score(Y_test, y_pred, average='micro')

0.6564019448946515

In [32]:
f1_score(Y_test, y_pred, average='macro')

0.4973258246045725