In [1]:
import pandas as pd
import numpy as np
import re
import jieba
import matplotlib.pyplot as plt

In [40]:
df = pd.read_excel('../data/sentiment_analysis_latest.xlsx')

In [41]:
df['sentiment_label'].value_counts()

neutral     3267
negative    1943
positive     959
Name: sentiment_label, dtype: int64

In [42]:
replace_tag = {'positive':2, 'neutral':1, 'negative':0}
df['sentiment_label']= df['sentiment_label'].replace(replace_tag)

In [43]:
df['sentiment_label'].value_counts()

1    3267
0    1943
2     959
Name: sentiment_label, dtype: int64

# tokenization

In [6]:
# load self-defined tokenization dictionary
jieba.set_dictionary('../data/tokenization_dict/dict.txt')
jieba.load_userdict('../data/tokenization_dict/hk_dict.txt')

Building prefix dict from C:\Users\Harrison\Downloads\2023_07_19_supplemet_material\2023_07_19_supplemet_material\data\tokenization_dict\dict.txt ...
Loading model from cache C:\Users\Harrison\AppData\Local\Temp\jieba.u92fdb9fc4f964cbd4d35e3368f83bac9.cache
Loading model cost 1.215 seconds.
Prefix dict has been built successfully.


In [7]:
# import stopwords list
import pycantonese
stop_words = pycantonese.stop_words()
stop_words  

{'一啲',
 '一定',
 '不如',
 '不過',
 '之後',
 '乜',
 '乜嘢',
 '人哋',
 '但係',
 '你',
 '你哋',
 '佢',
 '佢哋',
 '係',
 '個',
 '其他',
 '冇',
 '再',
 '到',
 '即',
 '即係',
 '原來',
 '去',
 '又',
 '可以',
 '可能',
 '同',
 '同埋',
 '吖',
 '呀',
 '呢',
 '咁',
 '咗',
 '咩',
 '咪',
 '哦',
 '哩',
 '哩個',
 '哩啲',
 '哩度',
 '哩樣',
 '唔',
 '唔使',
 '唔係',
 '啊',
 '啲',
 '喎',
 '喺',
 '喺度',
 '嗯',
 '嗰',
 '嗰個',
 '嗰啲',
 '嗰度',
 '嘅',
 '嘢',
 '噉',
 '噉樣',
 '因為',
 '多',
 '太',
 '好',
 '如果',
 '就',
 '已經',
 '幾',
 '幾多',
 '得',
 '想',
 '應該',
 '成日',
 '我',
 '我哋',
 '或者',
 '所以',
 '最',
 '會',
 '有',
 '有冇',
 '有啲',
 '未',
 '梗係',
 '然之後',
 '由',
 '真係',
 '睇',
 '知',
 '而',
 '而家',
 '自己',
 '要',
 '覺得',
 '話',
 '諗',
 '講',
 '譬如',
 '跟住',
 '返',
 '過',
 '邊個',
 '都',
 '點',
 '點樣',
 '點解'}

In [44]:
# keep Chinese char, English words and numbers, and then remove stopwords
stop_words = [w.replace('\n', '') for w in stop_words]
stop_words = [w.replace(' ', '') for w in stop_words]
speech_list = list(df['msg_replace'])
rule = re.compile(r"[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u4e00-\u9fa5]")
speech_list = [list(jieba.cut(rule.sub('', speech))) for speech in speech_list]
for idx, speech in enumerate(speech_list):
    speech_list[idx] = ' '.join([word for word in speech if word.strip() not in stop_words])

In [45]:
df['msg_token'] = speech_list

# trainng and test set split

In [12]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

In [18]:
MAX_NB_WORDS = 3000
MAX_SEQUENCE_LENGTH = 300

df['msg_token'] = df['msg_token'].fillna("")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['msg_token'].values)
word_index = tokenizer.word_index
print('There are totally %s different tokens.' % len(word_index))

There are totally 10442 different tokens.


In [19]:
# vectorization
X = tokenizer.texts_to_sequences(df['msg_token'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = df['sentiment_label']

print(X.shape)
print(Y.shape)

(6169, 300)
(6169,)


In [20]:
# split training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(4935, 300) (4935,)
(1234, 300) (1234,)


# Logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

model_lg = LogisticRegression()
model_lg.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
model_lg_acc = cross_val_score(estimator=model_lg, X=X_train, y=Y_train, cv=5, n_jobs=-1)
model_lg_acc

array([0.55927052, 0.53292806, 0.53495441, 0.55015198, 0.54609929])

In [26]:
print(model_lg.score(X_test, Y_test))

0.5494327390599676


In [48]:
# F1
from sklearn.metrics import f1_score

Y_pred = model_lg.predict(X_test)

f1_score(Y_test, Y_pred, average='micro')

0.5494327390599676

In [51]:
f1_score(Y_test, Y_pred, average='macro')

0.3373299186956398

In [56]:
f1_score(Y_test, Y_pred, average='weighted')

0.47210588608840226

# SVM

In [34]:
from sklearn import svm

model_svm = svm.SVC(C=0.1, kernel='rbf')
model_svm.fit(X_train, Y_train)

In [35]:
model_svm_acc = cross_val_score(estimator=model_svm, X=X_train, y=Y_train, cv=5, n_jobs=-1)
model_svm_acc

array([0.56940223, 0.55319149, 0.54407295, 0.56534954, 0.56231003])

In [38]:
print(model_svm.score(X_test, Y_test))

0.5672609400324149


In [49]:
# F1
from sklearn.metrics import f1_score

y_pred = model_svm.predict(X_test)

f1_score(Y_test, y_pred, average='micro')

0.5672609400324149

In [50]:
f1_score(Y_test, y_pred, average='macro')

0.35392623658566663