In [1]:
pip install liwc

Defaulting to user installation because normal site-packages is not writeable
Collecting liwc
  Downloading liwc-0.5.0-py2.py3-none-any.whl (5.1 kB)
Installing collected packages: liwc
Successfully installed liwc-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import liwc
import jieba
from collections import Counter

# load data

In [60]:
df = pd.read_excel('../data/sentiment_analysis_latest_final.xlsx')

In [61]:
df['sentiment_label'].value_counts()

1    3267
0    1943
2     959
Name: sentiment_label, dtype: int64

# tokenization

In [62]:
# load self-defined tokenization dictionary
jieba.set_dictionary('../data/tokenization_dict/dict.txt')
jieba.load_userdict('../data/tokenization_dict/hk_dict.txt')

Building prefix dict from C:\Users\Harrison\Documents\HKU\paper\GPT_sentiment_analysis\supplementary_material\data\tokenization_dict\dict.txt ...
Loading model from cache C:\Users\Harrison\AppData\Local\Temp\jieba.uca32d7fd4479c948cd6f929a868b2453.cache
Loading model cost 0.862 seconds.
Prefix dict has been built successfully.


In [63]:
# keep Chinese char, and then remove stopwords
speech_list = list(df['msg_replace'])
rule = re.compile(r"[^\u4e00-\u9fa5]")
speech_list = [list(jieba.cut(rule.sub('', speech))) for speech in speech_list]

In [64]:
df['msg_token'] = speech_list

In [65]:
def _parse_categories(lines):
    """
    Read (category_id, category_name) pairs from the categories section.
    Each line consists of an integer followed a tab and then the category name.
    This section is separated from the lexicon by a line consisting of a single "%".
    """
    for line in lines:
        line = line.strip()
        if line == "%":
            return
        # ignore non-matching groups of categories
        if "\t" in line:
            category_id, category_name = line.split("\t", 1)
            yield category_id, category_name


def _parse_lexicon(lines, category_mapping):
    """
    Read (match_expression, category_names) pairs from the lexicon section.
    Each line consists of a match expression followed by a tab and then one or more
    tab-separated integers, which are mapped to category names using `category_mapping`.
    """
    for line in lines:
        line = line.strip()
        parts = line.split("\t")
        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]


def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)
    """
    with open(filepath, encoding='utf_8_sig') as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name)
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    return lexicon, list(category_mapping.values())

In [66]:
parse, category_names = read_dic('../data/Traditional_Chinese_LIWC2015_Dictionary_v1.5.dic')

In [67]:
# psychological meaningful categories in LIWC
category_names

['function (Function Words)',
 'pronoun (Pronouns)',
 'ppron (Personal Pronouns)',
 'i (I)',
 'we (We)',
 'you (You)',
 'shehe (SheHe)',
 'they (They)',
 'youpl (youpl)',
 'ipron (Impersonal Pronouns)',
 'prep (Prepositions)',
 'auxverb (Auxiliary Verbs)',
 'adverb (Adverbs)',
 'conj (Conjunctions)',
 'negate (Negations)',
 'quanunit (quanunit)',
 'prepend (prepend)',
 'specart (specart)',
 'particle (particle)',
 'modal_pa (modal_pa)',
 'general_pa (general_pa)',
 'compare (Comparisons)',
 'interrog (Interrogatives)',
 'number (Numbers)',
 'quant (Quantifiers)',
 'tensem (tensem)',
 'focuspast (Past Focus)',
 'focuspresent (Present Focus)',
 'focusfuture (Future Focus)',
 'progm (progm)',
 'affect (Affect)',
 'posemo (Positive Emotions)',
 'negemo (Negative Emotions)',
 'anx (Anx)',
 'anger (Anger)',
 'sad (Sad)',
 'social (Social)',
 'family (Family)',
 'friend (Friends)',
 'female (Female)',
 'male (Male)',
 'cogproc (Cognitive Processes)',
 'insight (Insight)',
 'cause (Causal)',
 

In [68]:
df['liwc_sentiment'] = [[] for _ in range(len(df))]

In [69]:
# negemo (Negative Emotions) for negative sentiment & posemo (Positive Emotions) for positive sentiment
for i in range(len(df['msg_token'])):
    for j in range(len(df['msg_token'][i])):
        try:
            if 'negemo (Negative Emotions)' in parse[df['msg_token'][i][j]]:
                df['liwc_sentiment'][i].append('negemo (Negative Emotions)')
            elif 'posemo (Positive Emotions)' in parse[df['msg_token'][i][j]]:
                df['liwc_sentiment'][i].append('posemo (Positive Emotions)')
        except:
            pass

In [70]:
df['liwc_sentiment_label'] = None

In [71]:
# select the most frequent sentiment in each message. If no sentiment labeled by liwc, label it as 'neutral'
from collections import Counter
 
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]
   
for i in range(len(df['liwc_sentiment'])):
    if len(df['liwc_sentiment'][i]) != 0:
        liwc_label = most_frequent(df['liwc_sentiment'][i])
        df['liwc_sentiment_label'][i] = liwc_label
    else:
        df['liwc_sentiment_label'][i] = 'neutral'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['liwc_sentiment_label'][i] = 'neutral'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['liwc_sentiment_label'][i] = liwc_label


In [72]:
replace_label = {'negemo (Negative Emotions)':0, 'neutral':1, 'posemo (Positive Emotions)':2}
df['liwc_sentiment_label'] = df['liwc_sentiment_label'].replace(replace_label)

# evaluation

In [101]:
from sklearn.metrics import classification_report
import seaborn as sns


y_true = df['sentiment_label'].tolist()
y_pred = df['liwc_sentiment_label'].tolist()

# 生成分类报告
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.55      0.32      0.40      1943
           1       0.69      0.59      0.64      3267
           2       0.26      0.60      0.36       959

    accuracy                           0.51      6169
   macro avg       0.50      0.50      0.47      6169
weighted avg       0.58      0.51      0.52      6169

