In [1]:
pip install liwc

Defaulting to user installation because normal site-packages is not writeable
Collecting liwc
  Downloading liwc-0.5.0-py2.py3-none-any.whl (5.1 kB)
Installing collected packages: liwc
Successfully installed liwc-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [31]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import liwc
import jieba
from collections import Counter

In [14]:
def _parse_categories(lines):
    """
    Read (category_id, category_name) pairs from the categories section.
    Each line consists of an integer followed a tab and then the category name.
    This section is separated from the lexicon by a line consisting of a single "%".
    """
    for line in lines:
        line = line.strip()
        if line == "%":
            return
        # ignore non-matching groups of categories
        if "\t" in line:
            category_id, category_name = line.split("\t", 1)
            yield category_id, category_name


def _parse_lexicon(lines, category_mapping):
    """
    Read (match_expression, category_names) pairs from the lexicon section.
    Each line consists of a match expression followed by a tab and then one or more
    tab-separated integers, which are mapped to category names using `category_mapping`.
    """
    for line in lines:
        line = line.strip()
        parts = line.split("\t")
        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]


def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)
    """
    with open(filepath, encoding='utf_8_sig') as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name)
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    return lexicon, list(category_mapping.values())

In [28]:
parse, category_names = read_dic('../data/Traditional_Chinese_LIWC2015_Dictionary_v1.5.dic')

In [29]:
category_names

['function (Function Words)',
 'pronoun (Pronouns)',
 'ppron (Personal Pronouns)',
 'i (I)',
 'we (We)',
 'you (You)',
 'shehe (SheHe)',
 'they (They)',
 'youpl (youpl)',
 'ipron (Impersonal Pronouns)',
 'prep (Prepositions)',
 'auxverb (Auxiliary Verbs)',
 'adverb (Adverbs)',
 'conj (Conjunctions)',
 'negate (Negations)',
 'quanunit (quanunit)',
 'prepend (prepend)',
 'specart (specart)',
 'particle (particle)',
 'modal_pa (modal_pa)',
 'general_pa (general_pa)',
 'compare (Comparisons)',
 'interrog (Interrogatives)',
 'number (Numbers)',
 'quant (Quantifiers)',
 'tensem (tensem)',
 'focuspast (Past Focus)',
 'focuspresent (Present Focus)',
 'focusfuture (Future Focus)',
 'progm (progm)',
 'affect (Affect)',
 'posemo (Positive Emotions)',
 'negemo (Negative Emotions)',
 'anx (Anx)',
 'anger (Anger)',
 'sad (Sad)',
 'social (Social)',
 'family (Family)',
 'friend (Friends)',
 'female (Female)',
 'male (Male)',
 'cogproc (Cognitive Processes)',
 'insight (Insight)',
 'cause (Causal)',
 

# load data

In [17]:
df = pd.read_excel('../data/sentiment_analysis_latest.xlsx')

In [18]:
df['sentiment_label'].value_counts()

neutral     3267
negative    1943
positive     959
Name: sentiment_label, dtype: int64

In [19]:
replace_tag = {'positive':2, 'neutral':1, 'negative':0}
df['sentiment_label']= df['sentiment_label'].replace(replace_tag)

In [20]:
df['sentiment_label'].value_counts()

1    3267
0    1943
2     959
Name: sentiment_label, dtype: int64

# tokenization

In [23]:
# load self-defined tokenization dictionary
jieba.set_dictionary('../data/tokenization_dict/dict.txt')
jieba.load_userdict('../data/tokenization_dict/hk_dict.txt')

Building prefix dict from C:\Users\Harrison\Downloads\2023_07_19_supplemet_material\2023_07_19_supplemet_material\data\tokenization_dict\dict.txt ...
Loading model from cache C:\Users\Harrison\AppData\Local\Temp\jieba.u92fdb9fc4f964cbd4d35e3368f83bac9.cache
Loading model cost 0.869 seconds.
Prefix dict has been built successfully.


In [26]:
# keep Chinese char, English words and numbers, and then remove stopwords
speech_list = list(df['msg_replace'])
rule = re.compile(r"[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u4e00-\u9fa5]")
speech_list = [list(jieba.cut(rule.sub('', speech))) for speech in speech_list]

In [27]:
df['msg_token'] = speech_list

In [None]:
gettysburg_tokens

In [None]:
Counter(category for token in gettysburg_tokens for category in parse(token))

In [34]:
#LIWC Features Extraction
liwc =[] 
for item in df['msg_token']:
    gettysburg_counts = list(Counter(category for token in item for category in parse(token) if category == 'posemo').items())
    liwc.append(gettysburg_counts)
liwc_ = np.array(liwc)
liwc_

TypeError: 'dict' object is not callable

In [49]:
from collections import Counter
gettysburg_counts = Counter(category for token in gettysburg_tokens for category in parse(token))
print(gettysburg_counts)

Counter()


In [None]:
'posemo (Positive Emotions)',
 'negemo (Negative Emotions)',
'anx (Anx)',
 'anger (Anger)',
 'sad (Sad)',

In [42]:
df['msg_token_str'] = [' '.join(i) for i in df['msg_token']]

In [43]:
df['msg_token_str'][1]

'name 你好 呀 今日 有 咩 想 同 我哋 傾 呀'

In [36]:
def tokenize(text):
    # you may want to use a smarter tokenizer
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)

In [38]:
gettysburg = '''Four score and seven years ago our fathers brought forth on
  this continent a new nation, conceived in liberty, and dedicated to the
  proposition that all men are created equal. Now we are engaged in a great
  civil war, testing whether that nation, or any nation so conceived and so
  dedicated, can long endure. We are met on a great battlefield of that war.
  We have come to dedicate a portion of that field, as a final resting place
  for those who here gave their lives that that nation might live. It is
  altogether fitting and proper that we should do this.'''.lower()
gettysburg_tokens = tokenize(gettysburg)
print(*gettysburg_tokens)

four score and seven years ago our fathers brought forth on this continent a new nation conceived in liberty and dedicated to the proposition that all men are created equal now we are engaged in a great civil war testing whether that nation or any nation so conceived and so dedicated can long endure we are met on a great battlefield of that war we have come to dedicate a portion of that field as a final resting place for those who here gave their lives that that nation might live it is altogether fitting and proper that we should do this


In [39]:
gettysburg_tokens

<generator object tokenize at 0x00000211B7D0C900>

In [48]:
parse('我的')

TypeError: 'dict' object is not callable