In [1]:
import os
import pickle 
import pandas as pd
import re
import matplotlib.pyplot as plt 
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

# NTUSD

為金融社交媒體數據應用的市場情緒詞典，構建的詞典 NTUSD-Fin 中有 8,331 個單詞 + 115 個表情符號。

- token：單詞、標籤或表情符號
- bull_freq：看漲集中的頻率
- bear_freq：看跌集合中的頻率 
- bull_cfidf：看漲集合中的收集頻率
- bear_cfidf：看跌集合中的收集頻率
- chi_squared：token的卡方檢驗結果
- market_sentiment：由看漲 PMI 減去看跌 PMI 計算得出
- word_vec：300維詞向量

In [3]:
name = "C:/Users/user/Desktop/Chinese_derivative_1229/NTUSD_Fin_sentiment.json"
df_score = pd.read_json(name, encoding="utf-8")
# df_score = df[['token', 'market_sentiment']]
df_score

Unnamed: 0,token,market_sentiment
0,筆記,1.081211
1,機械手,1.221692
2,吸,-1.507409
3,車庫,1.069689
4,艾滋病,0.922132
...,...,...
8441,☕,1.141662
8442,😀,-0.644559
8443,👍,1.100400
8444,👆,1.059200


In [4]:
df_score['market_sentiment'].describe()

count    8446.000000
mean        0.570526
std         0.950257
min        -4.418490
25%         0.325545
50%         1.010188
75%         1.128583
max         1.221692
Name: market_sentiment, dtype: float64

# MinMaxScaler

In [14]:
# Step2: minMax scale
scaler = MinMaxScaler()
cols_to_norm = ['market_sentiment']
df_score[cols_to_norm] = scaler.fit_transform(df_score[cols_to_norm].values.reshape(-1,1))

# Step3: Linear transform
df_score[cols_to_norm] = df_score[cols_to_norm].apply(lambda x: x*2-1)

In [15]:
df_score.describe()

Unnamed: 0,market_sentiment
count,8446.0
mean,0.769097
std,0.33696
min,-1.0
25%,0.682227
50%,0.925001
75%,0.966984
max,1.0


In [16]:
# 檢查有多少自在+-2個標準差內
mean = df_score['market_sentiment'].mean()
std = df_score['market_sentiment'].std()
count = 0
sz = len(df_score['market_sentiment'])
for i in range(sz):
    if -1 <= df_score['market_sentiment'][i] <= 0:
        count +=1
print(f"-1~0: {count}/{sz}")

count = 0
sz = len(df_score['market_sentiment'])
for i in range(sz):
    if 0 <= df_score['market_sentiment'][i] <= 1:
        count +=1
print(f"0~1: {count}/{sz}")

-1~0: 404/8446
0~1: 8042/8446


# Normalization

In [4]:
# # Scaled
# df_score['Normalized_Sentiment_Score'] = (df_score['market_sentiment'] - df_score['market_sentiment'].mean())/df_score['market_sentiment'].std()
# df_score

Unnamed: 0,token,market_sentiment,Normalized_Sentiment_Score
0,筆記,1.081211,0.537418
1,機械手,1.221692,0.685253
2,吸,-1.507409,-2.186708
3,車庫,1.069689,0.525293
4,艾滋病,0.922132,0.370012
...,...,...,...
8441,☕,1.141662,0.601034
8442,😀,-0.644559,-1.278691
8443,👍,1.100400,0.557612
8444,👆,1.059200,0.514255


In [5]:
# df_score.describe()

Unnamed: 0,market_sentiment,Normalized_Sentiment_Score
count,8446.0,8446.0
mean,0.570526,-3.372838e-15
std,0.950257,1.0
min,-4.41849,-5.250174
25%,0.325545,-0.2578051
50%,1.010188,0.4626771
75%,1.128583,0.5872694
max,1.221692,0.6852528


In [6]:
# df_score['Normalized_Sentiment_Score'].quantile(0.99)

0.6852527853275587

In [7]:
# # 檢查有多少自在+-2個標準差內
# mean = df_score['Normalized_Sentiment_Score'].mean()
# std = df_score['Normalized_Sentiment_Score'].std()
# count = 0
# sz = len(df_score['Normalized_Sentiment_Score'])
# for i in range(sz):
#     if mean - std *2 <= df_score['Normalized_Sentiment_Score'][i] <= mean:
#         count +=1
# print(f"-2~0: {count}/{sz}")

# count = 0
# sz = len(df_score['Normalized_Sentiment_Score'])
# for i in range(sz):
#     if mean <= df_score['Normalized_Sentiment_Score'][i] <= mean + std*2:
#         count +=1
# print(f"0~2: {count}/{sz}")

-2~0: 1862/8446
0~2: 6079/8446


還是biased

In [8]:
# df_score = df_score.drop(columns=['market_sentiment'])

# Add the Opinion Finder words

In [9]:
str_pos_point = df_score['Normalized_Sentiment_Score'].quantile(0.78, interpolation="nearest")
weak_pos_point = df_score['Normalized_Sentiment_Score'].quantile(0.52, interpolation="nearest")
str_neg_point = df_score['Normalized_Sentiment_Score'].quantile(0.2, interpolation="nearest")
weak_neg_point = df_score['Normalized_Sentiment_Score'].quantile(0.25, interpolation="nearest")

In [17]:
# Read translated words
with open('C:/Users/user/Desktop/Chinese_derivative_1229/strong_positive.txt') as f:
    strong_positive_trans = f.readlines()
with open('C:/Users/user/Desktop/Chinese_derivative_1229/strong_negative.txt') as f:
    strong_negative_trans = f.readlines()
with open('C:/Users/user/Desktop/Chinese_derivative_1229/words_and_labels/weak_positive.txt') as f:
    weak_positive_trans = f.readlines()
with open('C:/Users/user/Desktop/Chinese_derivative_1229/words_and_labels/weak_negative.txt') as f:
    weak_negative_trans = f.readlines()

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/user/Desktop/Chinese_derivative_1229/strong_positive.txt'

In [12]:
def preprocess(text):
    """
    remove numbers, english letters, line break and blank.
    """
    text = re.sub('[\d]','',text) # [0-9]
    text = text.replace("'", '')
    text = text.replace("[", '')
    text = text.replace("]", '')
    text = re.sub('[a-zA-Z]','', text)
    text = text.replace("\n", "")
    text = text.replace(" ", "")
    return text

In [13]:
strong_positive = preprocess(strong_positive_trans[0])
strong_positive = strong_positive.split(',')
strong_positive = list(filter(None, strong_positive))

strong_negative = preprocess(strong_negative_trans[0])
strong_negative = strong_negative.split(',')
strong_negative = list(filter(None, strong_negative))

weak_positive = preprocess(weak_positive_trans[0])
weak_positive = weak_positive.split(',')
weak_positive = list(filter(None, weak_positive))

weak_negative = preprocess(weak_negative_trans[0])
weak_negative = weak_negative.split(',')
weak_negative = list(filter(None, weak_negative))

In [14]:
# Create dataframe
strong_positive_df = pd.DataFrame()
strong_positive_df['token'] = strong_positive
strong_positive_df['Normalized_Sentiment_Score'] = str_pos_point

strong_negative_df = pd.DataFrame()
strong_negative_df['token'] = strong_negative
strong_negative_df['Normalized_Sentiment_Score'] = str_neg_point

weak_positive_df = pd.DataFrame()
weak_positive_df['token'] = weak_positive
weak_positive_df['Normalized_Sentiment_Score'] = weak_pos_point

weak_negative_df = pd.DataFrame()
weak_negative_df['token'] = weak_negative
weak_negative_df['Normalized_Sentiment_Score'] = weak_neg_point

# Concat together

In [15]:
new_df_score = pd.DataFrame()
pdList = [df_score, strong_positive_df, strong_negative_df, weak_positive_df, weak_negative_df] 
new_df_score = pd.concat(pdList)

In [16]:
new_df_score = new_df_score.reset_index(drop=True)
new_df_score

Unnamed: 0,token,Normalized_Sentiment_Score
0,筆記,0.537418
1,機械手,0.685253
2,吸,-2.186708
3,車庫,0.525293
4,艾滋病,0.370012
...,...,...
15235,堅定,-0.257872
15236,飢餓的,-0.257872
15237,盛行,-0.257872
15238,進攻,-0.257872


In [17]:
new_df_score.describe()

Unnamed: 0,Normalized_Sentiment_Score
count,15240.0
mean,-0.035824
std,0.826814
min,-5.250174
25%,-0.585448
50%,0.397877
75%,0.593212
max,0.685253


In [18]:
path = 'C:/Users/USER/Desktop/Prof_Hsieh_Project/about_emotion/Chinese_derivative/words_and_labels'
os.chdir(path)

In [19]:
new_df_score.to_pickle('Sentiment_Score.pkl')