# Get Data

In [1]:
import pandas as pd
import re, datetime as dt

In [2]:
# Shared column layout: both language-specific frames are trimmed to exactly
# these columns (in this order) via `.loc[:, COLS]` before they are combined.
COLS = ["post_id", "text", "timestamp", "lang"]

In [3]:
## English
# Load the raw tweets and reshape them to the shared COLS layout.
df_en = (
    pd.read_csv("data/raw/english_twitter.csv",
                usecols=["tweet_id", "body", "post_date"],
                dtype={"tweet_id": str})          # read IDs as text, not numbers
      .rename(columns={"tweet_id": "post_id",
                       "body": "text"})
      .assign(timestamp=lambda d:
              # post_date is interpreted as epoch seconds and made UTC-aware
              pd.to_datetime(d["post_date"], unit="s", utc=True),
              lang="en")
      # sort first so that keep="first" retains the earliest copy of a text
      .sort_values("timestamp")
      .drop_duplicates(subset="text", keep="first")
      .loc[:, COLS]
      .reset_index(drop=True)
)

# Filter for tweets that mention the S&P 500 (duplicates were already removed
# in the chain above). Rows with missing text are treated as non-matching.
sp500_mask = df_en["text"].str.contains(
    r'(?i)\b(?:s\s*&?\s*p\s*500|sp\s*500)\b', regex=True, na=False
)
# .copy() detaches the filtered rows from the full frame so that the column
# assignments made in later cells do not raise SettingWithCopyWarning.
# The index is intentionally left as-is (row labels of the unfiltered frame).
df_en = df_en.loc[sp500_mask].copy()

In [4]:
# Quick sanity check: row count followed by the plain-text repr of the frame.
n_rows_en = len(df_en)
print(f"Rows: {n_rows_en}")
print(df_en)

Rows: 10458
                     post_id  \
169       550559515515842561   
191       550608100504260610   
543       550806376092807168   
1331      551058514500521985   
3318      552111348130131969   
...                      ...   
3325409  1212055545554898946   
3325544  1212067050409070595   
3325624  1212077265997238274   
3325888  1212108424575700993   
3325998  1212117380236795910   

                                                      text  \
169      Les leaders de 2014:- Dow Jones: $INTC +41%; $...   
191      Weekly S&P500 #Stocks Performance $NBR $GM $QE...   
543      perfectly trading the S&P 500 in 2014 $FB $MU ...   
1331     S&P500 #Stocks Performance $VRTX $LH $DGX $NFL...   
3318     Technology EPS Growth Will Beat S&P 500, Says ...   
...                                                    ...   
3325409       $TSLA to be included in #SP500 $SPY in 2020?   
3325544  These 2 stocks dominated S&P 500 returns in 20...   
3325624  These 2 stocks dominated S&P 500 ret

In [5]:
## Chinese
# Load the raw posts and reshape them to the shared COLS layout.
# Posts timestamped after `cutoff` are discarded.
cutoff = pd.Timestamp("2019-12-31 23:59:59", tz="UTC")
df_cn = (
    pd.read_csv("data/raw/chinese_guba.csv",
                dtype={"id": str},
                on_bad_lines="skip")      # malformed lines are silently dropped
      .rename(columns={"Contents": "text",
                       "Publish Time": "_ts"})
      # build the required columns
      .assign(
          # synthetic ID derived from the row position in the raw file
          post_id=lambda _df: "cn_" + _df.index.astype(str),
          # NOTE(review): utc=True labels timezone-naive strings as UTC; if
          # "Publish Time" is exported in local time the values are shifted —
          # confirm against the data source.
          timestamp=lambda _df: pd.to_datetime(_df["_ts"],
                                               utc=True,
                                               errors="coerce"),
          # NOTE(review): a later cell overwrites this value with "cn".
          lang="zh")
      # unparseable timestamps were coerced to NaT above; remove them
      .dropna(subset=["timestamp"])
      # drop rows after the cutoff date
      .query("timestamp <= @cutoff")
      # De-duplicate on the post text, mirroring the English pipeline.
      # (De-duplicating on post_id had no effect because post_id is derived
      # from the unique row index.) File order decides which copy is kept.
      .drop_duplicates(subset="text", keep="first")
      .loc[:, COLS]                 # keep only the desired columns
)

In [6]:
# Quick sanity check: row count followed by the plain-text repr of the frame.
n_rows_cn = len(df_cn)
print(f"Rows: {n_rows_cn}")
print(df_cn)

Rows: 21928
        post_id                                               text  \
0          cn_0  15.5.5的暴跌终于来啦，人们似乎已然看懂啦这样的跌，却看不懂专家大媒们各类稀奇古怪驴唇不...   
1          cn_1  数据及策略 2015年5月5日沪深300日先行指标为第一波快升后的回落；日技术指标为第一波快...   
2          cn_2  牛市震荡要敢于高抛低吸 作者：荷叶 5月5日，沪深股市暴跌，沪深300跌3.99%，上证指数...   
3          cn_3  5月5日，A股市场遭遇今年以来罕见的大幅回调。 从此三大股指期货表现来看，回调风险已经有所暴...   
4          cn_4  达尔金融周二期指分析：三大期指集体下挫，现货走势弱于期货，中证500期指由于此前贴水较多跌幅...   
...         ...                                                ...   
21923  cn_21923  老唐复盘：说好的休息2合约，在机会年前还是忍不住，本周容许3次试仓：画线开空2次止损3个点，...   
21924  cn_21924              北京房产，抗跌性极好，城市中心化，三四线乡镇化，做为一线的北京，，无可挑剔   
21925  cn_21925  1，4090一线做空，止损4105，止盈40702，4070附近做多，止损 4055，止盈4...   
21926  cn_21926                                        明天继续攻击的概率高。   
21927  cn_21927  2020年沪深300指数一定会击穿3200点，打八折，唯一问题是上半年破还是下半年破，达到2...   

                      timestamp lang  
0     2015-05-05 20:13:21+00:00   zh  
1     2015-05-06 07:15:31+00:00   zh  
2     2015-05-06 08:04:48+00:0

In [7]:
### same chronological block
# Overall time span covered by the two sources.
frames = (df_en, df_cn)
overall_start = min(frame["timestamp"].min() for frame in frames)
overall_end = max(frame["timestamp"].max() for frame in frames)
print(f"Start: {overall_start}, end: {overall_end}")

# NOTE(review): these assignments modify the source frames in place; the
# Chinese frame was created with lang="zh" and becomes "cn" from here on.
df_en["lang"] = "en"
df_cn["lang"] = "cn"

# Stack both sources and order the combined corpus by time.
corpus = (
    pd.concat([df_en, df_cn], ignore_index=True)
      .sort_values("timestamp")
      .reset_index(drop=True)
)
print(f"Length = {len(corpus)}")

Start: 2015-01-01 07:49:52+00:00, end: 2019-12-31 21:04:38+00:00
Length = 32386


# Preprocessing

In [8]:
import pandas as pd
from datetime import time
import re

import nltk
import jieba

In [9]:
## align time
# Map each post to a calendar date using a noon cut-off: posts up to and
# including 12:00:00 stay on their own date, later posts roll to the next day.
def align_date_noon(ts):
    """Return the calendar date a timestamp is assigned to (noon cut-off).

    Parameters
    ----------
    ts : pandas.Timestamp or NaT
        Timezone-aware values are converted to UTC and made naive first, so
        the noon comparison is always done on the UTC clock time.

    Returns
    -------
    datetime.date
        ``ts.date()`` when the time of day is at or before 12:00:00,
        otherwise the following day's date.
        Missing input (``NaT``/``None``/``NaN``) yields ``pd.NaT`` instead of
        raising, so a single bad row cannot abort a column-wide ``apply``.
    """
    # missing timestamps have no time of day to compare; pass them through
    if pd.isna(ts):
        return pd.NaT
    # tz-aware input: express it in UTC and drop the tz info
    if ts.tzinfo is not None:
        ts = ts.tz_convert(None)
    # compare to noon (noon itself still belongs to the same day)
    if ts.time() <= time(12, 0, 0):
        return ts.date()
    return (ts + pd.Timedelta(days=1)).date()

for df in (df_en, df_cn):
    # parse as UTC, then strip the timezone to obtain naive UTC datetimes
    utc_naive = pd.to_datetime(df['timestamp'], utc=True).dt.tz_localize(None)
    # replace the timestamp column with the noon-aligned calendar date
    df['timestamp'] = utc_naive.apply(align_date_noon)

# Tokenize and Add Sentiment Labels

In [10]:
## English: VADER + Loughran-McDonald
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER: fetch the lexicon resource (quietly), then build the analyzer
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# Loughran-McDonald master dictionary
lm_df = pd.read_csv("data/raw/Loughran-McDonald_MasterDictionary_1993-2024.csv")
# lower-case once, then select the words flagged in each sentiment column
lm_words_lower = lm_df['Word'].str.lower()
lm_pos = set(lm_words_lower[lm_df['Positive'] > 0])   # "Positive" > 0
lm_neg = set(lm_words_lower[lm_df['Negative'] > 0])   # "Negative" > 0

In [11]:
# use TweetTokenizer
# preserve_case=False is passed so that tokens can be compared against the
# lower-cased Loughran-McDonald word sets.
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer(preserve_case=False)

In [12]:
# combine sentiment function
def en_sentiment(text):
    """Label an English post as 1 (positive), -1 (negative) or 0 (neutral).

    The score is VADER's compound value plus a Loughran-McDonald balance:
    +1 for every token found in ``lm_pos`` and -1 for every token found in
    ``lm_neg``. Only the sign of the combined score is returned.
    """
    compound = sid.polarity_scores(text)['compound']

    # walk the tweet tokens once, tallying the lexicon hits
    lm_balance = 0
    for token in tt.tokenize(text):
        if token in lm_pos:
            lm_balance += 1
        if token in lm_neg:
            lm_balance -= 1

    combined = compound + lm_balance
    if combined > 0:
        return 1
    if combined < 0:
        return -1
    return 0

# Score every English post; `label` is -1, 0 or 1 as returned by en_sentiment.
df_en['label'] = df_en['text'].apply(en_sentiment)

In [13]:
## Chinese: DLUT emotion lexicon
# (the original heading also names "BostonNLP"; no such resource is loaded here)
# Lines that cannot be parsed are skipped rather than raising.
dlut_df = pd.read_csv(
    "data/raw/dlut_emotions.csv",
    dtype={"id": str},
    on_bad_lines="skip",
)
dlut_df.head()

Unnamed: 0,词语,词性种类,词义数,词义序号,情感分类,强度,极性,辅助情感分类,强度.1,极性.1,Unnamed: 11,.1,.2
0,脏乱,adj,1.0,1.0,NN,7.0,2.0,,,,,,
1,糟报,adj,1.0,1.0,NN,5.0,2.0,,,,,,
2,早衰,adj,1.0,1.0,NE,5.0,2.0,,,,,,
3,责备,verb,1.0,1.0,NN,5.0,2.0,,,,,,
4,贼眼,noun,1.0,1.0,NN,5.0,2.0,,,,,,


In [14]:
# Columns are picked by position: 0 = word, 4 = sentiment category, 5 = intensity
LEX_COLUMN_POSITIONS = [0, 4, 5]
lex_selected = dlut_df.iloc[:, LEX_COLUMN_POSITIONS]
# collapse rows that are identical across the three selected columns
lex_df = lex_selected.drop_duplicates()
lex_df.columns = ['word', 'emotion', 'intensity']

# P... = positive emotion
# N... = negative emotion
def signed_intensity(row):
    """Return the lexicon intensity of ``row`` signed by its emotion category.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with keys ``'emotion'`` and
        ``'intensity'``.

    Returns
    -------
    float
        ``-intensity`` when the emotion code starts with "N" (case-insensitive,
        surrounding whitespace ignored), ``+intensity`` otherwise.
        ``0.0`` when the intensity is missing, so that the entry is neutral
        instead of injecting NaN into every text score that sums it.

    Notes
    -----
    A missing emotion is stringified to ``'nan'`` and therefore counts as
    negative; this pre-existing behaviour is kept unchanged.
    NOTE(review): pandas parses the literal string "NA" as missing by default,
    so a category spelled "NA" would reach this function as NaN — confirm
    against the lexicon before changing the rule above.
    """
    raw_intensity = row['intensity']
    # a missing intensity carries no sentiment weight
    if pd.isna(raw_intensity):
        return 0.0

    emo = str(row['emotion']).strip()
    inten = float(raw_intensity)

    # if emotion starts with "N", treat intensity as negative
    if emo.upper().startswith('N'):
        return -inten
    return inten
# Apply signed_intensity row by row (axis=1) and store the result as a new
# column; the head() call just previews the outcome.
lex_df['signed_intensity'] = lex_df.apply(signed_intensity, axis=1)
lex_df.head()

Unnamed: 0,word,emotion,intensity,signed_intensity
0,脏乱,NN,7.0,-7.0
1,糟报,NN,5.0,-5.0
2,早衰,NE,5.0,-5.0
3,责备,NN,5.0,-5.0
4,贼眼,NN,5.0,-5.0


In [15]:
# dictionary: word -> signed intensity
# (if a word occurs on several rows, the last row's value is the one kept)
lex = {
    word: score
    for word, score in zip(lex_df['word'], lex_df['signed_intensity'])
}

# sentiment function
def cn_sentiment(text):
    """Label a Chinese post as 1 (positive), -1 (negative) or 0 (neutral).

    The text is coerced to ``str``, segmented with jieba, and the signed
    lexicon intensities of the resulting tokens are summed (tokens absent
    from ``lex`` count as 0). Only the sign of the sum is returned.
    """
    score = sum(lex.get(token, 0) for token in jieba.cut(str(text)))
    if score > 0:
        return 1
    if score < 0:
        return -1
    return 0

# Score every Chinese post; `label` is -1, 0 or 1 as returned by cn_sentiment.
df_cn['label'] = df_cn['text'].apply(cn_sentiment)

Building prefix dict from C:\Users\bb\anaconda3\Lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\bb\AppData\Local\Temp\jieba.cache
Loading model cost 0.32999444007873535 seconds.
Prefix dict has been built succesfully.


In [16]:
# Persist the labelled English posts; index=False leaves the row index out of
# the file. The bare `df_en` on the last line displays the frame.
df_en.to_csv("data/processed/df_en_processed.csv", index=False)
df_en

Unnamed: 0,post_id,text,timestamp,lang,label
169,550559515515842561,Les leaders de 2014:- Dow Jones: $INTC +41%; $...,2015-01-01,en,0
191,550608100504260610,Weekly S&P500 #Stocks Performance $NBR $GM $QE...,2015-01-01,en,0
543,550806376092807168,perfectly trading the S&P 500 in 2014 $FB $MU ...,2015-01-02,en,1
1331,551058514500521985,S&P500 #Stocks Performance $VRTX $LH $DGX $NFL...,2015-01-03,en,0
3318,552111348130131969,"Technology EPS Growth Will Beat S&P 500, Says ...",2015-01-06,en,1
...,...,...,...,...,...
3325409,1212055545554898946,$TSLA to be included in #SP500 $SPY in 2020?,2020-01-01,en,0
3325544,1212067050409070595,These 2 stocks dominated S&P 500 returns in 20...,2020-01-01,en,0
3325624,1212077265997238274,These 2 stocks dominated S&P 500 returns in 20...,2020-01-01,en,0
3325888,1212108424575700993,$ABMD $ALGN $AMZN NEW ARTICLE : 10 Best Perfor...,2020-01-01,en,1


In [18]:
# Remove posts with repeated text (drop_duplicates keeps the first occurrence
# by default), then persist the labelled Chinese posts without the row index.
# NOTE(review): this de-duplication runs after `corpus` was assembled in an
# earlier cell, so `corpus` is not affected by it.
df_cn = df_cn.drop_duplicates(subset="text")
df_cn.to_csv("data/processed/df_cn_processed.csv", index=False)
df_cn

Unnamed: 0,post_id,text,timestamp,lang,label
0,cn_0,15.5.5的暴跌终于来啦，人们似乎已然看懂啦这样的跌，却看不懂专家大媒们各类稀奇古怪驴唇不...,2015-05-06,cn,1
1,cn_1,数据及策略 2015年5月5日沪深300日先行指标为第一波快升后的回落；日技术指标为第一波快...,2015-05-06,cn,1
2,cn_2,牛市震荡要敢于高抛低吸 作者：荷叶 5月5日，沪深股市暴跌，沪深300跌3.99%，上证指数...,2015-05-06,cn,1
3,cn_3,5月5日，A股市场遭遇今年以来罕见的大幅回调。 从此三大股指期货表现来看，回调风险已经有所暴...,2015-05-06,cn,1
4,cn_4,达尔金融周二期指分析：三大期指集体下挫，现货走势弱于期货，中证500期指由于此前贴水较多跌幅...,2015-05-06,cn,1
...,...,...,...,...,...
21923,cn_21923,老唐复盘：说好的休息2合约，在机会年前还是忍不住，本周容许3次试仓：画线开空2次止损3个点，...,2019-12-31,cn,0
21924,cn_21924,北京房产，抗跌性极好，城市中心化，三四线乡镇化，做为一线的北京，，无可挑剔,2019-12-31,cn,1
21925,cn_21925,1，4090一线做空，止损4105，止盈40702，4070附近做多，止损 4055，止盈4...,2020-01-01,cn,0
21926,cn_21926,明天继续攻击的概率高。,2020-01-01,cn,-1
