In [None]:
# 트럼프 감성분석, 회귀분석

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import statsmodels.api as sm
from scipy.stats import ttest_ind

path_tweets = r"C:\Users\shinchaewon\Desktop\텍스트마이닝\trump_tweets_2016_2021_final.csv"
path_spx    = r"C:\Users\shinchaewon\Desktop\텍스트마이닝\SPXUSD_filtered.xlsx"

horizons = [1, 10, 20, 30, 45, 60, 120]

start_date = pd.Timestamp("2016-11-08", tz="UTC")
end_date   = pd.Timestamp("2019-09-22 23:59:59", tz="UTC")  # inclusive

# VADER 친화 전처리 (신호 보존)
#   - 유지: 대문자, !!!, ???, 반복부호, 조동사, 강조부사, 리듬, 이모지/구두점 대부분
#   - 제거: URL, @멘션, RT(리트윗 토큰), (옵션) 앞뒤 공백만 정리
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()

url_pat = re.compile(r"(https?://\S+|www\.\S+)", flags=re.IGNORECASE)
mention_pat = re.compile(r"@\w+")
# RT를 문장 내 의미단어로 보지 않게 제거 (대문자/리듬 유지 위해 과격한 정규화 X)
rt_pat = re.compile(r"(^|\s)RT(\s|:)", flags=re.IGNORECASE)
# 공백만 정리 (구두점/대문자/반복부호는 그대로 둠)
multi_space_pat = re.compile(r"\s+")

def clean_for_vader(text):
    if pd.isna(text):
        return ""
    t = str(text)

    # URL 제거 (VADER에 의미 없음)
    t = url_pat.sub(" ", t)

    # 멘션 제거 (사용자 핸들은 의미 거의 없음)
    t = mention_pat.sub(" ", t)

    # RT 토큰만 제거 (리듬/구두점 최대 보존)
    t = rt_pat.sub(" ", t)

    # 해시태그는 #만 빼고 단어는 살림 (감성 단어일 수 있음)
    # 예: "#GREAT" -> "GREAT"
    t = t.replace("#", "")

    # 공백만 정리 (문장 부호/대문자/반복부호 유지)
    t = multi_space_pat.sub(" ", t).strip()

    return t

def get_sentiment(text):
    if pd.isna(text) or str(text).strip() == "":
        return 0.0
    return sid.polarity_scores(str(text))["compound"]


# 1) 트윗 로드 + 기간 필터 + VADER 감성
df = pd.read_csv(path_tweets, encoding="utf-8")

df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], errors="coerce")
df = df.dropna(subset=["datetime_utc"]).copy()

# 항상 UTC tz-aware로 통일
df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], utc=True)

# 논문 기간 필터 (UTC 기준)
df = df[(df["datetime_utc"] >= start_date) & (df["datetime_utc"] <= end_date)].copy()

# VADER 친화 전처리 + 감성
df["text_clean"] = df["text"].astype(str).apply(clean_for_vader)
df["sentiment"] = df["text_clean"].apply(get_sentiment)

# UTC → NY (tz-aware)
df["dt_ny"] = df["datetime_utc"].dt.tz_convert("America/New_York")

# merge용 naive
df["dt_ny_naive"] = df["dt_ny"].dt.tz_localize(None)

# 2) SPX 분봉 로드 + 기간 필터 + 정렬
spx = pd.read_excel(path_spx)

spx["datetime"] = pd.to_datetime(spx["datetime"], errors="coerce")
spx = spx.dropna(subset=["datetime"]).copy()
spx = spx.rename(columns={"datetime": "dt_ny_naive"})

start_ny_naive = start_date.tz_convert("America/New_York").tz_localize(None)
end_ny_naive   = end_date.tz_convert("America/New_York").tz_localize(None)

spx = spx[(spx["dt_ny_naive"] >= start_ny_naive) & (spx["dt_ny_naive"] <= end_ny_naive)].copy()

df = df.sort_values("dt_ny_naive")
spx = spx.sort_values("dt_ny_naive")


# 3) 트윗에 현재 close 붙이기 (backward)
merged = pd.merge_asof(
    df,
    spx[["dt_ny_naive", "close"]],
    on="dt_ny_naive",
    direction="backward"
)

print(merged[["dt_ny_naive", "text_clean", "sentiment", "close"]].head(10))

# 4) horizons별 "h분 뒤" 수익률 만들기
spx_base = spx[["dt_ny_naive", "close"]].copy().sort_values("dt_ny_naive")
spx_base = spx_base.rename(columns={"close": "close_t"})

spx_lookup = spx[["dt_ny_naive", "close"]].copy().sort_values("dt_ny_naive")
spx_lookup = spx_lookup.rename(columns={"close": "close_future"})

for h in horizons:
    tmp = spx_base.copy()
    tmp[f"dt_plus_{h}"] = tmp["dt_ny_naive"] + pd.Timedelta(minutes=h)

    tmp = tmp.merge(
        spx_lookup.rename(columns={"dt_ny_naive": f"dt_plus_{h}"}),
        on=f"dt_plus_{h}",
        how="left"
    )

    tmp = tmp.rename(columns={"close_future": f"close_t_plus_{h}"})
    spx_base[f"close_t_plus_{h}"] = tmp[f"close_t_plus_{h}"]
    spx_base[f"ret_{h}min"] = spx_base[f"close_t_plus_{h}"] / spx_base["close_t"] - 1

# 5) 트윗에 horizons별 수익률 붙이기 (backward)
merged_ret = pd.merge_asof(
    df.sort_values("dt_ny_naive"),
    spx_base.sort_values("dt_ny_naive"),
    on="dt_ny_naive",
    direction="backward"
)

print(merged_ret[["dt_ny_naive", "sentiment"] + [f"ret_{h}min" for h in horizons]].head())

# 6) horizons별 회귀: ret_h ~ sentiment (HC3)
reg_rows = []
for h in horizons:
    ycol = f"ret_{h}min"
    tmp = merged_ret.dropna(subset=[ycol, "sentiment"]).copy()

    X = sm.add_constant(tmp["sentiment"])
    y = tmp[ycol]
    model = sm.OLS(y, X).fit(cov_type="HC3")

    reg_rows.append({
        "horizon_min": h,
        "n_obs": int(model.nobs),
        "beta_sentiment": float(model.params["sentiment"]),
        "t_sentiment": float(model.tvalues["sentiment"]),
        "p_sentiment": float(model.pvalues["sentiment"]),
        "R2": float(model.rsquared),
    })

results_df = pd.DataFrame(reg_rows).sort_values("horizon_min")
print("\n=== horizons별 회귀 요약 (ret_h ~ sentiment) ===")
print(results_df)

# 7) horizons별 t-test: sentiment>0 vs sentiment<0
tt_rows = []
pos = merged_ret[merged_ret["sentiment"] > 0]
neg = merged_ret[merged_ret["sentiment"] < 0]

print("\n긍정 트윗 개수:", len(pos), "부정 트윗 개수:", len(neg))

for h in horizons:
    ycol = f"ret_{h}min"
    pos_h = pos[ycol].dropna()
    neg_h = neg[ycol].dropna()

    if len(pos_h) < 2 or len(neg_h) < 2:
        tt_rows.append({
            "horizon_min": h,
            "pos_n": len(pos_h),
            "neg_n": len(neg_h),
            "t_stat": np.nan,
            "p_value": np.nan,
            "pos_mean": pos_h.mean() if len(pos_h) else np.nan,
            "neg_mean": neg_h.mean() if len(neg_h) else np.nan,
        })
        continue

    t_stat, p_val = ttest_ind(pos_h, neg_h, equal_var=False)
    tt_rows.append({
        "horizon_min": h,
        "pos_n": len(pos_h),
        "neg_n": len(neg_h),
        "t_stat": float(t_stat),
        "p_value": float(p_val),
        "pos_mean": float(pos_h.mean()),
        "neg_mean": float(neg_h.mean()),
    })

tt_df = pd.DataFrame(tt_rows).sort_values("horizon_min")
print("\n=== horizons별 t-test (positive vs negative sentiment) ===")
print(tt_df)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shinchaewon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


          dt_ny_naive                                         text_clean  \
0 2016-11-08 01:42:00  Today we are going to win the great state of M...   
1 2016-11-08 06:43:00                 TODAY WE MAKE AMERICA GREAT AGAIN!   
2 2016-11-08 11:39:00  VOTE TODAY! Go to to find your polling locatio...   
3 2016-11-08 13:03:00  We need your vote. Go to the POLLS! Let's cont...   
4 2016-11-08 13:23:00                                        ElectionDay   
5 2016-11-08 16:18:00  I will be watching the election results from T...   
6 2016-11-08 16:28:00  Just out according to : "Utah officials report...   
7 2016-11-08 16:31:00  Don't let up, keep getting out to vote - this ...   
8 2016-11-08 18:03:00      Still time to VoteTrump! iVoted ElectionNight   
9 2016-11-08 21:48:00  Watching the returns at 9:45pm. ElectionNight ...   

   sentiment    close  
0     0.9478  2127.25  
1     0.6588  2124.75  
2     0.6892  2134.50  
3     0.0000  2141.00  
4     0.0000  2138.50  
5     0.7836  2135.

In [11]:
# ================================
# 2017년
# ================================

horizons = [1, 10, 20, 30, 45, 60, 120]

start_2017 = pd.Timestamp("2017-01-01 00:00:00", tz="UTC")
end_2017   = pd.Timestamp("2017-12-31 23:59:59", tz="UTC")

# ----------------------------
# 1) 트윗: 2017년만 (UTC 기준) 필터
# ----------------------------
df_2017 = df.copy()

df_2017["datetime_utc"] = pd.to_datetime(df_2017["datetime_utc"], utc=True, errors="coerce")
df_2017 = df_2017.dropna(subset=["datetime_utc"]).copy()

df_2017 = df_2017[(df_2017["datetime_utc"] >= start_2017) & (df_2017["datetime_utc"] <= end_2017)].copy()

if "dt_ny_naive" not in df_2017.columns:
    df_2017["dt_ny"] = df_2017["datetime_utc"].dt.tz_convert("America/New_York")
    df_2017["dt_ny_naive"] = df_2017["dt_ny"].dt.tz_localize(None)

# 감성(전처리 + vader)
df_2017["text_clean"] = df_2017["text"].astype(str).apply(clean_for_vader)
df_2017["sentiment"] = df_2017["text_clean"].apply(get_sentiment)

df_2017 = df_2017.sort_values("dt_ny_naive")

# ----------------------------
# 2) SPX: 2017년만 (NY naive 기준) 필터
# ----------------------------
spx_2017 = spx.copy()
spx_2017["dt_ny_naive"] = pd.to_datetime(spx_2017["dt_ny_naive"], errors="coerce")
spx_2017 = spx_2017.dropna(subset=["dt_ny_naive"]).copy()

start_ny = start_2017.tz_convert("America/New_York").tz_localize(None)
end_ny   = end_2017.tz_convert("America/New_York").tz_localize(None)

spx_2017 = spx_2017[(spx_2017["dt_ny_naive"] >= start_ny) & (spx_2017["dt_ny_naive"] <= end_ny)].copy()
spx_2017 = spx_2017.sort_values("dt_ny_naive")

# ----------------------------
# 3)트윗에 현재 close 붙이기
merged_2017 = pd.merge_asof(
    df_2017,
    spx_2017[["dt_ny_naive", "close"]],
    on="dt_ny_naive",
    direction="backward"
)

# ----------------------------
# 4) 2017년 ret_{h}min 만들기
spx_base_2017 = spx_2017[["dt_ny_naive", "close"]].copy().sort_values("dt_ny_naive")
spx_base_2017 = spx_base_2017.rename(columns={"close": "close_t"})

spx_lookup_2017 = spx_2017[["dt_ny_naive", "close"]].copy().sort_values("dt_ny_naive")
spx_lookup_2017 = spx_lookup_2017.rename(columns={"close": "close_future"})

for h in horizons:
    tmp = spx_base_2017[["dt_ny_naive", "close_t"]].copy()
    tmp[f"dt_plus_{h}"] = tmp["dt_ny_naive"] + pd.Timedelta(minutes=h)

    tmp = tmp.merge(
        spx_lookup_2017.rename(columns={"dt_ny_naive": f"dt_plus_{h}"}),
        on=f"dt_plus_{h}",
        how="left"
    )

    spx_base_2017[f"close_t_plus_{h}"] = tmp["close_future"]
    spx_base_2017[f"ret_{h}min"] = spx_base_2017[f"close_t_plus_{h}"] / spx_base_2017["close_t"] - 1

# ----------------------------
# 5) 트윗에 2017 ret 붙이기
merged_ret_2017 = pd.merge_asof(
    df_2017.sort_values("dt_ny_naive"),
    spx_base_2017.sort_values("dt_ny_naive"),
    on="dt_ny_naive",
    direction="backward"
)

# ----------------------------
# 6) 2017 회귀: ret_h ~ sentiment (HC3)
reg_rows = []
for h in horizons:
    ycol = f"ret_{h}min"
    tmp = merged_ret_2017.dropna(subset=[ycol, "sentiment"]).copy()

    X = sm.add_constant(tmp["sentiment"])
    y = tmp[ycol]
    model = sm.OLS(y, X).fit(cov_type="HC3")

    reg_rows.append({
        "horizon_min": h,
        "n_obs": int(model.nobs),
        "beta_sentiment": float(model.params["sentiment"]),
        "t_sentiment": float(model.tvalues["sentiment"]),
        "p_sentiment": float(model.pvalues["sentiment"]),
        "R2": float(model.rsquared),
    })

results_2017 = pd.DataFrame(reg_rows).sort_values("horizon_min")
print("\n=== 2017 horizons별 회귀 요약 (ret_h ~ sentiment) ===")
print(results_2017)

# ----------------------------
# 7) 2017 t-test: sentiment>0 vs sentiment<0
tt_rows = []
pos = merged_ret_2017[merged_ret_2017["sentiment"] > 0]
neg = merged_ret_2017[merged_ret_2017["sentiment"] < 0]

print("\n[2017] 긍정 트윗 개수:", len(pos), "부정 트윗 개수:", len(neg))

for h in horizons:
    ycol = f"ret_{h}min"
    pos_h = pos[ycol].dropna()
    neg_h = neg[ycol].dropna()

    if len(pos_h) < 2 or len(neg_h) < 2:
        tt_rows.append({
            "horizon_min": h,
            "pos_n": len(pos_h),
            "neg_n": len(neg_h),
            "t_stat": np.nan,
            "p_value": np.nan,
            "pos_mean": pos_h.mean() if len(pos_h) else np.nan,
            "neg_mean": neg_h.mean() if len(neg_h) else np.nan,
        })
        continue

    t_stat, p_val = ttest_ind(pos_h, neg_h, equal_var=False)
    tt_rows.append({
        "horizon_min": h,
        "pos_n": len(pos_h),
        "neg_n": len(neg_h),
        "t_stat": float(t_stat),
        "p_value": float(p_val),
        "pos_mean": float(pos_h.mean()),
        "neg_mean": float(neg_h.mean()),
    })

tt_2017 = pd.DataFrame(tt_rows).sort_values("horizon_min")
print("\n=== 2017 horizons별 t-test (positive vs negative sentiment) ===")
print(tt_2017)



=== 2017 horizons별 회귀 요약 (ret_h ~ sentiment) ===
   horizon_min  n_obs  beta_sentiment  t_sentiment  p_sentiment        R2
0            1   1486        0.001540     2.004238     0.045045  0.002739
1           10   1458        0.001410     1.833874     0.066673  0.002317
2           20   1405        0.001084     1.402843     0.160664  0.001401
3           30   1406        0.001231     1.620329     0.105162  0.001863
4           45   1373        0.001441     1.864339     0.062274  0.002544
5           60   1458        0.001704     2.332269     0.019687  0.003612
6          120   1185        0.000776     0.955712     0.339218  0.000756

[2017] 긍정 트윗 개수: 1336 부정 트윗 개수: 668

=== 2017 horizons별 t-test (positive vs negative sentiment) ===
   horizon_min  pos_n  neg_n    t_stat   p_value  pos_mean  neg_mean
0            1    855    426  1.770047  0.077072  0.028892  0.027170
1           10    811    439  1.786405  0.074363  0.028637  0.026911
2           20    799    414  1.319130  0.187482  