In [2]:
import pandas as pd

# Download the labelled news pages (page1.csv through page856.csv) from the
# GitHub repository and stack them into a single DataFrame.
base_url = "https://raw.githubusercontent.com/invigoworks-knu/ai-news-sentiment-analysis/main/news_list_labeled_with_date/page{}.csv"
dfs = []
for i in range(1, 857):
    url = base_url.format(i)
    try:
        # Only the download/parse can fail, so keep the try body to this call.
        df = pd.read_csv(url)
    except Exception as e:
        # Best-effort load: report the page that failed and keep going so a
        # single bad page does not abort the whole run.
        print(f"Error loading page{i}.csv: {e}")
    else:
        dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the real cause (e.g. no network access).
if not dfs:
    raise ValueError(f"No pages could be loaded from {base_url}")
combined_df = pd.concat(dfs, ignore_index=True)
# Clean the raw rows: parse dates, coerce labels to numbers, and derive the
# per-article indicator columns used by the daily aggregation.

# Unparseable dates become NaT and are dropped; normalize() sets the time
# component to midnight so rows from the same calendar day share one key.
combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce').dt.normalize()
combined_df = combined_df.dropna(subset=['date']).copy()
if 'label' not in combined_df.columns:
    raise ValueError("label column not found")
# Non-numeric labels become NaN and are dropped.
combined_df['label'] = pd.to_numeric(combined_df['label'], errors='coerce')
combined_df = combined_df.dropna(subset=['label']).copy()
# 0/1 flags for the sign of the label; their daily means are the
# positive / negative / neutral ratios.
combined_df['is_pos'] = (combined_df['label'] > 0).astype(int)
combined_df['is_neg'] = (combined_df['label'] < 0).astype(int)
combined_df['is_neu'] = (combined_df['label'] == 0).astype(int)
if 'title' in combined_df.columns:
    titles = combined_df['title']
    # astype(str) turns a missing title into the text "nan" (length 3), which
    # would inflate the average title length; count missing titles as 0.
    combined_df['title_len'] = titles.astype(str).str.len().where(titles.notna(), 0)
else:
    combined_df['title_len'] = 0
# Collapse the per-article rows into one row per calendar day.
# Each entry maps output column -> (source column, aggregation).
daily_agg_spec = {
    'news_count': ('label', 'count'),
    'sum_sentiment': ('label', 'sum'),
    'mean_sentiment': ('label', 'mean'),
    'std_sentiment': ('label', 'std'),
    'pos_ratio': ('is_pos', 'mean'),
    'neg_ratio': ('is_neg', 'mean'),
    'neu_ratio': ('is_neu', 'mean'),
    'avg_title_len': ('title_len', 'mean'),
}
daily = combined_df.groupby('date').agg(**daily_agg_spec)
daily = daily.reset_index()
daily = daily.sort_values('date')
# The sample standard deviation is NaN for a day with a single article;
# report those days as zero spread.
daily['std_sentiment'] = daily['std_sentiment'].fillna(0.0)
# Rolling and lag features. Every series is shifted by one row first, so a
# day's features are built only from earlier rows and never from its own value.
# NOTE(review): windows count rows, not calendar days -- if some dates have no
# news, a "7" window spans more than a week. Confirm this is intended.
prev_mean_sentiment = daily['mean_sentiment'].shift(1)
prev_news_count = daily['news_count'].shift(1)
for win in (3, 7, 14):
    mean_window = prev_mean_sentiment.rolling(window=win, min_periods=1)
    count_window = prev_news_count.rolling(window=win, min_periods=1)
    daily[f'roll_mean_sent_{win}'] = mean_window.mean()
    # std is NaN while the window holds fewer than two values; treat as 0.
    daily[f'roll_std_sent_{win}'] = mean_window.std().fillna(0.0)
    daily[f'roll_sum_count_{win}'] = count_window.sum()
daily['sent_lag_1'] = prev_mean_sentiment.fillna(0.0)
# Monday=0 ... Sunday=6.
daily['weekday'] = pd.to_datetime(daily['date']).dt.weekday
# Any remaining NaN (e.g. the first row, which has no history) becomes 0.
daily = daily.fillna(0.0)

# Keep only rows dated 2020-01-01 or later. The filter runs after the rolling
# features are computed, so the first kept rows still carry history from
# earlier dates.
cutoff_date = pd.to_datetime("2020-01-01")
on_or_after_cutoff = daily['date'] >= cutoff_date
daily = daily.loc[on_or_after_cutoff].reset_index(drop=True)

# Persist the feature table and show a preview.
daily.to_csv("daily_sentiment.csv", index=False)
print(daily.head())


        date  news_count  sum_sentiment  mean_sentiment  std_sentiment  \
0 2020-01-01           2             -2       -1.000000       0.000000   
1 2020-01-02           2              0        0.000000       0.000000   
2 2020-01-03           3             -1       -0.333333       1.154701   
3 2020-01-04           5              5        1.000000       0.000000   
4 2020-01-05           2              2        1.000000       0.000000   

   pos_ratio  neg_ratio  neu_ratio  avg_title_len  roll_mean_sent_3  \
0   0.000000   1.000000        0.0            0.0          0.368280   
1   0.000000   0.000000        1.0            0.0         -0.083333   
2   0.333333   0.666667        0.0            0.0         -0.416667   
3   1.000000   0.000000        0.0            0.0         -0.444444   
4   1.000000   0.000000        0.0            0.0          0.222222   

   roll_std_sent_3  roll_sum_count_3  roll_mean_sent_7  roll_std_sent_7  \
0         0.625108              36.0          0.36828