## Integrating Media with Stock Data 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import media_integration as mi

# Configure Data Sources and Feature Engineer

In [2]:
# --- Input locations -------------------------------------------------------
# CSV of media mention records that the engineer loads.
MENTIONS_CSV = "../media/media_data/output/gkg_company_timeseries.csv"
# Folder holding one stock CSV per ticker (e.g. AMSC_2015_2025.csv).
STOCK_DATA_DIR = "./data/input/company_trades"

# --- Universe --------------------------------------------------------------
# NOTE(review): the mentions load log lists 'NP' where this list has 'BP' —
# confirm the symbols in the mentions file line up with these tickers.
TICKERS = [
    'AMSC', 'BP', 'EVR', 'GOOGL', 'GTXI',
    'HLF', 'MDRX', 'ORCL', 'SPPI', 'WFC',
]

# --- Output ----------------------------------------------------------------
# File name the final feature table is written to at the end of the notebook.
OUTPUT_FILE = "media_stock_features.csv"

# Single helper object that drives every step of the feature pipeline below.
engineer = mi.MediaStockFeatureEngineer(
    mentions_csv=MENTIONS_CSV,
    stock_data_dir=STOCK_DATA_DIR,
    tickers=TICKERS,
)

# Start Feature Engineering Pipeline

In [3]:
# Load the two raw inputs through the engineer configured above.
# Media mention records, read from the mentions CSV given to the engineer.
mentions_df = engineer.load_mentions_data()
# Stock data, read from the per-ticker CSV files in the stock data directory.
# NOTE(review): the recorded log shows no load line for GTXI between GOOGL and
# HLF — confirm whether a stock file exists for every entry in TICKERS.
stock_df = engineer.load_stock_data()

2025-11-29 13:06:50,083 - INFO - Loading mentions data from ../media/media_data/output/gkg_company_timeseries.csv
2025-11-29 13:06:50,114 - INFO - Loaded 2794 mention records
2025-11-29 13:06:50,118 - INFO - Date range: 2015-02-19 00:00:00+00:00 to 2025-11-18 00:00:00+00:00
2025-11-29 13:06:50,121 - INFO - Tickers: ['EVR', 'GOOGL', 'SPPI', 'HLF', 'WFC', 'ORCL', 'MDRX', 'NP', 'GTXI', 'AMSC']
2025-11-29 13:06:50,122 - INFO - Loading stock data from data\input\company_trades
2025-11-29 13:06:50,124 - INFO - Loading AMSC from AMSC_2015_2025.csv
2025-11-29 13:06:50,176 - INFO - Loading BP from BP_2015_2025.csv
2025-11-29 13:06:50,200 - INFO - Loading EVR from EVR_2015_2025.csv
2025-11-29 13:06:50,234 - INFO - Loading GOOGL from GOOGL_2015_2025.csv
2025-11-29 13:06:50,257 - INFO - Loading HLF from HLF_2015_2025.csv
2025-11-29 13:06:50,277 - INFO - Loading MDRX from MDRX_2015_2025.csv
2025-11-29 13:06:50,306 - INFO - Loading ORCL from ORCL_2015_2025.csv
2025-11-29 13:06:50,328 - INFO - Loadin

In [4]:
# Join the media mention records onto the stock rows.
df = engineer.merge_data(mentions_df, stock_df)

# Guard against a silent join failure: the merge still returns every stock row
# (media columns filled with 0.0) even when nothing matched, so an empty join
# is otherwise easy to miss and every media feature downstream would be zero.
# NOTE(review): the recorded run logged "0 with mentions"; stock timestamps are
# at 05:00:00 while mention timestamps are at 00:00:00, which presumably
# prevents an exact-date match — confirm inside merge_data.
rows_with_mentions = int((df["ArticleCount"] > 0).sum())
if rows_with_mentions == 0:
    logging.warning(
        "Merged data contains no rows with ArticleCount > 0; "
        "check that mention and stock dates/tickers line up before the merge."
    )

df.head()

2025-11-29 13:06:52,622 - INFO - Merging mentions and stock data...
2025-11-29 13:06:52,649 - INFO - Stock date range: 2015-01-02 05:00:00 to 2025-11-18 05:00:00
2025-11-29 13:06:52,652 - INFO - Mentions date range: 2015-02-19 00:00:00 to 2025-11-18 00:00:00
2025-11-29 13:06:52,659 - INFO - After mentions merge: 21895 records, 0 with mentions
2025-11-29 13:06:52,669 - INFO - Final merged data: 21895 records


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,ArticleCount,Tone,Polarity,WordCount
0,2015-01-02 05:00:00,AMSC,7.5,8.2,7.4,8.2,42430,0.0,0.0,0.0,0.0
1,2015-01-05 05:00:00,AMSC,7.9,8.2,7.6,7.9,54140,0.0,0.0,0.0,0.0
2,2015-01-06 05:00:00,AMSC,7.8,8.0,7.7,8.0,77570,0.0,0.0,0.0,0.0
3,2015-01-07 05:00:00,AMSC,8.0,8.1,7.6,8.0,39980,0.0,0.0,0.0,0.0
4,2015-01-08 05:00:00,AMSC,7.9,8.2,7.7,8.2,34100,0.0,0.0,0.0,0.0


In [5]:
# Add the prediction target. The helper logs it as "Volume_Next_Day" and reports
# missing values for the last day of each ticker.
# NOTE(review): in the preview recorded below, the Volume column holds what was
# the following row's volume before this call — confirm whether the helper
# overwrites Volume instead of adding a separate target column, because the
# later lag/rolling features are built on Volume.
# NOTE(review): this reassigns `df`, so re-running the cell applies the step
# again — confirm that is harmless.
df = engineer.create_target_variable(df)
df.head()


2025-11-29 13:06:59,824 - INFO - Creating target variable (Volume_Next_Day)...
2025-11-29 13:06:59,834 - INFO - Target variable created. 8 missing values (last day per ticker)


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,ArticleCount,Tone,Polarity,WordCount
0,2015-01-02 05:00:00,AMSC,7.5,8.2,7.4,8.2,54140.0,0.0,0.0,0.0,0.0
1,2015-01-05 05:00:00,AMSC,7.9,8.2,7.6,7.9,77570.0,0.0,0.0,0.0,0.0
2,2015-01-06 05:00:00,AMSC,7.8,8.0,7.7,8.0,39980.0,0.0,0.0,0.0,0.0
3,2015-01-07 05:00:00,AMSC,8.0,8.1,7.6,8.0,34100.0,0.0,0.0,0.0,0.0
4,2015-01-08 05:00:00,AMSC,7.9,8.2,7.7,8.2,49500.0,0.0,0.0,0.0,0.0


In [None]:
# Column groups that the lag and rolling feature builders operate on.
media_cols = ['ArticleCount', 'Tone', 'Polarity', 'WordCount']
stock_cols = ['Volume', 'Close']

# Run the remaining feature steps as a single chain, in this order:
#   1. lag features for every media and stock column (lags 1, 7 and 30)
#   2. rolling-window features over the same columns (window of 30)
#   3. temporal features
#   4. order rows by date, then ticker, and renumber the index from zero
df = (
    df
    .pipe(engineer.create_lag_features, media_cols + stock_cols, lags=[1, 7, 30])
    .pipe(engineer.create_rolling_features, media_cols + stock_cols, window=30)
    .pipe(engineer.create_temporal_features)
    .sort_values(['Date', 'Ticker'])
    .reset_index(drop=True)
)


2025-11-29 13:08:17,949 - INFO - Creating lag features for ['ArticleCount', 'Tone', 'Polarity', 'WordCount', 'Volume', 'Close']...
2025-11-29 13:08:17,993 - INFO - Creating 30-day rolling window features...
2025-11-29 13:08:18,149 - INFO - Creating temporal features...


                     Date  Ticker        Open        High         Low  \
0     2015-01-02 05:00:00       0    7.500000    8.200000    7.400000   
1     2015-01-02 05:00:00       1   20.000239   20.083988   19.748994   
2     2015-01-02 05:00:00       2   41.628164   41.935645   40.823984   
3     2015-01-02 05:00:00       3   26.447589   26.606494   26.213205   
4     2015-01-02 05:00:00       4   18.915001   19.090000   18.549999   
...                   ...     ...         ...         ...         ...   
21890 2025-11-18 05:00:00       2  293.190002  301.704987  294.190002   
21891 2025-11-18 05:00:00       3  287.859985  288.799988  278.230011   
21892 2025-11-18 05:00:00       4    8.730000    9.010000    8.660000   
21893 2025-11-18 05:00:00       6  216.205002  223.679993  214.500000   
21894 2025-11-18 05:00:00       7   83.415001   84.760002   82.930000   

            Close      Volume  ArticleCount  Tone  Polarity  ...  \
0        8.200000     54140.0           0.0   0.0      

# Save to CSV for LSTM

In [11]:
# Write the engineered feature table to OUTPUT_FILE (index column omitted) so
# the LSTM stage can read it back.
df.to_csv(OUTPUT_FILE, index=False)

# Preview what was saved. A notebook only renders a bare expression when it is
# the last statement of the cell, so the preview has to come after the write.
df.head()