<a href="https://colab.research.google.com/github/kamaleshpantra/StockPulse/blob/main/Stockpulse_trend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import subprocess
import sys

def install_packages():
    packages = [
        "pyflink",
        "praw==7.8.1",
        "yfinance==0.2.54",
        "streamlit==1.43.2",
        "pyngrok==7.2.3",
        "nltk==3.9.1",
        "pandas==2.2.2",
        "matplotlib==3.10.0",
        "plotly==5.24.1",
        "scikit-learn==1.6.1",
        "tensorflow==2.18.0"
    ]
    for pkg in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
            print(f"Installed {pkg}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to install {pkg}: {e}")
            raise

install_packages()
print("All dependencies installed successfully!")

Installed pyflink
Installed praw==7.8.1
Installed yfinance==0.2.54
Installed streamlit==1.43.2
Installed pyngrok==7.2.3
Installed nltk==3.9.1
Installed pandas==2.2.2
Installed matplotlib==3.10.0
Installed plotly==5.24.1
Installed scikit-learn==1.6.1
Installed tensorflow==2.18.0
All dependencies installed successfully!


In [2]:
import os
import logging
import nltk
import praw
import yfinance as yf
from pyflink.table import EnvironmentSettings, TableEnvironment, StreamTableEnvironment
from pyflink.table.udf import udf
from pyflink.datastream import StreamExecutionEnvironment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import json
from datetime import datetime, timedelta
import time
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import plotly.graph_objects as go
import streamlit as st
from pyngrok import ngrok
from google.colab import userdata

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    filename="/content/stock_predict.log",
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Download NLTK data
nltk.download('vader_lexicon', quiet=True)

# Define sector
sectors = {"Technology": ["AAPL", "MSFT", "GOOGL", "TSLA"]}

# Initialize Reddit API with Colab Secrets
try:
    client_id = userdata.get("REDDIT_CLIENT_ID")
    client_secret = userdata.get("REDDIT_CLIENT_SECRET")
    user_agent = userdata.get("REDDIT_USER_AGENT") or "StockSentiment/1.0"
    if not all([client_id, client_secret, user_agent]):
        raise ValueError("One or more Reddit credentials are missing")
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        read_only=True,
        requestor_kwargs={"timeout": 10}
    )
    limits = reddit.auth.limits
    logger.info("Reddit API initialized. Rate limit remaining: %s", limits.get("remaining", "unknown"))
    print("Reddit API initialized")
except Exception as e:
    logger.error(f"Failed to initialize Reddit API: {e}")
    raise ValueError(
        "Reddit API setup failed. Ensure REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, "
        "and REDDIT_USER_AGENT are set in Colab Secrets."
    )

# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Define date range
end_date = datetime.now().date()
start_date = end_date - timedelta(days=365)

# Helper for trading days
def next_trading_day(date):
    """Map to next trading day (skip weekends)."""
    date = pd.to_datetime(date)
    while date.weekday() >= 5:  # Saturday or Sunday
        date += timedelta(days=1)
    return date.date()

logger.info("Setup complete")
print("Setup complete!")
print(start_date,end_date)

Reddit API initialized
Setup complete!
2024-04-16 2025-04-16


In [3]:
import os
import logging
import json
import time
from datetime import datetime, timedelta

def fetch_large_reddit(sector, subreddits=["wallstreetbets", "stocks", "investing"], limit_per_query=500):
    """
    Fetch Reddit posts for sector companies within date range, ensuring relevance.

    Args:
        sector (str): Sector name (e.g., 'Technology').
        subreddits (list): Subreddits to search.
        limit_per_query (int): Max posts per subreddit.

    Returns:
        str: Path to JSONL file.
    """
    logger.info(f"Fetching Reddit data for {sector}")
    filename = f"/content/reddit_{sector}_large.jsonl"

    if os.path.exists(filename):
        logger.info(f"Using cached Reddit data: {filename}")
        print(f"Using cached Reddit data: {filename}")
        return filename

    companies = sectors[sector]
    all_posts = []
    start_timestamp = datetime.combine(start_date, datetime.min.time()).timestamp()
    end_timestamp = datetime.combine(end_date + timedelta(days=1), datetime.min.time()).timestamp() - 1

    for company in companies:
        for subreddit in subreddits:
            for attempt in range(3):  # Retry up to 3 times
                try:
                    logger.info(f"Attempt {attempt+1}: Fetching {company} from r/{subreddit}")
                    print(f"Fetching {company} from r/{subreddit} (Attempt {attempt+1})...")
                    submissions = reddit.subreddit(subreddit).search(
                        query=company, sort="new", limit=limit_per_query, time_filter="year"
                    )
                    post_count = 0
                    for submission in submissions:
                        if start_timestamp <= submission.created_utc <= end_timestamp:
                            text = submission.title + " " + (submission.selftext or "")
                            if company.lower() in text.lower():
                                post = {
                                    "company": company,
                                    "text": text,
                                    "timestamp": submission.created_utc,
                                    "subreddit": subreddit
                                }
                                all_posts.append(post)
                                post_count += 1
                    logger.info(f"Fetched {post_count} posts for {company} in r/{subreddit}")
                    print(f"Fetched {post_count} posts for {company} in r/{subreddit}")
                    break  # Success, move to next subreddit
                except Exception as e:
                    logger.error(f"Attempt {attempt+1} failed for {company} in {subreddit}: {e}")
                    print(f"Error for {company} in {subreddit}: {e}")
                    if attempt < 2:
                        time.sleep(5)  # Wait before retry
                    continue
                finally:
                    time.sleep(2)  # Rate limit delay

    if not all_posts:
        logger.warning(f"No Reddit posts fetched for {sector}")
        print(f"No Reddit posts fetched for {sector}")
        return filename

    with open(filename, "w") as f:
        for post in all_posts:
            f.write(json.dumps(post) + "\n")

    logger.info(f"Saved {len(all_posts)} posts to {filename}")
    print(f"Saved {len(all_posts)} posts to {filename}")
    return filename

sector = "Technology"
reddit_file = fetch_large_reddit(sector)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching AAPL from r/wallstreetbets (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 112 posts for AAPL in r/wallstreetbets


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching AAPL from r/stocks (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 168 posts for AAPL in r/stocks


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching AAPL from r/investing (Attempt 1)...
Fetched 80 posts for AAPL in r/investing


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching MSFT from r/wallstreetbets (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 130 posts for MSFT in r/wallstreetbets


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching MSFT from r/stocks (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 120 posts for MSFT in r/stocks


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching MSFT from r/investing (Attempt 1)...
Fetched 99 posts for MSFT in r/investing


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching GOOGL from r/wallstreetbets (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 202 posts for GOOGL in r/wallstreetbets


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching GOOGL from r/stocks (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 247 posts for GOOGL in r/stocks


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching GOOGL from r/investing (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 241 posts for GOOGL in r/investing


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching TSLA from r/wallstreetbets (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 146 posts for TSLA in r/wallstreetbets


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching TSLA from r/stocks (Attempt 1)...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetched 205 posts for TSLA in r/stocks


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Fetching TSLA from r/investing (Attempt 1)...
Fetched 74 posts for TSLA in r/investing
Saved 1824 posts to /content/reddit_Technology_large.jsonl


In [4]:
import os
import logging
import yfinance as yf

def fetch_stock_data(sector, cache_dir="/content/data"):
    """Fetch stock prices for sector companies within date range."""
    logger.info(f"Fetching stock data for {sector}")
    os.makedirs(cache_dir, exist_ok=True)
    companies = sectors[sector]

    for company in companies:
        cache_file = f"{cache_dir}/stock_{company}.csv"
        if os.path.exists(cache_file):
            logger.info(f"Using cached stock data for {company}")
            print(f"Using cached stock data for {company}")
            continue
        try:
            print(f"Fetching stock data for {company}...")
            stock = yf.Ticker(company)
            hist = stock.history(start=start_date, end=end_date + timedelta(days=1), interval="1d")
            if hist.empty:
                logger.warning(f"No stock data for {company}")
                print(f"No stock data for {company}")
                continue
            hist = hist.reset_index()[["Date", "Open", "Close", "High", "Low", "Volume"]]
            hist["Date"] = pd.to_datetime(hist["Date"]).dt.date
            hist.to_csv(cache_file, index=False)
            logger.info(f"Saved {len(hist)} days for {company} to {cache_file}")
            print(f"Saved {len(hist)} days for {company}")
        except Exception as e:
            logger.error(f"Error fetching stock data for {company}: {e}")
            print(f"Error for {company}: {e}")
            continue

fetch_stock_data(sector="Technology")

Fetching stock data for AAPL...
Saved 252 days for AAPL
Fetching stock data for MSFT...
Saved 252 days for MSFT
Fetching stock data for GOOGL...
Saved 252 days for GOOGL
Fetching stock data for TSLA...
Saved 252 days for TSLA


In [5]:
from pyflink.table import EnvironmentSettings, TableEnvironment
from pyflink.table.udf import udf
import pandas as pd
import logging
import os
import json

def preprocess_reddit(sector):
    """Process Reddit posts with Flink to compute daily sentiment in batch mode."""
    logger.info(f"Preprocessing Reddit data for {sector}")
    env_settings = EnvironmentSettings.in_batch_mode()
    t_env = TableEnvironment.create(env_settings)

    @udf(result_type="MAP<STRING, STRING>")
    def parse_json(line):
        """Parse JSON string into a map."""
        try:
            data = json.loads(line)
            return {k: str(v) for k, v in data.items()}
        except:
            return {}

    @udf(result_type="FLOAT")
    def get_vader_sentiment(text):
        try:
            return sid.polarity_scores(text or "")["compound"]
        except:
            return 0.0

    @udf(result_type="STRING")
    def to_trading_date(timestamp):
        try:
            date = pd.to_datetime(float(timestamp), unit="s")
            return str(next_trading_day(date))
        except:
            return str(end_date)

    t_env.create_temporary_function("parse_json", parse_json)
    t_env.create_temporary_function("get_vader_sentiment", get_vader_sentiment)
    t_env.create_temporary_function("to_trading_date", to_trading_date)

    reddit_file = f"/content/reddit_{sector}_large.jsonl"
    if not os.path.exists(reddit_file):
        logger.warning(f"No Reddit data for {sector}")
        print(f"No Reddit data for {sector}")
        return

    t_env.execute_sql("""
        CREATE TABLE reddit_source (
            line STRING
        ) WITH (
            'connector' = 'filesystem',
            'path' = 'file://%s',
            'format' = 'raw'
        )
    """ % reddit_file)

    t_env.execute_sql("""
        CREATE TEMPORARY VIEW parsed_reddit AS
        SELECT
            parsed['company'] AS company,
            to_trading_date(parsed['timestamp']) AS trading_date,
            get_vader_sentiment(parsed['text']) AS sentiment_score
        FROM (
            SELECT parse_json(line) AS parsed
            FROM reddit_source
            WHERE line IS NOT NULL
        ) t
        WHERE parsed['company'] IS NOT NULL AND parsed['timestamp'] IS NOT NULL
    """)

    agg_table = t_env.sql_query("""
        SELECT
            company,
            trading_date,
            AVG(sentiment_score) AS avg_sentiment,
            COUNT(*) AS post_count
        FROM parsed_reddit
        GROUP BY company, trading_date
    """)

    sink_file = f"/content/daily_sentiment_{sector}.csv"
    t_env.execute_sql("""
        CREATE TABLE sentiment_sink (
            company STRING,
            trading_date STRING,
            avg_sentiment FLOAT,
            post_count BIGINT
        ) WITH (
            'connector' = 'filesystem',
            'path' = 'file://%s',
            'format' = 'csv'
        )
    """ % sink_file)

    agg_table.execute_insert("sentiment_sink").wait()
    logger.info(f"Processed Reddit posts to {sink_file}")
    print(f"Processed Reddit posts for {sector}")

def preprocess_stock(sector):
    """Process stock data with Flink for trends."""
    logger.info(f"Preprocessing stock data for {sector}")
    env_settings = EnvironmentSettings.in_batch_mode()
    t_env = TableEnvironment.create(env_settings)

    companies = sectors[sector]
    for company in companies:
        cache_file = f"/content/data/stock_{company}.csv"
        if not os.path.exists(cache_file):
            logger.warning(f"Stock data missing for {company}")
            print(f"Stock data missing for {company}")
            continue

        source_table = f"stock_source_{company}"
        t_env.execute_sql("""
            CREATE TABLE %s (
                `Date` STRING,
                `Open` DOUBLE,
                `Close` DOUBLE,
                `High` DOUBLE,
                `Low` DOUBLE,
                `Volume` BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = 'file://%s',
                'format' = 'csv',
                'csv.ignore-parse-errors' = 'true'
            )
        """ % (source_table, cache_file))

        table = t_env.sql_query("""
            SELECT
                `Date` AS `date`,
                `Close` AS `close`,
                CASE
                    WHEN LEAD(`Close`) OVER (ORDER BY `Date`) > `Close` THEN 1
                    ELSE 0
                END AS `trend`
            FROM %s
            WHERE `Date` IS NOT NULL
        """ % source_table)

        sink_file = f"/content/processed_stock_{company}.csv"
        sink_table = f"stock_sink_{company}"
        t_env.execute_sql("""
            CREATE TABLE %s (
                `date` STRING,
                `close` DOUBLE,
                `trend` INT
            ) WITH (
                'connector' = 'filesystem',
                'path' = 'file://%s',
                'format' = 'csv'
            )
        """ % (sink_table, sink_file))

        table.execute_insert(sink_table).wait()
        logger.info(f"Processed stock data to {sink_file}")
        print(f"Processed stock data for {company}")

preprocess_reddit(sector="Technology")
preprocess_stock(sector="Technology")

Processed Reddit posts for Technology
Processed stock data for AAPL
Processed stock data for MSFT
Processed stock data for GOOGL
Processed stock data for TSLA


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import logging
import os
import glob

# Assuming logger, sectors from Segment 2
logger.info("Starting LSTM data preparation")

def prepare_lstm_data(sector, sequence_length=60):
    """Prepare sequences for LSTM from sentiment and stock data."""
    logger.info(f"Preparing LSTM data for {sector}")

    # Find sentiment part file
    sentiment_dir = f"/content/daily_sentiment_{sector}.csv"
    sentiment_files = glob.glob(f"{sentiment_dir}/part-*")
    if not sentiment_files:
        logger.error(f"No part files found in {sentiment_dir}")
        print(f"No part files found in {sentiment_dir}")
        return []

    sentiment_part_file = sentiment_files[0]  # Take first part file
    if not os.path.isfile(sentiment_part_file):
        logger.error(f"Sentiment part file is not a file: {sentiment_part_file}")
        print(f"Sentiment part file is not a file: {sentiment_part_file}")
        return []

    # Read sentiment data with explicit column names
    try:
        sentiment_df = pd.read_csv(
            sentiment_part_file,
            names=['company', 'trading_date', 'avg_sentiment', 'post_count'],
            header=None
        )
        logger.info(f"Loaded sentiment data: {len(sentiment_df)} rows")
        print(f"Loaded sentiment data: {len(sentiment_df)} rows")
        print(f"Sentiment columns: {list(sentiment_df.columns)}")
        print("Sentiment sample:")
        print(sentiment_df.head().to_string())
        unique_companies = sentiment_df['company'].unique()
        print(f"Unique companies: {unique_companies}")
    except Exception as e:
        logger.error(f"Failed to read sentiment part file {sentiment_part_file}: {e}")
        print(f"Failed to read sentiment part file: {e}")
        return []

    # Validate sentiment columns
    required_cols = ['company', 'trading_date', 'avg_sentiment', 'post_count']
    missing_cols = [col for col in required_cols if col not in sentiment_df.columns]
    if missing_cols:
        logger.error(f"Missing required columns in sentiment data: {missing_cols}")
        print(f"Missing required columns in sentiment data: {missing_cols}")
        return []

    companies = sectors[sector]
    sequences = []
    scaler = MinMaxScaler()

    for company in companies:
        # Find stock part file
        stock_dir = f"/content/processed_stock_{company}.csv"
        stock_files = glob.glob(f"{stock_dir}/part-*")
        if not stock_files:
            logger.warning(f"No part files found in {stock_dir}")
            print(f"No part files found for {company}")
            continue

        stock_part_file = stock_files[0]  # Take first part file
        if not os.path.isfile(stock_part_file):
            logger.warning(f"Stock part file is not a file: {stock_part_file}")
            print(f"Stock part file is not a file for {company}")
            continue

        # Read stock data with explicit column names
        try:
            stock_df = pd.read_csv(
                stock_part_file,
                names=['date', 'close', 'trend'],
                header=None
            )
            logger.info(f"Loaded stock data for {company}: {len(stock_df)} rows")
            print(f"Loaded stock data for {company}: {len(stock_df)} rows")
            print(f"Stock columns for {company}: {list(stock_df.columns)}")
            print(f"Stock sample for {company}:")
            print(stock_df.head().to_string())
        except Exception as e:
            logger.error(f"Failed to read stock part file {stock_part_file}: {e}")
            print(f"Failed to read stock part file for {company}: {e}")
            continue

        # Validate stock columns
        required_stock_cols = ['date', 'close', 'trend']
        missing_stock_cols = [col for col in required_stock_cols if col not in stock_df.columns]
        if missing_stock_cols:
            logger.error(f"Missing required columns in stock data for {company}: {missing_stock_cols}")
            print(f"Missing required columns in stock data for {company}: {missing_stock_cols}")
            continue

        # Merge data
        try:
            company_sentiment = sentiment_df[sentiment_df['company'] == company][
                ['trading_date', 'avg_sentiment', 'post_count']
            ]
            merged_df = stock_df.merge(
                company_sentiment,
                left_on='date',
                right_on='trading_date',
                how='left'
            )
            merged_df['avg_sentiment'] = merged_df['avg_sentiment'].fillna(0.0)
            merged_df['post_count'] = merged_df['post_count'].fillna(0)
            merged_df = merged_df.drop(columns=['trading_date'], errors='ignore')
            logger.info(f"Merged data for {company}: {len(merged_df)} rows")
            print(f"Merged data for {company}: {len(merged_df)} rows")
        except Exception as e:
            logger.error(f"Failed to merge data for {company}: {e}")
            print(f"Failed to merge data for {company}: {e}")
            continue

        # Prepare features and target
        try:
            features = merged_df[['close', 'avg_sentiment', 'post_count']].values
            target = merged_df['trend'].values

            if len(features) > 0:
                scaled_features = scaler.fit_transform(features)
            else:
                logger.warning(f"No features for {company}")
                print(f"No features for {company}")
                continue

            X, y = [], []
            for i in range(len(scaled_features) - sequence_length):
                X.append(scaled_features[i:i + sequence_length])
                y.append(target[i + sequence_length])

            if X:
                sequences.append((company, np.array(X), np.array(y)))
                logger.info(f"Created {len(X)} sequences for {company}")
                print(f"Created {len(X)} sequences for {company}")
            else:
                logger.warning(f"No sequences for {company}")
                print(f"No sequences for {company}")
        except Exception as e:
            logger.error(f"Failed to create sequences for {company}: {e}")
            print(f"Failed to create sequences for {company}: {e}")

    total_sequences = sum(len(X) for _, X, _ in sequences)
    logger.info(f"Total sequences prepared: {total_sequences}")
    print(f"Total sequences prepared: {total_sequences}")
    return sequences

# Run for Technology sector
lstm_data = prepare_lstm_data("Technology")

Loaded sentiment data: 713 rows
Sentiment columns: ['company', 'trading_date', 'avg_sentiment', 'post_count']
Sentiment sample:
  company trading_date  avg_sentiment  post_count
0    AAPL   2025-04-14       0.264200           3
1    AAPL   2025-04-07       0.453400           6
2    AAPL   2025-04-04       0.936633           3
3    AAPL   2025-03-03       0.299175           4
4    AAPL   2025-02-27       0.458400           2
Unique companies: ['AAPL' 'MSFT' 'GOOGL' 'TSLA']
Loaded stock data for AAPL: 253 rows
Stock columns for AAPL: ['date', 'close', 'trend']
Stock sample for AAPL:
         date       close  trend
0  2024-04-16  168.583984      0
1  2024-04-17  167.210464      0
2  2024-04-18  166.254959      0
3  2024-04-19  164.224564      1
4  2024-04-22  165.060593      1
Merged data for AAPL: 253 rows
Created 193 sequences for AAPL
Loaded stock data for MSFT: 253 rows
Stock columns for MSFT: ['date', 'close', 'trend']
Stock sample for MSFT:
         date       close  trend
0  2024-

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import numpy as np
import logging
import os

# Assuming logger from Segment 2
logger.info("Starting LSTM training")

def train_lstm_models(lstm_data, epochs=10, batch_size=32):
    """Train LSTM models for each company."""
    models = {}

    for company, X, y in lstm_data:
        logger.info(f"Training LSTM for {company}")
        print(f"Training LSTM for {company}")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        logger.info(f"{company} - Train: {X_train.shape}, Test: {X_test.shape}")
        print(f"{company} - Train: {X_train.shape}, Test: {X_test.shape}")

        # Build LSTM model
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
            Dropout(0.2),
            LSTM(50),
            Dropout(0.2),
            Dense(25, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        # Compile model
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Train model
        history = model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_test, y_test),
            verbose=1
        )

        # Evaluate model
        loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
        logger.info(f"{company} - Test accuracy: {accuracy:.4f}")
        print(f"{company} - Test accuracy: {accuracy:.4f}")

        # Save model
        model_path = f"/content/model_{company}.h5"
        model.save(model_path)
        logger.info(f"Saved model for {company} to {model_path}")
        print(f"Saved model for {company} to {model_path}")

        models[company] = model

    return models

# Train models using Segment 6 output
models = train_lstm_models(lstm_data)

Training LSTM for AAPL
AAPL - Train: (154, 60, 3), Test: (39, 60, 3)


  super().__init__(**kwargs)


Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 442ms/step - accuracy: 0.4033 - loss: 0.7013 - val_accuracy: 0.5897 - val_loss: 0.6875
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - accuracy: 0.5052 - loss: 0.6935 - val_accuracy: 0.5897 - val_loss: 0.6876
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - accuracy: 0.5234 - loss: 0.6942 - val_accuracy: 0.5897 - val_loss: 0.6864
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.5521 - loss: 0.6901 - val_accuracy: 0.5897 - val_loss: 0.6843
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - accuracy: 0.5429 - loss: 0.6888 - val_accuracy: 0.5897 - val_loss: 0.6842
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.5464 - loss: 0.6890 - val_accuracy: 0.5897 - val_loss: 0.6850
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
from datetime import datetime, timedelta

# Assuming logger, sectors from Segment 2
logger.info("Starting trend predictions")

def predict_trends(lstm_data, models, forecast_days=10):
    """Predict stock trends using trained LSTM models."""
    predictions = []

    for company, X, _ in lstm_data:
        logger.info(f"Predicting trends for {company}")
        print(f"Predicting trends for {company}")

        if company not in models:
            logger.warning(f"No model found for {company}")
            print(f"No model found for {company}")
            continue

        model = models[company]
        # Use latest 60-day sequence for prediction
        latest_sequence = X[-1:]  # Shape: [1, 60, 3]

        # Predict for next forecast_days
        company_predictions = []
        current_sequence = latest_sequence.copy()
        last_date = datetime.strptime("2025-04-16", "%Y-%m-%d")  # Latest stock date

        for day in range(forecast_days):
            # Predict probability
            pred_prob = model.predict(current_sequence, verbose=0)[0][0]
            trend = 1 if pred_prob >= 0.5 else 0  # Threshold at 0.5
            pred_date = last_date + timedelta(days=day + 1)

            company_predictions.append({
                'company': company,
                'date': pred_date.strftime("%Y-%m-%d"),
                'trend': trend,
                'probability': float(pred_prob)
            })

            # Update sequence (shift and append dummy data for demo)
            # In practice, use actual new data if available
            new_data = current_sequence[:, -1, :].copy()
            new_data[:, 0] = new_data[:, 0]  # Keep last close (dummy)
            new_data[:, 1] = 0.0  # Neutral sentiment
            new_data[:, 2] = 0  # No posts
            current_sequence = np.append(current_sequence[:, 1:, :], [new_data], axis=1)

        predictions.extend(company_predictions)
        logger.info(f"Predicted {len(company_predictions)} trends for {company}")
        print(f"Predicted {len(company_predictions)} trends for {company}")

    # Save predictions
    predictions_df = pd.DataFrame(predictions)
    output_path = "/content/predictions.csv"
    predictions_df.to_csv(output_path, index=False)
    logger.info(f"Saved predictions to {output_path}")
    print(f"Saved predictions to {output_path}")

    return predictions_df

# Load models
models = {}
for company in ['AAPL', 'MSFT', 'GOOGL', 'TSLA']:
    model_path = f"/content/model_{company}.h5"
    if os.path.exists(model_path):
        models[company] = tf.keras.models.load_model(model_path)
        print(f"Loaded model for {company}")
    else:
        print(f"Model not found for {company}")

# Predict trends using Segment 6's lstm_data
predictions_df = predict_trends(lstm_data, models)

Loaded model for AAPL
Loaded model for MSFT
Loaded model for GOOGL
Loaded model for TSLA
Predicting trends for AAPL
Predicted 10 trends for AAPL
Predicting trends for MSFT
Predicted 10 trends for MSFT
Predicting trends for GOOGL
Predicted 10 trends for GOOGL
Predicting trends for TSLA
Predicted 10 trends for TSLA
Saved predictions to /content/predictions.csv


In [14]:
import pandas as pd
import logging

# Assuming logger from Segment 2
logger.info("Starting alert generation")

def generate_alerts(predictions_file="/content/predictions.csv"):
    """Generate alerts based on prediction probabilities."""
    try:
        predictions_df = pd.read_csv(predictions_file)
        logger.info(f"Loaded predictions: {len(predictions_df)} rows")
        print(f"Loaded predictions: {len(predictions_df)} rows")
    except Exception as e:
        logger.error(f"Failed to read predictions: {e}")
        print(f"Failed to read predictions: {e}")
        return pd.DataFrame()

    alerts = []
    for _, row in predictions_df.iterrows():
        company = row['company']
        date = row['date']
        probability = row['probability']
        trend = row['trend']

        # Alert thresholds
        if probability >= 0.7 and trend == 1:
            alert_type = "Strong Buy"
        elif probability <= 0.3 and trend == 0:
            alert_type = "Strong Sell"
        else:
            continue

        alerts.append({
            'company': company,
            'date': date,
            'alert_type': alert_type,
            'probability': probability
        })

    alerts_df = pd.DataFrame(alerts)
    output_path = "/content/alerts.csv"
    alerts_df.to_csv(output_path, index=False)
    logger.info(f"Generated {len(alerts_df)} alerts, saved to {output_path}")
    print(f"Generated {len(alerts_df)} alerts, saved to {output_path}")

    return alerts_df

# Generate alerts
alerts_df = generate_alerts()

Loaded predictions: 40 rows
Generated 0 alerts, saved to /content/alerts.csv


In [16]:
!pip install streamlit==1.43.2 pyngrok plotly



In [17]:
from pyngrok import ngrok
ngrok.set_auth_token("2tXLBQHCEqsQS0TYDRNHF5w8fHk_2rbtDMMHkdDsLPdss1hUP")



In [18]:
with open("/content/dashboard.py", "w") as f:
    f.write('''<import streamlit as st
import pandas as pd
import plotly.express as px
import logging

# Assuming logger from Segment 2
logger.info("Starting Streamlit dashboard")

# Streamlit app
st.title("Stock Sentiment Dashboard")

# Load data
@st.cache_data
def load_data():
    try:
        predictions = pd.read_csv("/content/predictions.csv")
        alerts = pd.read_csv("/content/alerts.csv")
        return predictions, alerts
    except Exception as e:
        logger.error(f"Failed to load data: {e}")
        st.error(f"Failed to load data: {e}")
        return None, None

predictions, alerts = load_data()

if predictions is not None and alerts is not None:
    # Sidebar for company selection
    companies = predictions['company'].unique()
    selected_company = st.sidebar.selectbox("Select Company", companies)

    # Predictions table
    st.subheader(f"Predictions for {selected_company}")
    company_predictions = predictions[predictions['company'] == selected_company]
    st.dataframe(company_predictions[['date', 'trend', 'probability']])

    # Predictions plot
    fig = px.line(
        company_predictions,
        x='date',
        y='probability',
        title=f"Trend Probability for {selected_company}",
        labels={'probability': 'Trend Probability', 'date': 'Date'}
    )
    st.plotly_chart(fig)

    # Alerts table
    st.subheader(f"Alerts for {selected_company}")
    company_alerts = alerts[alerts['company'] == selected_company]
    if not company_alerts.empty:
        st.dataframe(company_alerts[['date', 'alert_type', 'probability']])
    else:
        st.write("No alerts for this company.")
else:
    st.write("Data loading failed. Check logs.")>''')

In [20]:
from pyngrok import ngrok
import subprocess
import time

# Start Streamlit server
streamlit_proc = subprocess.Popen(["streamlit", "run", "/content/dashboard.py", "--server.port", "8501"])
time.sleep(5)  # Wait for Streamlit to start

# Create ngrok tunnel
public_url = ngrok.connect(8501)
print(f"Streamlit dashboard running at: {public_url}")

Streamlit dashboard running at: NgrokTunnel: "https://4f32-34-55-42-235.ngrok-free.app" -> "http://localhost:8501"
