# Pickrel-Smith_DSC680_Project3_Code

## Main Script Run

### Import Necessary Libraries

In [None]:
import os
import json
import hashlib
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import yfinance as yf
import exchange_calendars as xcals
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

### Data Collection Functions

#### Reddit Data Collection

##### fetch_reddit_data

In [None]:
def fetch_reddit_data(subreddit, start_timestamp, end_timestamp, limit=100, timeout=30):
    """
    Fetch posts from a subreddit within a specific time range using Pushshift API

    Parameters:
    -----------
    subreddit : str
        Name of the subreddit
    start_timestamp : int
        Unix timestamp for start date
    end_timestamp : int
        Unix timestamp for end date
    limit : int
        Maximum number of posts to retrieve per request
    timeout : float
        Seconds to wait for the HTTP request before giving up

    Returns:
    --------
    list
        List of posts as dictionaries. Empty when the request fails, the
        server answers with an error status, or the payload is not a JSON
        object (an error message is printed in those cases).
    """
    base_url = "https://api.pushshift.io/reddit/search/submission"

    params = {
        "subreddit": subreddit,
        "after": start_timestamp,
        "before": end_timestamp,
        "size": limit,
        "sort": "asc",
        "sort_type": "created_utc"
    }

    try:
        # Without a timeout a stalled connection would block the whole
        # collection loop indefinitely.
        response = requests.get(base_url, params=params, timeout=timeout)
        # Turn 4xx/5xx answers (rate limiting, outages) into a reported error
        # instead of silently treating the error body as "no posts".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching Reddit data: {e}")
        return []

    if not isinstance(data, dict):
        print(f"Error fetching Reddit data: unexpected payload type {type(data).__name__}")
        return []

    return data.get("data", [])

##### fetch_reddit_comments

In [None]:
def fetch_reddit_comments(submission_id, limit=100, timeout=30):
    """
    Fetch comments for a specific Reddit submission using Pushshift API

    Parameters:
    -----------
    submission_id : str
        Reddit submission ID, sent to the API as the ``link_id`` filter
    limit : int
        Maximum number of comments to retrieve
    timeout : float
        Seconds to wait for the HTTP request before giving up

    Returns:
    --------
    list
        List of comments as dictionaries. Empty when the request fails, the
        server answers with an error status, or the payload is not a JSON
        object (an error message is printed in those cases).
    """
    base_url = "https://api.pushshift.io/reddit/search/comment"

    params = {
        "link_id": submission_id,
        "size": limit,
        "sort": "asc",
        "sort_type": "created_utc"
    }

    try:
        # A timeout keeps one unresponsive request from hanging the caller,
        # which issues one of these calls per post.
        response = requests.get(base_url, params=params, timeout=timeout)
        # Report HTTP error statuses rather than parsing the error body.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching Reddit comments: {e}")
        return []

    if not isinstance(data, dict):
        print(f"Error fetching Reddit comments: unexpected payload type {type(data).__name__}")
        return []

    return data.get("data", [])

##### save_to_parquet

In [None]:
def save_to_parquet(data, filename):
    """
    Write records to a Parquet file and report how many rows were written

    Parameters:
    -----------
    data : list or pandas.DataFrame
        Records to persist; a list is first wrapped in a DataFrame,
        anything else is written as given
    filename : str
        Destination path of the Parquet file
    """
    frame = pd.DataFrame(data) if isinstance(data, list) else data

    # The index is not stored in the file.
    frame.to_parquet(filename, index=False)
    print(f"Saved {len(frame)} records to {filename}")

##### hash_username

In [None]:
def hash_username(username):
    """
    Replace a username with its SHA-256 hex digest

    Parameters:
    -----------
    username : str
        Username to hash; non-string values are converted with ``str``

    Returns:
    --------
    str
        64-character hexadecimal digest, or None when the username is
        missing (None or NaN)
    """
    if username is None or pd.isna(username):
        return None

    encoded = str(username).encode()
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()

#### Stock Data Collection

##### fetch_stock_data

In [None]:
def fetch_stock_data(ticker, start_date, end_date):
    """
    Fetch historical stock data from Yahoo Finance

    Parameters:
    -----------
    ticker : str
        Stock ticker symbol
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing historical stock data with flat column names
        (including 'Adj Close'); an empty DataFrame if the download raises
    """
    try:
        # Request unadjusted prices explicitly so the separate 'Adj Close'
        # column used by the feature code is always present; recent yfinance
        # releases adjust 'Close' in place and omit 'Adj Close' by default.
        stock_data = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            auto_adjust=False
        )
    except Exception as e:
        print(f"Error fetching stock data: {e}")
        return pd.DataFrame()

    # Recent yfinance releases return (field, ticker) MultiIndex columns even
    # for a single ticker. Flatten them so stock_data['Adj Close'] is a Series.
    # Only done when one ticker is present, otherwise names would collide.
    columns = stock_data.columns
    if (
        isinstance(columns, pd.MultiIndex)
        and columns.nlevels == 2
        and columns.get_level_values(1).nunique() == 1
    ):
        stock_data.columns = columns.get_level_values(0)

    return stock_data

##### get_trading_days

In [None]:
def get_trading_days(start_date, end_date):
    """
    Look up the sessions of the 'NYSE' exchange calendar inside a date window

    Parameters:
    -----------
    start_date : str
        First day of the window in 'YYYY-MM-DD' format
    end_date : str
        Last day of the window in 'YYYY-MM-DD' format

    Returns:
    --------
    pandas.DatetimeIndex
        Sessions reported by the calendar for the window
    """
    calendar = xcals.get_calendar('NYSE')
    window = (pd.Timestamp(start_date), pd.Timestamp(end_date))
    return calendar.sessions_in_range(*window)

### Sentiment Analysis Functions

##### analyze_sentiment

In [None]:
# Shared analyzer instance, created lazily by _get_vader_analyzer().
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def analyze_sentiment(text):
    """
    Analyze sentiment of text using VADER

    Parameters:
    -----------
    text : str
        Text to analyze

    Returns:
    --------
    dict
        Dictionary containing sentiment scores under the keys 'compound',
        'pos', 'neu' and 'neg'; all zeros when the text is missing
        (None or NaN)
    """
    if text is None or pd.isna(text):
        return {
            'compound': 0,
            'pos': 0,
            'neu': 0,
            'neg': 0
        }

    # This function is applied row by row to whole DataFrame columns, so the
    # analyzer is built once and reused instead of being constructed per call.
    return _get_vader_analyzer().polarity_scores(text)

##### process_reddit_data

In [None]:
def process_reddit_data(df):
    """
    Enrich a frame of Reddit posts or comments with timestamps, hashed
    authors and VADER sentiment scores

    Columns are added to ``df`` itself, and only for the inputs that exist:
    'created_at' (from 'created_utc'), 'author_hashed' (from 'author'),
    '<field>_sentiment' / '<field>_compound' for 'title' and for the first
    of 'selftext' / 'body' that is present, and 'compound_sentiment' as the
    row-wise mean of the compound columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        Reddit posts or comments; modified in place

    Returns:
    --------
    pandas.DataFrame
        The same DataFrame object with the added columns
    """
    # Epoch seconds -> datetime
    if 'created_utc' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_utc'], unit='s')

    # Privacy: keep only a hash of the author name
    if 'author' in df.columns:
        df['author_hashed'] = df['author'].apply(hash_username)

    # Decide which text fields get scored: the title (if any) followed by
    # at most one body-like field, preferring 'selftext' over 'body'.
    text_fields = []
    if 'title' in df.columns:
        text_fields.append('title')
    for candidate in ('selftext', 'body'):
        if candidate in df.columns:
            text_fields.append(candidate)
            break

    compound_cols = []
    for field in text_fields:
        scores = df[field].apply(analyze_sentiment)
        compound_name = f'{field}_compound'
        df[f'{field}_sentiment'] = scores
        df[compound_name] = scores.apply(lambda score: score['compound'])
        compound_cols.append(compound_name)

    # Overall score = mean of whichever compound columns were produced
    if compound_cols:
        df['compound_sentiment'] = df[compound_cols].mean(axis=1)

    return df

##### aggregate_daily_sentiment

In [None]:
def aggregate_daily_sentiment(df, date_column='created_at'):
    """
    Aggregate sentiment scores by day

    'compound_sentiment' is summarised with mean, median, std and count.
    'title_compound', 'selftext_compound' and 'body_compound' are summarised
    with mean, median and std, but only when the column exists in ``df``.
    Output columns are named '<column>_<statistic>'.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing sentiment scores; not modified
    date_column : str
        Column containing dates

    Returns:
    --------
    pandas.DataFrame
        DataFrame with daily aggregated sentiment and a 'date' column

    Raises:
    -------
    KeyError
        If 'compound_sentiment' or ``date_column`` is missing from ``df``
    """
    if 'compound_sentiment' not in df.columns:
        raise KeyError("'compound_sentiment' column is required for daily aggregation")

    # Derive the grouping key without writing the converted dates back into
    # the caller's frame. The Series keeps the name of date_column.
    day_key = pd.to_datetime(df[date_column]).dt.date

    # Only request statistics for columns that exist: pandas rejects an
    # aggregation spec that names a column absent from the frame.
    agg_spec = {'compound_sentiment': ['mean', 'median', 'std', 'count']}
    for optional_col in ('title_compound', 'selftext_compound', 'body_compound'):
        if optional_col in df.columns:
            agg_spec[optional_col] = ['mean', 'median', 'std']

    daily_sentiment = df.groupby(day_key).agg(agg_spec)

    # Flatten multi-index columns, e.g. ('compound_sentiment', 'mean')
    # becomes 'compound_sentiment_mean'
    daily_sentiment.columns = ['_'.join(col).strip() for col in daily_sentiment.columns.values]

    # Reset index to make date a column
    daily_sentiment = daily_sentiment.reset_index()
    daily_sentiment.rename(columns={date_column: 'date'}, inplace=True)

    return daily_sentiment

### Feature Engineering Functions

##### prepare_stock_features

In [None]:
def prepare_stock_features(stock_data, flat_threshold=0.002):
    """
    Prepare stock features for modeling

    Added columns:
    - 'daily_return': percentage change of the price column
    - 'ma5': 5-row moving average of the price column
    - 'ma_crossover': 1 if the PREVIOUS row's price was above its 'ma5',
      else 0 (0 for the first row and while 'ma5' is undefined)
    - 'price_direction': 'up' / 'down' / 'flat' label of 'daily_return'
    - 'prev_return', 'prev_direction': previous row's return and label

    'ma_crossover' is lagged by one row on purpose. 'price_direction' on a
    row is derived from that row's close, so a same-row crossover flag would
    leak the target into the features. The lagged form also matches how the
    flag is computed at prediction time (latest close, predicting the
    following day).

    Parameters:
    -----------
    stock_data : pandas.DataFrame
        DataFrame containing stock data, in chronological order, with an
        'Adj Close' column (or 'Close' as a fallback); not modified
    flat_threshold : float
        Absolute daily return at or below which a day is labelled 'flat'
        (default 0.002, i.e. 0.2%)

    Returns:
    --------
    pandas.DataFrame
        Copy of the input with engineered features

    Raises:
    -------
    KeyError
        If neither 'Adj Close' nor 'Close' is present
    """
    if 'Adj Close' in stock_data.columns:
        price_col = 'Adj Close'
    elif 'Close' in stock_data.columns:
        # Fallback for price tables that only carry a 'Close' column.
        price_col = 'Close'
    else:
        raise KeyError("stock_data needs an 'Adj Close' or 'Close' column")

    # Work on a copy so the caller's frame is left untouched.
    stock_data = stock_data.copy()
    close = stock_data[price_col]

    # Calculate daily returns
    stock_data['daily_return'] = close.pct_change()

    # Calculate 5-day moving average
    stock_data['ma5'] = close.rolling(window=5).mean()

    # Moving average crossover flag, shifted so each row only sees the
    # previous row's close (comparison with an undefined 'ma5' is False -> 0).
    above_ma = (close > stock_data['ma5']).astype(int)
    stock_data['ma_crossover'] = above_ma.shift(1, fill_value=0)

    # Create target variable: price direction (up, down, flat)
    def categorize_return(ret):
        if pd.isna(ret):
            return None
        elif ret > flat_threshold:
            return 'up'
        elif ret < -flat_threshold:
            return 'down'
        else:
            return 'flat'

    stock_data['price_direction'] = stock_data['daily_return'].apply(categorize_return)

    # Create lagged features
    stock_data['prev_return'] = stock_data['daily_return'].shift(1)
    stock_data['prev_direction'] = stock_data['price_direction'].shift(1)

    return stock_data

##### merge_sentiment_stock_data

In [None]:
def merge_sentiment_stock_data(sentiment_data, stock_data, trading_days):
    """
    Merge sentiment data with stock data, aligning by trading days

    Every sentiment row is paired with the stock row of the first trading
    day strictly after its calendar date. Sentiment from weekends and
    holidays therefore rolls forward to the next session instead of being
    discarded. Rows dated on or after the last trading day have no next
    session and are dropped.

    Parameters:
    -----------
    sentiment_data : pandas.DataFrame
        DataFrame containing sentiment data with a 'date' column;
        not modified
    stock_data : pandas.DataFrame
        DataFrame containing stock data whose index becomes a 'Date'
        column after ``reset_index``
    trading_days : pandas.DatetimeIndex
        DatetimeIndex containing trading days

    Returns:
    --------
    pandas.DataFrame
        Merged DataFrame, ordered chronologically by 'next_trading_day'
    """
    from bisect import bisect_right

    # Copy first: the input frame is not altered and later column
    # assignments do not hit a filtered view.
    sentiment = sentiment_data.copy()
    sentiment['date'] = pd.to_datetime(sentiment['date'])

    sessions = sorted(set(trading_days.date))

    def next_session(timestamp):
        """Return the first trading date after ``timestamp``, or None."""
        if pd.isna(timestamp):
            return None
        position = bisect_right(sessions, timestamp.date())
        return sessions[position] if position < len(sessions) else None

    # Add a column with the next trading day (NaT when there is none)
    sentiment['next_trading_day'] = pd.to_datetime(
        sentiment['date'].apply(next_session)
    )
    sentiment = sentiment.dropna(subset=['next_trading_day'])

    # Reset index of stock_data to make Date a column
    stock = stock_data.reset_index()

    # Merge sentiment data with stock data
    merged_data = pd.merge(
        sentiment,
        stock,
        left_on='next_trading_day',
        right_on='Date',
        how='inner'
    )

    # Chronological order matters for time-series cross-validation. A stable
    # sort keeps the incoming order of rows that share a trading day.
    merged_data = merged_data.sort_values('next_trading_day', kind='stable')
    return merged_data.reset_index(drop=True)

### Model Training and Evaluation

##### train_evaluate_model

In [None]:
def train_evaluate_model(data, features, target, n_splits=5):
    """
    Train and evaluate a multinomial logistic regression model

    The model is scored with expanding-window time series cross-validation
    and then refitted on all rows. ``data`` must already be in chronological
    order, because TimeSeriesSplit always trains on earlier rows and tests
    on later ones.

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing features and target, sorted chronologically
    features : list
        List of feature column names
    target : str
        Target column name
    n_splits : int
        Number of splits for time series cross-validation

    Returns:
    --------
    tuple
        (model, evaluation metrics) where the metrics dict holds the mean
        'accuracy', 'balanced_accuracy' and 'macro_f1' across the folds
    """
    # Drop rows with missing values
    data = data.dropna(subset=list(features) + [target])

    # Prepare features and target
    X = data[features]
    y = data[target]

    # class_weight='balanced' applies n_samples / (n_classes * class_count),
    # the same formula as the hand-built up/down/flat weights it replaces.
    # It is recomputed on each training set, so it cannot divide by zero for
    # a class that is absent and does not use labels from the test fold.
    # The multi_class argument is omitted: it is deprecated in recent
    # scikit-learn, and lbfgs already fits a multinomial model for targets
    # with more than two classes.
    model = LogisticRegression(
        solver='lbfgs',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    )

    # Perform time series cross-validation
    tscv = TimeSeriesSplit(n_splits=n_splits)

    cv_results = {
        'accuracy': [],
        'balanced_accuracy': [],
        'macro_f1': []
    }

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics. zero_division=0 scores a never-predicted class
        # as 0 instead of emitting a warning.
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )
        cv_results['accuracy'].append(report['accuracy'])
        cv_results['balanced_accuracy'].append(balanced_accuracy_score(y_test, y_pred))
        cv_results['macro_f1'].append(report['macro avg']['f1-score'])

    # Train final model on all data
    model.fit(X, y)

    # Calculate average metrics
    avg_metrics = {name: np.mean(scores) for name, scores in cv_results.items()}

    return model, avg_metrics

##### compare_with_baseline

In [None]:
def compare_with_baseline(data, target):
    """
    Score the naive strategy that predicts 'flat' for every row

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing the target
    target : str
        Target column name; rows with a missing target are ignored

    Returns:
    --------
    dict
        'accuracy', 'balanced_accuracy' and 'macro_f1' of the baseline
    """
    observed = data[target].dropna()

    # One constant prediction per observed label, aligned on the same index
    always_flat = pd.Series(['flat'] * len(observed), index=observed.index)

    summary = classification_report(observed, always_flat, output_dict=True)

    return {
        'accuracy': summary['accuracy'],
        'balanced_accuracy': balanced_accuracy_score(observed, always_flat),
        'macro_f1': summary['macro avg']['f1-score']
    }

### Main Execution Flow

In [None]:
# Define parameters
subreddits = ['AMD', 'StockMarket']
start_date = '2018-01-01'
end_date = '2025-05-01'  # Current date
ticker = 'AMD'
data_dir = 'data'

# Create data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Convert dates to timestamps.
# The dates are pinned to midnight UTC. A naive datetime's .timestamp() is
# interpreted in the machine's local timezone, which would shift the
# collection window depending on where the script runs.
start_timestamp = int(
    datetime.datetime.strptime(start_date, '%Y-%m-%d')
    .replace(tzinfo=datetime.timezone.utc)
    .timestamp()
)
end_timestamp = int(
    datetime.datetime.strptime(end_date, '%Y-%m-%d')
    .replace(tzinfo=datetime.timezone.utc)
    .timestamp()
)

# Fetch Reddit data for each subreddit
page_size = 100  # posts requested per call; a full page means more may remain

for subreddit in subreddits:
    print(f"Fetching posts from r/{subreddit}...")

    # Initialize an empty list to store all posts
    all_posts = []
    all_comments = []
    seen_post_ids = set()

    # Fetch posts in chunks
    current_timestamp = start_timestamp

    while current_timestamp < end_timestamp:
        # Calculate end of chunk (30 days later)
        chunk_end = min(current_timestamp + 30*24*60*60, end_timestamp)

        # Page through the chunk. A single request returns at most page_size
        # posts, so stopping after one request would silently drop everything
        # beyond the first page of each 30-day window.
        posts = []
        page_start = current_timestamp
        while True:
            page = fetch_reddit_data(subreddit, page_start, chunk_end, limit=page_size)

            # Skip posts already collected, in case a boundary post is
            # returned again by the next request.
            fresh = [post for post in page if post.get('id') not in seen_post_ids]
            seen_post_ids.update(post['id'] for post in fresh if 'id' in post)
            posts.extend(fresh)

            if len(page) < page_size or not fresh:
                break

            # Results are requested in ascending created_utc order, so the
            # newest timestamp on this page is where the next page starts.
            # NOTE(review): assumes the API treats "after" as exclusive;
            # posts sharing the boundary second could be missed - confirm.
            newest = max(int(post.get('created_utc', page_start)) for post in page)
            if newest <= page_start:
                break  # no forward progress; avoid looping forever
            page_start = newest

            # Sleep to avoid hitting API rate limits
            time.sleep(1)

        if posts:
            all_posts.extend(posts)

            # Fetch comments for each post
            for post in tqdm(posts, desc=f"Fetching comments for r/{subreddit} posts"):
                if 'id' in post:
                    comments = fetch_reddit_comments(f"t3_{post['id']}")
                    all_comments.extend(comments)

            # Save intermediate results
            save_to_parquet(all_posts, f"{data_dir}/{subreddit}_posts.parquet")
            save_to_parquet(all_comments, f"{data_dir}/{subreddit}_comments.parquet")

        # Move to next chunk
        current_timestamp = chunk_end

        # Sleep to avoid hitting API rate limits
        time.sleep(1)

    print(f"Collected {len(all_posts)} posts and {len(all_comments)} comments from r/{subreddit}")

# Fetch stock data
print(f"Fetching stock data for {ticker}...")
stock_data = fetch_stock_data(ticker, start_date, end_date)
if stock_data.empty:
    # fetch_stock_data returns an empty frame when the download fails.
    # Stop here with a clear message instead of failing later on a
    # missing price column.
    raise RuntimeError(f"No stock data returned for {ticker}; cannot continue")
stock_data.to_parquet(f"{data_dir}/{ticker}_stock_data.parquet")

# Get trading days
trading_days = get_trading_days(start_date, end_date)

# Process Reddit data
all_sentiment_data = []

for subreddit in subreddits:
    # Posts first, then comments, for each subreddit
    for data_type in ('posts', 'comments'):
        source_path = f"{data_dir}/{subreddit}_{data_type}.parquet"

        # The collection step only writes a file once it has fetched
        # something, so a missing or empty file is skipped, not fatal.
        if not os.path.exists(source_path):
            print(f"Skipping r/{subreddit} {data_type}: {source_path} not found")
            continue

        raw_df = pd.read_parquet(source_path)
        if raw_df.empty or 'created_utc' not in raw_df.columns:
            print(f"Skipping r/{subreddit} {data_type}: no usable records in {source_path}")
            continue

        # Clean and score the records
        processed = process_reddit_data(raw_df)
        if 'compound_sentiment' not in processed.columns:
            print(f"Skipping r/{subreddit} {data_type}: no text columns to score")
            continue

        # Aggregate daily sentiment
        daily_sentiment = aggregate_daily_sentiment(processed)

        # Tag each row with its origin
        daily_sentiment['subreddit'] = subreddit
        daily_sentiment['data_type'] = data_type

        # Append to all sentiment data
        all_sentiment_data.append(daily_sentiment)

if not all_sentiment_data:
    raise RuntimeError("No Reddit sentiment data available; nothing to model")

# Combine all sentiment data
combined_sentiment = pd.concat(all_sentiment_data, ignore_index=True)

# Save combined sentiment data
combined_sentiment.to_parquet(f"{data_dir}/combined_sentiment.parquet")

# Prepare stock features
stock_data = prepare_stock_features(stock_data)

# Merge sentiment with stock data
merged_data = merge_sentiment_stock_data(combined_sentiment, stock_data, trading_days)

# Put the rows in chronological order before modelling. combined_sentiment is
# a concatenation of one block per subreddit and data type, and the
# time-series cross-validation used below assumes earlier rows come first.
merged_data = merged_data.sort_values('next_trading_day', kind='stable').reset_index(drop=True)

# NOTE(review): each trading day can appear several times (once per subreddit
# and data type) with the same target, so rows from one day may land on both
# sides of a cross-validation split. Consider one row per trading day.

# Save merged data
merged_data.to_csv(f"{data_dir}/merged_data.csv", index=False)

# Define features for modeling
sentiment_features = [
    'compound_sentiment_mean',
    'compound_sentiment_std',
    'compound_sentiment_count'
]

stock_features = [
    'prev_return',
    'ma_crossover'
]

all_features = sentiment_features + stock_features
target = 'price_direction'

# Train and evaluate model
model, model_metrics = train_evaluate_model(merged_data, all_features, target)

# Compare with baseline
baseline_metrics = compare_with_baseline(merged_data, target)

# Print results
print("Model Performance:")
for metric, value in model_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nBaseline Performance:")
for metric, value in baseline_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save model
import pickle
with open(f"{data_dir}/sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

print(f"Model saved to {data_dir}/sentiment_model.pkl")

### Streamlit Dashboard Code

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import yfinance as yf
from datetime import datetime, timedelta

In [None]:
# Load model
@st.cache_resource
def load_model():
    """Unpickle and return the object stored in data/sentiment_model.pkl.

    Unpickling executes code embedded in the file, so only a model file
    produced by this project's own pipeline should be loaded.
    """
    with open("data/sentiment_model.pkl", "rb") as model_file:
        return pickle.load(model_file)

# Load data
@st.cache_data
def load_data():
    """Read data/merged_data.csv and return it as a DataFrame."""
    merged = pd.read_csv("data/merged_data.csv")
    return merged

# Fetch recent sentiment data
@st.cache_data
def get_recent_sentiment():
    """Return the rows of the merged dataset for its 7 most recent dates.

    Rows are ordered newest first, so ``.iloc[0]`` is a row from the latest
    date. The merged data can hold several rows per date (one per subreddit
    and data type), so selection is by distinct date: taking the first 7
    rows would cover fewer than 7 days.
    """
    # In a real application, this would fetch new data from Reddit
    # For this demo, we'll use the most recent data from our dataset
    data = load_data()
    newest_first = data.sort_values('date', ascending=False)
    recent_dates = newest_first['date'].drop_duplicates().head(7)
    return newest_first[newest_first['date'].isin(recent_dates)]

# Fetch recent stock data
# The cache expires after an hour; without a ttl the "recent" prices would
# never refresh while the app keeps running.
@st.cache_data(ttl=3600)
def get_recent_stock_data():
    """Download the last 30 calendar days of AMD prices from Yahoo Finance.

    Returns a DataFrame with flat column names that includes 'Adj Close',
    which the prediction and chart code read.
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=30)

    # auto_adjust=False keeps the separate 'Adj Close' column; recent
    # yfinance releases omit it by default.
    recent = yf.download('AMD', start=start_date, end=end_date, auto_adjust=False)

    # Recent yfinance releases return (field, ticker) MultiIndex columns even
    # for one ticker; flatten so recent['Adj Close'] is a Series.
    if isinstance(recent.columns, pd.MultiIndex):
        recent.columns = recent.columns.get_level_values(0)

    return recent

In [None]:
# Make prediction
def predict_next_day(model, sentiment_data, stock_data):
    """Build one feature row from the latest data and classify it.

    Sentiment features come from the first row of ``sentiment_data``.
    'prev_return' is the last percentage change of 'Adj Close', and
    'ma_crossover' is 1 when the last 'Adj Close' is above its 5-row
    moving average, else 0.

    Returns a tuple (predicted label, class probabilities) taken from the
    first element of ``model.predict`` and ``model.predict_proba``.
    """
    sentiment_cols = (
        'compound_sentiment_mean',
        'compound_sentiment_std',
        'compound_sentiment_count',
    )
    features = {col: sentiment_data[col].iloc[0] for col in sentiment_cols}

    adj_close = stock_data['Adj Close']
    last_close = adj_close.iloc[-1]
    last_ma5 = adj_close.rolling(window=5).mean().iloc[-1]

    features['prev_return'] = adj_close.pct_change().iloc[-1]
    features['ma_crossover'] = (last_close > last_ma5).astype(int)

    # Single-row frame; column order follows the insertion order above
    features_df = pd.DataFrame([features])

    predicted_label = model.predict(features_df)[0]
    class_probabilities = model.predict_proba(features_df)[0]

    return predicted_label, class_probabilities

### Dashboard Setup and Data Loading

In [None]:
# Set up the dashboard title
st.title("Reddit Sentiment & AMD Stock Prediction")

# Load data and model
# These four calls go through the loader functions decorated with Streamlit
# cache decorators earlier in this dashboard script.
model = load_model()
data = load_data()  # full merged table; not referenced again in the cells shown here
recent_sentiment = get_recent_sentiment()
recent_stock = get_recent_stock_data()

# Make prediction
# predict_next_day returns the predicted label and the per-class probabilities.
prediction, probabilities = predict_next_day(model, recent_sentiment, recent_stock)

### Prediction Display

In [None]:
# Display prediction
st.header("Next Day Prediction")

# Work out the three headline values first, then render them side by side.
adj_close = recent_stock['Adj Close']
prev_day_return = adj_close.pct_change().iloc[-1] * 100

metric_cards = [
    ("Prediction", prediction.upper()),
    ("Current Price", f"${adj_close.iloc[-1]:.2f}"),
    ("Previous Day Return", f"{prev_day_return:.2f}%"),
]

col1, col2, col3 = st.columns(3)

# One metric per column, left to right
for column, (card_label, card_value) in zip((col1, col2, col3), metric_cards):
    with column:
        st.metric(card_label, card_value)

### Prediction Probability Visualization

In [None]:
# Display probability chart
st.subheader("Prediction Probabilities")

fig, ax = plt.subplots(figsize=(10, 5))

labels = model.classes_
probs = probabilities * 100

# Pick each bar's colour from its class label rather than its position, so
# the colours stay correct if the model was trained without one of the
# classes. Unknown labels get a neutral colour.
class_colors = {'down': 'red', 'flat': 'gray', 'up': 'green'}
bar_colors = [class_colors.get(str(label), 'steelblue') for label in labels]

ax.bar(labels, probs, color=bar_colors)
ax.set_ylabel('Probability (%)')
ax.set_ylim(0, 100)

# Print the percentage just above each bar
for i, v in enumerate(probs):
    ax.text(i, v + 1, f"{v:.1f}%", ha='center')

st.pyplot(fig)
# The script reruns on every interaction, so close the figure once it is
# rendered to keep open matplotlib figures from accumulating.
plt.close(fig)

### Sentiment Trend Visualization

In [None]:
# Display recent sentiment
st.header("Recent Reddit Sentiment")

# Plot sentiment over time
fig, ax = plt.subplots(figsize=(10, 5))

sentiment_plot_data = recent_sentiment.sort_values('date')

# Parse the dates (they are plain strings after the CSV round trip) so the
# x-axis is a real time axis, and average rows that share a date so each day
# is drawn as one point. With one row per date the values are unchanged.
plot_dates = pd.to_datetime(sentiment_plot_data['date'])
daily_mean = sentiment_plot_data.groupby(plot_dates)['compound_sentiment_mean'].mean()

ax.plot(daily_mean.index, daily_mean.values,
        marker='o', linestyle='-', label='Mean Sentiment')

ax.axhline(y=0, color='r', linestyle='--', alpha=0.3)
ax.set_ylabel('Compound Sentiment Score')
ax.set_title('Reddit Sentiment (Last 7 Days)')
ax.grid(True, alpha=0.3)
fig.autofmt_xdate()

st.pyplot(fig)
# Close the figure after rendering so open matplotlib figures do not
# accumulate across script reruns.
plt.close(fig)

### Stock Price Visualization

In [None]:
# Display recent stock price
st.header("Recent AMD Stock Price")

fig, ax = plt.subplots(figsize=(10, 5))

# Adjusted close over the downloaded window, indexed by date
ax.plot(recent_stock.index, recent_stock['Adj Close'], label='AMD')
ax.set_ylabel('Price ($)')
ax.set_title('AMD Stock Price (Last 30 Days)')
ax.grid(True, alpha=0.3)

st.pyplot(fig)
# Close the figure after rendering so open matplotlib figures do not
# accumulate across script reruns.
plt.close(fig)

### Raw Data Display

In [None]:
# Display raw data
# Each table sits inside its own st.expander block.
with st.expander("View Raw Sentiment Data"):
    # Rows returned by get_recent_sentiment()
    st.dataframe(recent_sentiment)

with st.expander("View Raw Stock Data"):
    # Price table returned by get_recent_stock_data()
    st.dataframe(recent_stock)

### Conclusion

This project has built a lightweight, reproducible pipeline that:
1. Collects Reddit posts and comments from r/AMD and r/StockMarket
2. Analyzes sentiment using the VADER lexicon
3. Merges sentiment data with AMD stock prices
4. Trains a multinomial logistic regression model to predict price direction
5. Creates a Streamlit dashboard for visualization

The model achieves [metrics] performance, which is [comparison] to the naive baseline.

### Limitations
- VADER may not accurately capture sarcasm and slang
- Pushshift API availability and rate limits (the collection code queries Pushshift rather than the official Reddit API)
- Class imbalance in stock price movements
- Short-term market movements are inherently noisy

### Next Steps
- Deploy the Streamlit dashboard
- Implement daily data collection for real-time predictions
- Explore more advanced NLP techniques
- Add more technical indicators as features
- Test the approach on other stocks