# Breakout Stock Classifier: Scaffolding and Expansion

This notebook scaffolds a modular backend for a breakout stock classifier, splits the workflow into separate model components, adds a helper for inserting or updating individual data points, presents an editable training cell, and extends the workflow to handle many more stocks.

## Google Colab: Uploading Your CSV

If you are using Google Colab, you can upload your `stocks-list.csv` file directly to the Colab runtime with the following code cell:

```python
from google.colab import files
uploaded = files.upload()  # This will prompt you to select and upload your CSV file
```

- After uploading, the file will be in the current working directory.
- If your code expects the file in a `data/` folder, move it with:

```python
import os
os.makedirs('data', exist_ok=True)
os.replace('stocks-list.csv', 'data/stocks-list.csv')
```

Alternatively, you can mount your Google Drive and access files from there:

```python
from google.colab import drive
drive.mount('/content/drive')
# Then use the path: '/content/drive/My Drive/path/to/stocks-list.csv'
```

Adjust your code to use the correct path depending on your upload method.

## 1. Scaffold Model Backend
Set up the basic backend structure for the breakout classifier, including imports and class/function definitions.

In [None]:
# Imports and backend scaffolding
import pandas as pd
import numpy as np
from typing import List, Dict
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import joblib

# Placeholder for backend class
class BreakoutStockClassifier:
    """Small wrapper around an XGBoost binary classifier for breakout labels.

    The wrapper owns a single estimator (``self.model``) and exposes
    fit / predict / save / load helpers around it.
    """

    def __init__(self):
        # Underlying estimator; stays None until fit() or load() has run.
        self.model = None
        # Column names of the training matrix from the last fit() call when X
        # exposed ``.columns`` (e.g. a pandas DataFrame); None otherwise.
        self.features = None

    def fit(self, X, y):
        """Train a fresh XGBClassifier on ``X``/``y``, replacing any previous model."""
        self.model = XGBClassifier(n_estimators=200, max_depth=5, random_state=42)
        self.model.fit(X, y)
        columns = getattr(X, 'columns', None)
        self.features = list(columns) if columns is not None else None

    def predict(self, X):
        """Return the positive-class probability for each row of ``X``.

        Returns None when no model has been fitted or loaded yet.
        """
        # Compare against None explicitly: an estimator object may define
        # __len__/__bool__ and evaluate as falsy even though it is usable.
        if self.model is None:
            return None
        return self.model.predict_proba(X)[:, 1]

    def save(self, path):
        """Persist the underlying estimator to ``path`` with joblib."""
        joblib.dump(self.model, path)

    def load(self, path):
        """Replace the current estimator with the one stored at ``path``.

        NOTE: joblib files are pickle-based; only load files you trust.
        """
        self.model = joblib.load(path)
        # The training columns of a loaded model are not stored in the file.
        self.features = None


In [None]:
# --- Data download using your own ticker CSV (no Yahoo ticker scraping needed) ---
import yfinance as yf

# Function to download historical data for a list of tickers
# (tickers should come from your CSV, not Yahoo API)
def download_bulk_stock_data(tickers, start="2015-01-01", end=None, auto_adjust=True):
    """Download price history for each ticker and stack it into one DataFrame.

    Args:
        tickers: Iterable of ticker symbols (e.g. loaded from your own CSV).
        start: First date to request; passed through to ``yf.download``.
        end: Last date to request; passed through to ``yf.download``.
        auto_adjust: Passed through to ``yf.download``.

    Returns:
        One DataFrame with all column names lowercased (including the index
        column restored by ``reset_index``) plus a ``symbol`` column. Tickers
        that fail, return no rows, or lack a ``close`` column are skipped with
        a printed message. An empty DataFrame is returned when nothing usable
        was downloaded.
    """
    data = {}
    for ticker in tickers:
        try:
            df = yf.download(ticker, start=start, end=end, progress=False, auto_adjust=auto_adjust)
            if df.empty:
                print(f"No data for ticker {ticker}, skipping.")
                continue
            # Some yfinance versions return two-level (field, ticker) columns
            # even for a single ticker; keep only the field level so the
            # labels are plain strings.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            # Restore the index as a column *before* lowercasing so it is
            # normalised together with the price columns.
            df = df.reset_index()
            df.columns = [str(col).lower() for col in df.columns]
            if 'close' not in df.columns:
                print(f"Ticker {ticker} missing 'close' column, skipping.")
                continue
            df['symbol'] = ticker
            data[ticker] = df
        except Exception as e:
            # Best-effort bulk download: one bad ticker must not abort the run.
            print(f"Failed to download {ticker}: {e}")
    if not data:
        return pd.DataFrame()
    return pd.concat(data.values(), ignore_index=True)

In [None]:
# --- Load a comprehensive ticker list from an external CSV ---
import os

def get_tickers_from_csv(csv_path: str) -> list:
    """Load the unique, non-missing ticker symbols listed in a CSV file.

    The ticker column is located by header name. The exact spellings
    'symbol', 'ticker', 'Ticker', 'SYMBOL', 'Symbol' are tried first, in that
    order. If none is present, the first column whose header equals 'symbol'
    (then 'ticker') ignoring case and surrounding whitespace is used.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        List of unique values from the ticker column, in order of first
        appearance, with missing values dropped.

    Raises:
        ValueError: If no ticker column can be found.
    """
    import pandas as pd
    df = pd.read_csv(csv_path)

    # Exact spellings first, to keep the historical precedence.
    exact_names = ['symbol', 'ticker', 'Ticker', 'SYMBOL', 'Symbol']
    column = next((name for name in exact_names if name in df.columns), None)

    # Fallback: tolerate other capitalisations and padded headers
    # (e.g. 'TICKER' or ' Symbol ').
    if column is None:
        for wanted in ('symbol', 'ticker'):
            column = next(
                (col for col in df.columns if str(col).strip().lower() == wanted),
                None,
            )
            if column is not None:
                break

    if column is None:
        raise ValueError(f"No ticker column found in {csv_path}. Columns found: {df.columns.tolist()}")
    return df[column].dropna().unique().tolist()

# Example usage:
# Download a full US stock list from NASDAQ, NYSE, AMEX, or use a third-party source like 'eodhistoricaldata.com', 'nasdaqtrader.com', or 'stockanalysis.com'.
# Place the CSV in your data directory, e.g., 'data/all_us_tickers.csv'.
# all_tickers = get_tickers_from_csv('data/all_us_tickers.csv')


## 2. Break Out Model Components
Separate the workflow into modular functions for data loading, preprocessing, model definition, and evaluation.

In [None]:
# Data loading function
def load_stock_data(csv_path: str) -> pd.DataFrame:
    """Read the CSV file at ``csv_path`` into a DataFrame with pandas' default settings."""
    frame = pd.read_csv(csv_path)
    return frame

# Preprocessing function
def preprocess_data(df: pd.DataFrame, horizon: int = 30, threshold: float = 0.6) -> pd.DataFrame:
    """Add a forward-return column and a binary breakout label.

    Args:
        df: Price data with a ``close`` column. Rows are assumed to be in
            chronological order (within each symbol when a ``symbol`` column
            is present).
        horizon: Number of rows to look ahead when computing the forward
            return. Defaults to 30.
        threshold: A row is labelled a breakout when its forward return is
            strictly greater than this value. Defaults to 0.6 (60%).

    Returns:
        A copy of ``df`` with two added columns:
        ``forward_return_{horizon}d`` (``forward_return_30d`` by default) and
        ``breakout`` (1 or 0). Rows without a price ``horizon`` rows ahead get
        a NaN forward return and are labelled 0.
    """
    df = df.copy()
    close = df['close']
    if 'symbol' in df.columns:
        # Look ahead within each ticker only; shifting the whole column would
        # pair the tail of one ticker with the prices of the next one.
        future_close = df.groupby('symbol', sort=False)['close'].shift(-horizon)
    else:
        future_close = close.shift(-horizon)

    return_col = f'forward_return_{horizon}d'
    df[return_col] = future_close / close - 1
    # NaN > threshold evaluates to False, so unlabeled tail rows become 0.
    df['breakout'] = (df[return_col] > threshold).astype(int)
    return df

# Model evaluation function
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Print ROC AUC and a classification report for a fitted binary classifier.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True binary labels for ``X_test``.
        threshold: Probability cutoff used to turn scores into class
            predictions for the classification report. Defaults to 0.5.

    Returns:
        The ROC AUC as a float, or NaN when it could not be computed.
    """
    from sklearn.metrics import roc_auc_score, classification_report
    y_pred = model.predict_proba(X_test)[:, 1]
    try:
        auc = roc_auc_score(y_test, y_pred)
    except ValueError as exc:
        # ROC AUC is undefined when y_test holds a single class, which is
        # plausible for a rare breakout label. Report it and still print the
        # classification report instead of aborting.
        auc = float('nan')
        print(f"ROC AUC: undefined ({exc})")
    else:
        print(f"ROC AUC: {auc:.3f}")
    print(classification_report(y_test, y_pred > threshold))
    return auc

## 3. Add Data Point Functionality
Implement a function to add or update individual data points for training or testing.

In [None]:
# Function to add or update a data point
def add_data_point(df: pd.DataFrame, new_row: Dict) -> pd.DataFrame:
    """Return a copy of ``df`` with ``new_row`` inserted or applied as an update.

    A row is identified by its ``date`` and ``symbol`` values. If ``df``
    already holds one or more rows with the same pair, the columns named in
    ``new_row`` are overwritten on those rows and all other columns keep
    their values. Otherwise ``new_row`` is appended as a new last row.

    Args:
        df: Existing data. It is not modified.
        new_row: Mapping of column name to scalar value; must contain
            ``'date'`` and ``'symbol'``.

    Returns:
        The updated copy.
    """
    df = df.copy()
    if 'date' in df.columns and 'symbol' in df.columns:
        mask = (df['date'] == new_row['date']) & (df['symbol'] == new_row['symbol'])
    else:
        # A frame without the key columns (e.g. a brand-new empty frame)
        # cannot contain a match, so the row is appended.
        mask = pd.Series(False, index=df.index)

    if mask.any():
        # Assign column by column. Assigning a one-row DataFrame through .loc
        # aligns on its index label (0) and would not update rows that carry
        # any other label.
        for column, value in new_row.items():
            df.loc[mask, column] = value
    else:
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    return df

## 4. Display and Edit Training Cell
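This cell trains the breakout classifier on a CSV of historical prices; the file must contain `open`, `high`, `low`, `close`, and `volume` columns (any capitalisation). You can edit the model parameters or the code as needed.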

In [None]:
# Training cell: edit parameters as needed
# Purpose: load one CSV of price rows, label breakouts, train the classifier,
# print evaluation metrics and save the fitted model to disk.
# The names bound here (df, features, X, y, clf, ...) are module-level and
# stay available to cells that run afterwards.

# Load and preprocess data
# NOTE(review): other cells in this notebook treat 'data/stocks-list.csv' as a
# list of tickers rather than price history - confirm this is the intended file.
df = load_stock_data('data/stocks-list.csv')

# Ensure columns are lowercase for consistency
df.columns = [col.lower() for col in df.columns]

# Check for 'close' column before proceeding
# (preprocess_data reads df['close'], so fail early with a readable message)
if 'close' not in df.columns:
    raise ValueError(f"'close' column not found in data. Columns available: {df.columns.tolist()}")

# Adds the forward-return and 'breakout' label columns
df = preprocess_data(df)

# Select features and target
features = ['open', 'high', 'low', 'close', 'volume']  # Add more features as needed
X = df[features]
y = df['breakout']

# Train/test split: 20% of rows held out, fixed seed for repeatability
# NOTE(review): presumably the rows are shuffled before splitting, so test rows
# may predate training rows - confirm this is acceptable for time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
clf = BreakoutStockClassifier()
clf.fit(X_train, y_train)

# Evaluate (prints ROC AUC and a classification report)
evaluate_model(clf.model, X_test, y_test)

# Save model (written relative to the current working directory)
clf.save('breakout_classifier_xgb.pkl')


# --- Data download using your own ticker CSV (no Yahoo ticker scraping needed) ---
import yfinance as yf

# Function to download historical data for a list of tickers
# (tickers should come from your CSV, not Yahoo API)
def download_bulk_stock_data(tickers, start="2015-01-01", end=None, auto_adjust=True):
    """Download price history for each ticker and stack it into one DataFrame.

    This definition mirrors the one in the earlier download cell so that
    re-running this cell does not swap in a version with a different
    signature or capitalised column names.

    Args:
        tickers: Iterable of ticker symbols (e.g. loaded from your own CSV).
        start: First date to request; passed through to ``yf.download``.
        end: Last date to request; passed through to ``yf.download``.
        auto_adjust: Passed through to ``yf.download``.

    Returns:
        One DataFrame with all column names lowercased (including the index
        column restored by ``reset_index``) plus a ``symbol`` column. Tickers
        that fail, return no rows, or lack a ``close`` column are skipped with
        a printed message. An empty DataFrame is returned when nothing usable
        was downloaded.
    """
    data = {}
    for ticker in tickers:
        try:
            df = yf.download(ticker, start=start, end=end, progress=False, auto_adjust=auto_adjust)
            if df.empty:
                print(f"No data for ticker {ticker}, skipping.")
                continue
            # Some yfinance versions return two-level (field, ticker) columns
            # even for a single ticker; keep only the field level so the
            # labels are plain strings.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            # Restore the index as a column *before* lowercasing so it is
            # normalised together with the price columns.
            df = df.reset_index()
            df.columns = [str(col).lower() for col in df.columns]
            if 'close' not in df.columns:
                print(f"Ticker {ticker} missing 'close' column, skipping.")
                continue
            df['symbol'] = ticker
            data[ticker] = df
        except Exception as e:
            # Best-effort bulk download: one bad ticker must not abort the run.
            print(f"Failed to download {ticker}: {e}")
    if not data:
        return pd.DataFrame()
    return pd.concat(data.values(), ignore_index=True)

In [None]:
# Example: Load and process multiple stock files efficiently
# Purpose: read every CSV under data/stocks/, label each file separately,
# combine the results and train/evaluate/save one model on the combined rows.
import glob

all_files = glob.glob('data/stocks/*.csv')  # Folder with many stock CSVs
all_dfs = []
for file in all_files:
    # Each file is loaded and labelled on its own before being combined
    df = load_stock_data(file)
    df = preprocess_data(df)
    all_dfs.append(df)

# NOTE(review): if the glob matches no files, all_dfs is empty and pd.concat
# is expected to raise - confirm the folder is populated before running.
big_df = pd.concat(all_dfs, ignore_index=True)

# Continue as before with big_df
# `features` is not defined in this cell; it must already exist from a
# previously executed cell.
X = big_df[features]
y = big_df['breakout']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = BreakoutStockClassifier()
clf.fit(X_train, y_train)
evaluate_model(clf.model, X_test, y_test)
# Saved under a different file name than the single-file training cell
clf.save('breakout_classifier_xgb_large.pkl')

In [None]:
# --- Refined: Download and process thousands of tickers from Yahoo Finance ---
# 1. Get a large universe of tickers from the uploaded ticker CSV.
#    This cell used to call get_all_us_tickers(), which is not defined
#    anywhere in this notebook; get_tickers_from_csv is used instead.
all_tickers = get_tickers_from_csv('data/stocks-list.csv')
print(f"Total tickers: {len(all_tickers)}")

# 2. Download historical data for all tickers (can take a long time, consider batching)
bulk_df = download_bulk_stock_data(all_tickers[:1000], start="2018-01-01")  # Use a subset for demo

# Stop with a readable message when nothing could be downloaded; the steps
# below need price columns that an empty frame does not have.
if bulk_df.empty:
    raise ValueError("No price data was downloaded; check the ticker list and network access.")

# 3. Preprocess and label breakouts
bulk_df = preprocess_data(bulk_df)

# 4. Continue as before
features = ['open', 'high', 'low', 'close', 'volume']
X = bulk_df[features]
y = bulk_df['breakout']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = BreakoutStockClassifier()
clf.fit(X_train, y_train)
evaluate_model(clf.model, X_test, y_test)
clf.save('breakout_classifier_xgb.pkl')

In [None]:
# --- Load tickers from data/stocks-list.csv and use for training ---
# Purpose: read the ticker list, download price history for up to the first
# 1000 tickers, label breakouts, then train, evaluate and save a model.
from pathlib import Path

# Path to your downloaded ticker CSV
csv_path = Path('data/stocks-list.csv')

# Load tickers
all_tickers = get_tickers_from_csv(str(csv_path))
print(f"Loaded {len(all_tickers)} tickers from {csv_path}")

# Download historical data for these tickers (batch or subset for demo)
bulk_df = download_bulk_stock_data(all_tickers[:1000], start="2018-01-01")  # Adjust slice as needed

# Preprocess and label breakouts
# NOTE(review): preprocess_data reads a lowercase 'close' column, so the
# download_bulk_stock_data definition in effect must return lowercase column
# names - confirm which definition was executed last.
bulk_df = preprocess_data(bulk_df)

# Select features and target
features = ['open', 'high', 'low', 'close', 'volume']
X = bulk_df[features]
y = bulk_df['breakout']

# Train/test split: 20% of rows held out, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
clf = BreakoutStockClassifier()
clf.fit(X_train, y_train)

# Evaluate (prints ROC AUC and a classification report)
evaluate_model(clf.model, X_test, y_test)

# Save model (separate file name from the other training cells)
clf.save('breakout_classifier_xgb_stockslist.pkl')