# Hybrid Mode - Dataset Creation 
We want to use the following structure to train our model:

Input:
- idea_text: "AI-powered e-commerce platform..."
- static_features: [market_size, investment, competition, team_strength]
- historical_stock: [month_1, month_2, ..., month_36]

Target:
- stock_performance (next 12 months): [month_37, month_38, ..., month_48]

---


Once trained, our model should then produce the following output:

Input:
- idea_text: "AI-powered e-commerce platform..."

Optional Input:
- static_features: [market_size, investment, competition, team_strength]

Output:
- Predicted stock performance (next 12 months): [month_1, month_2, ..., month_12]


### Huggingface Dataset
We will use the shortbread/tickers dataset to get tickers of US companies

In [1]:
from datasets import load_dataset

# Load the dataset
def get_tickers():
    """Load the shortbread/tickers dataset and return its ticker symbols.

    The "train" split is loaded, every column other than 'symbol' is
    dropped, and the remaining 'symbol' column is printed and returned.

    Returns:
        The contents of the 'symbol' column of the dataset.
    """
    tickers_split = load_dataset("shortbread/tickers", split="train")

    # Only the ticker symbol is needed; collect every other column for removal.
    unwanted_columns = [
        name for name in tickers_split.column_names if name != 'symbol'
    ]
    symbols = tickers_split.remove_columns(unwanted_columns)['symbol']

    # Echo the symbols so the notebook shows what was loaded.
    print(symbols)
    return symbols

### Fetching additional data
We use the dataset from Hugging Face as a starting point and now provide methods to fetch additional data.

In [2]:
import pandas as pd
import yfinance as yf
import os
import time

# Function to fetch stock performance data
def fetch_stock_performance(ticker, months=12):
    """Return the trailing monthly closing prices for ``ticker``.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.
        months: Maximum number of trailing monthly closes to return.

    Returns:
        A list with at most ``months`` non-NaN closing prices, in the order
        yfinance returns them. An empty list when ``months`` is not positive
        or no price data is available. ``None`` when fetching raised.
    """
    try:
        # `series[-0:]` selects the *whole* history, so a non-positive
        # request must be answered explicitly.
        if months <= 0:
            return []

        stock = yf.Ticker(ticker)
        # Monthly bars over five years; only the trailing `months` are kept.
        hist = stock.history(period="5y", interval="1mo")
        if hist.empty:
            # Recently listed symbols are rejected with "Period '5y' is
            # invalid"; 'max' is among the periods that message lists as
            # valid. Note this costs one extra request for symbols that
            # genuinely have no data (e.g. delisted ones).
            hist = stock.history(period="max", interval="1mo")

        # Drop missing closes so NaN never ends up in the dataset.
        closes = hist['Close'].dropna()
        return closes.tail(months).tolist()
    except Exception as e:
        # Best-effort per ticker: report and let the caller skip this symbol.
        print(f"Error fetching stock performance for {ticker}: {e}")
        return None

# Function to fetch business description
def fetch_business_description(ticker):
    """Return the "longBusinessSummary" entry of the ticker's yfinance info.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.

    Returns:
        The summary value, or "No description available." when the key is
        absent or the lookup raised.
    """
    fallback = "No description available."
    try:
        info = yf.Ticker(ticker).info
        return info.get("longBusinessSummary", fallback)
    except Exception as e:
        # Best-effort: a failed lookup must not abort the whole run.
        print(f"Error fetching business description for {ticker}: {e}")
        return fallback

# Function to fetch market size (revenue)
def fetch_market_size(ticker):
    """Return the ticker's total revenue, used as the market-size feature.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.

    Returns:
        The first entry of the "Total Revenue" row of the ticker's
        financials (presumably the most recent reporting period), or 0 when
        the lookup raised.
    """
    try:
        financials = yf.Ticker(ticker).financials
        total_revenue_row = financials.loc["Total Revenue"]
        return total_revenue_row.iloc[0]
    except Exception as e:
        # Best-effort: fall back to 0 so the row can still be built.
        print(f"Error fetching market size for {ticker}: {e}")
        return 0

# Function to fetch investment (total assets)
def fetch_investment(ticker):
    """Return the ticker's total assets, used as the investment feature.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.

    Returns:
        The first entry of the "Total Assets" row of the ticker's balance
        sheet (presumably the most recent reporting period), or 0 when the
        lookup raised.
    """
    try:
        balance_sheet = yf.Ticker(ticker).balance_sheet
        total_assets_row = balance_sheet.loc["Total Assets"]
        return total_assets_row.iloc[0]
    except Exception as e:
        # Best-effort: fall back to 0 so the row can still be built.
        print(f"Error fetching investment for {ticker}: {e}")
        return 0

# Function to calculate competition (market share)
def fetch_competition(ticker, industry_revenue):
    """Return a competition index derived from the ticker's market share.

    The index is ``1 - revenue / industry_revenue``, so a smaller market
    share yields a larger index. A non-positive ``industry_revenue`` is
    treated as a market share of 0.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.
        industry_revenue: Total revenue of the industry the ticker is
            compared against.

    Returns:
        The competition index, or 0 when the lookup or computation raised.
    """
    try:
        # First entry of the "Total Revenue" row of the ticker's financials.
        revenue = yf.Ticker(ticker).financials.loc["Total Revenue"].iloc[0]

        if industry_revenue > 0:
            market_share = revenue / industry_revenue
        else:
            market_share = 0

        return 1 - market_share
    except Exception as e:
        # Best-effort: a failed lookup must not abort the whole run.
        print(f"Error fetching competition for {ticker}: {e}")
        return 0

# Function to fetch team strength (number of employees)
def fetch_team_strength(ticker):
    """Return the ticker's full-time employee count as the team-strength feature.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.

    Returns:
        The "fullTimeEmployees" entry of the ticker's yfinance info, or 0
        when the key is absent or the lookup raised.
    """
    try:
        info = yf.Ticker(ticker).info
        return info.get("fullTimeEmployees", 0)
    except Exception as e:
        # Best-effort: fall back to 0 so the row can still be built.
        print(f"Error fetching team strength for {ticker}: {e}")
        return 0

# Main function to create the dataset
def create_real_dataset(tickers, months=12):
    """Build one row per ticker with description, static features and prices.

    Tickers for which no stock performance could be fetched are skipped.

    The returned frame always has the same columns in the same order,
    regardless of how many months each ticker actually returned. This keeps
    frames from different batches compatible when they are appended to one
    CSV file without a header. Tickers with fewer than ``months`` prices
    leave their trailing month columns empty (NaN).

    Args:
        tickers: Iterable of ticker symbols.
        months: Number of ``month_<i>_performance`` columns to produce; also
            passed to ``fetch_stock_performance`` as the number of trailing
            months to request.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ticker``,
        ``business_description``, ``market_size``, ``investment``,
        ``team_strength`` and ``month_1_performance`` through
        ``month_<months>_performance``.
    """
    static_columns = [
        "ticker",
        "business_description",
        "market_size",
        "investment",
        "team_strength",
    ]
    month_columns = [f"month_{i + 1}_performance" for i in range(months)]

    rows = []
    for ticker in tickers:
        print(f"Processing {ticker}...")

        # Without price data the row is useless as a training sample.
        stock_performance = fetch_stock_performance(ticker, months)
        if not stock_performance:
            continue

        row = {
            "ticker": ticker,
            "business_description": fetch_business_description(ticker),
            "market_size": fetch_market_size(ticker),
            "investment": fetch_investment(ticker),
            "team_strength": fetch_team_strength(ticker),
        }
        # zip stops at the shorter side, so short histories simply leave the
        # remaining month columns unset.
        row.update(zip(month_columns, stock_performance))
        rows.append(row)

    # Passing `columns` pins the schema even for empty or ragged input.
    return pd.DataFrame(rows, columns=static_columns + month_columns)

def process_tickers_in_batches(tickers, batch_size=100, output_file="Data/real_company_stock_dataset.csv", months=24):
    """Build the dataset batch by batch and append each batch to a CSV file.

    Each batch is turned into a DataFrame by ``create_real_dataset``. The
    first non-empty batch creates ``output_file`` (with a header row); later
    batches are appended without a header. If ``output_file`` already exists
    when this function starts, all batches are appended to it.

    Args:
        tickers: Sequence of ticker symbols (must support ``len`` and slicing).
        batch_size: Number of tickers per batch.
        output_file: Path of the CSV file to create or append to.
        months: Forwarded to ``create_real_dataset``.
    """
    # len(range(...)) is the exact batch count; `len // batch_size + 1`
    # over-reports by one whenever the length is a multiple of batch_size.
    batch_starts = range(0, len(tickers), batch_size)
    total_batches = len(batch_starts)

    for batch_number, start in enumerate(batch_starts, start=1):
        batch = tickers[start:start + batch_size]
        print(f"Processing batch {batch_number} of {total_batches}...")

        # Process the batch using the existing method
        batch_data = create_real_dataset(batch, months=months)

        # Save the batch to the output file
        if not batch_data.empty:
            # to_csv does not create missing parent directories.
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            if not os.path.exists(output_file):
                print("Creating output file...")
                batch_data.to_csv(output_file, index=False)
            else:
                print("Appending to output file...")
                batch_data.to_csv(output_file, mode='a', header=False, index=False)

        print(f"Batch {batch_number} processed and saved.")

        # Pause between batches to respect API rate limits; there is nothing
        # to wait for after the final batch.
        if batch_number < total_batches:
            time.sleep(10)  # Adjust as needed

# Example usage
if __name__ == "__main__":
    # Load the ticker symbols from the shortbread/tickers dataset
    fetched_tickers = get_tickers()

    # Process tickers in batches of 100; results are written to the default
    # output file of process_tickers_in_batches
    process_tickers_in_batches(fetched_tickers, batch_size=100)
    print("Dataset creation complete!")


['A', 'AA', 'AAC', 'AACG', 'AACI', 'AACT', 'AADI', 'AAIC', 'AAL', 'AAMC', 'AAME', 'AAN', 'AAOI', 'AAON', 'AAP', 'AAPL', 'AAT', 'AAU', 'AB', 'ABBV', 'ABC', 'ABCB', 'ABCL', 'ABCM', 'ABEO', 'ABEV', 'ABG', 'ABIO', 'ABL', 'ABLV', 'ABM', 'ABNB', 'ABOS', 'ABR', 'ABSI', 'ABST', 'ABT', 'ABUS', 'ABVC', 'AC', 'ACA', 'ACAB', 'ACAC', 'ACAD', 'ACAH', 'ACAQ', 'ACAX', 'ACB', 'ACBA', 'ACCD', 'ACCO', 'ACDC', 'ACEL', 'ACER', 'ACET', 'ACGL', 'ACGLO', 'ACGN', 'ACHC', 'ACHL', 'ACHR', 'ACHV', 'ACI', 'ACIC', 'ACIU', 'ACIW', 'ACLS', 'ACLX', 'ACM', 'ACMR', 'ACN', 'ACNB', 'ACNT', 'ACON', 'ACOR', 'ACP', 'ACR', 'ACRE', 'ACRO', 'ACRS', 'ACRV', 'ACRX', 'ACST', 'ACT', 'ACTG', 'ACU', 'ACV', 'ACVA', 'ACWV', 'ACXP', 'ADAG', 'ADAP', 'ADBE', 'ADC', 'ADCT', 'ADD', 'ADEA', 'ADER', 'ADES', 'ADEX', 'ADI', 'ADIL', 'ADM', 'ADMA', 'ADMP', 'ADN', 'ADNT', 'ADOC', 'ADP', 'ADPT', 'ADRT', 'ADSE', 'ADSK', 'ADT', 'ADTH', 'ADTN', 'ADTX', 'ADUS', 'ADV', 'ADVM', 'ADX', 'ADXN', 'AE', 'AEAE', 'AEE', 'AEF', 'AEG', 'AEHL', 'AEHR', 'AEI', 'AEI

$AAC: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing AACG...
Processing AACI...


$AACI: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")
AACT: Period '5y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', 'ytd', 'max']


Processing AACT...
Processing AADI...
Processing AAIC...


$AAIC: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing AAL...
Processing AAMC...


$AAMC: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing AAME...
Processing AAN...
Processing AAOI...
Processing AAON...
Processing AAP...
Processing AAPL...
Processing AAT...
Processing AAU...


$AAU: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing AB...
Processing ABBV...
Processing ABC...


$ABC: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ABCB...
Processing ABCL...
Processing ABCM...


$ABCM: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ABEO...
Processing ABEV...
Processing ABG...
Processing ABIO...


$ABIO: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ABL...


ABLV: Period '5y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', 'ytd', 'max']


Processing ABLV...
Processing ABM...
Processing ABNB...
Processing ABOS...
Processing ABR...
Processing ABSI...
Processing ABST...


$ABST: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ABT...
Processing ABUS...
Processing ABVC...
Processing AC...
Processing ACA...
Processing ACAB...
Processing ACAC...
Processing ACAD...
Processing ACAH...


$ACAH: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACAQ...


$ACAQ: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACAX...


$ACAX: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACB...
Processing ACBA...


$ACBA: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACCD...
Processing ACCO...
Processing ACDC...
Processing ACEL...
Processing ACER...


$ACER: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACET...
Processing ACGL...
Processing ACGLO...
Processing ACGN...


$ACGN: possibly delisted; no price data found  (period=5y) (Yahoo error = "No data found, symbol may be delisted")


Processing ACHC...


KeyboardInterrupt: 