# Dataset
We will create our own dataset here, which we can then use to train our fine-tuned model.

1. Import the base dataset from HuggingFace.

In [3]:
from datasets import load_dataset

# Columns of the raw ticker dataset that the rest of the notebook relies on
columns_to_keep = ['symbol', 'name', 'close', 'volume', 'market_cap', 'long_business_summary']

# Pull the "train" split of the base dataset from the HuggingFace Hub
dataset = load_dataset("shortbread/tickers", split="train")

# Drop every column outside the keep-list, then expose the business summary as 'idea'
columns_to_drop = [name for name in dataset.column_names if name not in columns_to_keep]
dataset = dataset.remove_columns(columns_to_drop).rename_column("long_business_summary", "idea")

# Show the schema and row count of the trimmed dataset
print(dataset)


Dataset({
    features: ['symbol', 'name', 'close', 'volume', 'market_cap', 'idea'],
    num_rows: 7314
})


In [4]:
import pandas as pd
from pandas import DataFrame
import yfinance as yf
import time
import os

# Ensure the "Data" folder exists
os.makedirs("Data", exist_ok=True)

df = dataset.to_pandas()

# Parameters for date range and batch size
start_date = "2020-01-01"
end_date = "2023-01-01"
batch_size = 50  # Adjust batch size based on rate limits

# Limit dataset for testing (only the last 100 tickers are processed)
df = df[-100:]

# Column layout of the CSV written at the end of this cell
output_columns = ['Ticker', 'Idea', 'StockPrice', 'StockPriceDevelopment',
                  'MarketCap', 'Volume']

# One list per successfully enriched company. The DataFrame is built once
# after the loop instead of being grown row by row.
records = []

# Fetch data in batches
for batch_start in range(0, len(df), batch_size):
    batch = df.iloc[batch_start:batch_start + batch_size]

    for _, row in batch.iterrows():
        symbol = row['symbol']  # Access the ticker symbol

        try:
            # Skip incomplete rows BEFORE downloading. Missing values arrive as
            # NaN/None, never as the string 'nan', so pd.isna is the right test.
            # Only fields that end up in the output are checked, which keeps the
            # set of exported rows identical to the post-download check below.
            if any(pd.isna(value) for value in (row['idea'], row['market_cap'], row['volume'])):
                print(f"Skipping {symbol} due to NaN values.")
                continue

            # Fetch historical stock data for the ticker
            stock_data = yf.download(symbol, start=start_date, end=end_date, interval="1d")

            # Throttle after every request, including those that return no data
            time.sleep(0.5)

            # Skip tickers with no data
            if stock_data.empty:
                print(f"No data found for {symbol}. Skipping.")
                continue

            # Calculate stock price development as the percentage change
            # between the first and the last close of the period
            closes = stock_data['Close']
            first_close = closes.iloc[0].item()
            last_close = closes.iloc[-1].item()
            stock_price_development = ((last_close - first_close) / first_close) * 100

            entry = [
                symbol,                   # Ticker
                row['idea'],              # Idea (company description)
                last_close,               # StockPrice: last close in the period
                stock_price_development,  # StockPriceDevelopment in percent
                row['market_cap'],        # MarketCap
                row['volume'],            # Volume
            ]

            # The downloaded prices themselves may still contain NaN
            if any(pd.isna(value) for value in entry):
                print(f"Skipping {symbol} due to NaN values.")
                continue

            records.append(entry)

        except Exception as e:
            # Best effort: report the failure and continue with the next ticker
            print(f"Failed to fetch data for {symbol}: {e}")

    # Delay between batches to avoid rate limiting
    time.sleep(2)

all_data = DataFrame(records, columns=output_columns)
all_data.to_csv('Data/company_stock_prices.csv', index=False)

print("Data collection complete. Final CSV file saved in 'Data/company_stock_prices.csv'")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Skipping YELP due to NaN values.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['YGF']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


No data found for YGF. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Skipping YORW due to NaN values.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['YS']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


No data found for YS. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Skipping YY due to NaN values.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZEV']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


No data found for ZEV. Skipping.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZFOX']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


No data found for ZFOX. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZING']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


No data found for ZING. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZJYL']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2020-01-01 -> 2023-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1577854800, endDate = 1672549200")')
[*********************100%***********************]  1 of 1 completed


No data found for ZJYL. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZLS']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


No data found for ZLS. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZURA']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  

No data found for ZURA. Skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ZYNE']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


No data found for ZYNE. Skipping.
Data collection complete. Final CSV file saved in 'Data/company_stock_prices.csv'


2. Now we can use the tickers (second column) from the imported dataset to fetch stock data on the companies via the yfinance API:

### Now that we have fetched the data, we need to calculate initial scores for our sample data.

## Initial Score Calculation
We have four properties of a company that should affect the final score of its initial idea:
- Current StockPrice
- StockPrice development over the last few years
- MarketCap
- Volume

The **current stock price** is an indicator of the relevance of an idea, although it is not enough on its own, since an idea might have been revolutionary decades ago, when the business established itself. Therefore the stock price development comes in handy, as it indicates whether the company has grown in recent times. New and innovative firms are more likely to have experienced growth in the past years. If we set this in relation to the market cap and the company's trading volume, we get a more detailed overview of how relevant and successful the core idea of a company would be nowadays.

So I have come up with the following relations:

### Initial Function: Creative Relevance Score

This function combines **recent growth**, **market stability**, and **market confidence** to create a balanced score for a company's relevance and success:

- **Recent Growth**: Emphasizes companies with strong recent stock price growth and trading volume, highlighting momentum in the market.
- **Market Presence**: Rewards companies with high stock price and market cap but adjusts for extremely high or low stock prices, which might signal volatility or overvaluation.
- **Market Confidence**: A combined measure of stock price and market cap, weighted by recent stock price growth to favor companies with both high value and growth.

**Formula**:
$$
\text{Score} = 10 \times \left( 0.4 \times \sqrt{\max(D_{norm} \times V_{norm}, 0)} + 0.3 \times \frac{M_{norm} \times S_{norm}}{1 + |S_{norm} - 0.5|} + 0.3 \times (S_{norm} + M_{norm}) \times (0.5 + 0.5 \times D_{norm}) \right)
$$

- **Normalization**: Each component is normalized to avoid extreme values, ensuring a balanced score between 0 and 10.
- **Safeguards**: The formula includes protections against invalid operations (e.g., taking the square root of negative values) by setting minimum values to prevent `NaN` or infinity results.

In [5]:
import numpy as np

# Define the creative relevance score function
def creative_relevance_score(stock_price, stock_price_dev, market_cap, volume,
                             S_max=1000, M_max=2e12, V_max=1e9, epsilon=1e-9):
    """Compute the "Creative Relevance Score" of a company.

    The score is a weighted sum of three terms (recent growth, market
    presence, market confidence) built from normalized inputs, scaled by 10
    and clamped to the range [0, 10].

    Args:
        stock_price: Current stock price; capped at ``S_max`` before scaling.
        stock_price_dev: Stock price development in percent; capped at 100
            and floored at 0 after scaling.
        market_cap: Market capitalization; log-scaled against ``M_max``.
        volume: Trading volume; log-scaled against ``V_max``.
        S_max: Stock price that maps to a normalized value of 1.
        M_max: Market cap whose logarithm is the scaling denominator.
        V_max: Volume whose logarithm is the scaling denominator.
        epsilon: Unused; kept so existing callers that pass it keep working.

    Returns:
        The score, clamped to [0, 10].
    """
    # Normalize each component
    S_norm = min(stock_price, S_max) / S_max
    D_norm = min(stock_price_dev, 100) / 100
    M_norm = np.log(market_cap + 1) / np.log(M_max)  # Adding 1 to avoid log(0)
    V_norm = np.log(volume + 1) / np.log(V_max)      # Adding 1 to avoid log(0)

    # Negative growth or volume terms are floored at zero
    D_norm = max(D_norm, 0)
    V_norm = max(V_norm, 0)

    # Calculate each component's contribution with safeguards
    recent_growth = np.sqrt(max(D_norm * V_norm, 0))  # Avoid sqrt of negative
    market_presence = (M_norm * S_norm) / (1 + abs(S_norm - 0.5))  # Adjusted for extreme prices
    market_confidence = (S_norm + M_norm) * (0.5 + 0.5 * D_norm)  # Weighted by stock growth

    # Final score with weights. No printing here: the function is meant to be
    # applied to many rows and should not produce output as a side effect.
    score = 10 * (0.4 * recent_growth + 0.3 * market_presence + 0.3 * market_confidence)
    return max(0, min(score, 10))  # Ensure the score is within [0, 10]

# Example usage: score one hypothetical company and show the result
example_company = {
    "stock_price": 150,
    "stock_price_dev": 50,
    "market_cap": 5e11,
    "volume": 1e8,
}
score = creative_relevance_score(**example_company)
print("Creative Relevance Score:", score)




Final Score: 5.461061674436522
Creative Relevance Score: 5.461061674436522


### Formula 1: Growth-Adjusted Market Confidence

This formula emphasizes **market stability** and **growth potential**:
- **Growth-Adjusted Market Presence**: Highlights high stock price and market cap, adjusted by growth to favor companies with balanced growth.
- **Recent Growth**: Captures recent market excitement by emphasizing strong growth and high trading volume.
- **Market Cap-Volume Interaction**: Rewards companies with both stability (high market cap) and high volume, indicating established interest.

**Formula**:
$$
\text{Score} = 10 \times \left( 0.5 \times \frac{S_{norm} \times M_{norm}}{1 + |D_{norm} - 0.5|} + 0.3 \times \sqrt{D_{norm} \times V_{norm}} + 0.2 \times (M_{norm} \times V_{norm})^2 \right)
$$

In [6]:
def alternative_score_1(stock_price, stock_price_dev, market_cap, volume,
                        S_max=1000, M_max=2e12, V_max=1e9):
    """Formula 1: growth-adjusted market confidence score, clamped to [0, 10].

    Args:
        stock_price: Current stock price; capped at ``S_max`` before scaling.
        stock_price_dev: Stock price development in percent; capped at 100.
        market_cap: Market capitalization; log-scaled against ``M_max``.
        volume: Trading volume; log-scaled against ``V_max``.
        S_max, M_max, V_max: Reference values used for normalization.

    Returns:
        10 times the weighted sum of the three terms below, clamped to [0, 10].
    """
    price_n = min(stock_price, S_max) / S_max
    growth_n = min(stock_price_dev, 100) / 100
    cap_n = np.log(market_cap + 1) / np.log(M_max)
    vol_n = np.log(volume + 1) / np.log(V_max)

    # Price x market cap, damped the further growth sits from 0.5
    presence_term = 0.5 * ((price_n * cap_n) / (1 + abs(growth_n - 0.5)))
    # Geometric mean of growth and volume; a negative product counts as zero
    growth_term = 0.3 * np.sqrt(max(growth_n * vol_n, 0))
    # Squared market cap x volume interaction
    cap_times_vol = cap_n * vol_n
    interaction_term = 0.2 * (cap_times_vol * cap_times_vol)

    raw_score = 10 * (presence_term + growth_term + interaction_term)

    # Clamp to [0, 10]
    capped = 10 if 10 < raw_score else raw_score
    return capped if capped > 0 else 0


### Formula 2: Stability and Interest Ratio

This formula uses **ratios** to balance **current stability** and **market interest**:
- **Stability-Interest Ratio**: Rewards companies with high stability (stock price and market cap), moderated by trading volume.
- **Growth Emphasis**: Directly highlights recent stock price growth as an indicator of market excitement.
- **Stability and Interest Interaction**: Rewards companies with both high market cap and volume, favoring established companies with active interest.

**Formula**:
$$
\text{Score} = 10 \times \left( 0.4 \times \frac{S_{norm} \times M_{norm}}{1 + |V_{norm} - 0.5|} + 0.4 \times D_{norm} + 0.2 \times \sqrt{M_{norm} \times V_{norm}} \right)
$$


In [7]:
def alternative_score_2(stock_price, stock_price_dev, market_cap, volume,
                        S_max=1000, M_max=2e12, V_max=1e9):
    """Formula 2: stability and interest ratio score, clamped to [0, 10].

    Args:
        stock_price: Current stock price; capped at ``S_max`` before scaling.
        stock_price_dev: Stock price development in percent; capped at 100.
        market_cap: Market capitalization; log-scaled against ``M_max``.
        volume: Trading volume; log-scaled against ``V_max``.
        S_max, M_max, V_max: Reference values used for normalization.

    Returns:
        10 times the weighted sum of the three terms below, clamped to [0, 10].
    """
    S_norm = min(stock_price, S_max) / S_max
    D_norm = min(stock_price_dev, 100) / 100
    M_norm = np.log(market_cap + 1) / np.log(M_max)
    V_norm = np.log(volume + 1) / np.log(V_max)

    # Price x market cap, damped the further normalized volume sits from 0.5
    stability_interest_ratio = (S_norm * M_norm) / (1 + abs(V_norm - 0.5))
    # Growth enters the score directly
    growth_emphasis = D_norm
    # Guard the square root like the other scoring functions do: a negative
    # product contributes zero instead of turning the whole score into NaN.
    stability_interest_interaction = np.sqrt(max(M_norm * V_norm, 0))

    score = 10 * (0.4 * stability_interest_ratio + 0.4 * growth_emphasis + 0.2 * stability_interest_interaction)
    return max(0, min(score, 10))


### Formula 3: Momentum and Legacy

This formula emphasizes **momentum** by weighing **recent growth** and **trading volume** more heavily, while acknowledging **market cap** and **current stock price** for stability:
- **Stock Price and Market Cap Weighting**: Balances stock price and market cap, representing a company’s legacy and current market position.
- **Momentum Factor**: Highlights companies with strong recent growth and volume, capturing momentum.
- **Adjusted Growth Factor**: Rewards high-growth companies but tempers the effect if market cap significantly deviates, ensuring a preference for stable growth.

**Formula**:
$$
\text{Score} = 10 \times \left( (0.25 \times S_{norm} + 0.25 \times M_{norm})^{1 + D_{norm}} + 0.3 \times \sqrt{D_{norm} \times V_{norm}} + 0.2 \times \frac{D_{norm} \times S_{norm}}{1 + |M_{norm} - 0.5|} \right)
$$


In [8]:
import math
def alternative_score_3(stock_price, stock_price_dev, market_cap, volume,
                        S_max=1000, M_max=2e12, V_max=1e9):
    """Formula 3: momentum and legacy score, clamped to [0, 10].

    Args:
        stock_price: Current stock price; capped at ``S_max`` before scaling.
        stock_price_dev: Stock price development in percent; capped at 100.
        market_cap: Market capitalization; log-scaled against ``M_max``.
        volume: Trading volume; log-scaled against ``V_max``.
        S_max, M_max, V_max: Reference values used for normalization.

    Returns:
        10 times the sum of the legacy, momentum and adjusted-growth terms,
        clamped to [0, 10].
    """
    price_n = min(stock_price, S_max) / S_max
    growth_n = min(stock_price_dev, 100) / 100
    cap_n = np.log(market_cap + 1) / np.log(M_max)
    vol_n = np.log(volume + 1) / np.log(V_max)

    # Geometric mean of growth and volume; a negative product counts as zero
    momentum_term = 0.3 * np.sqrt(max(growth_n * vol_n, 0))
    # Growth x price, damped the further normalized market cap sits from 0.5
    growth_term = 0.2 * ((growth_n * price_n) / (1 + abs(cap_n - 0.5)))

    # Equal-weight blend of price and market cap, raised to (1 + growth).
    # NOTE(review): growth is not floored at zero here, so a negative
    # development lowers the exponent below 1, which raises this term for a
    # base between 0 and 1 - confirm this is intended.
    legacy_base = 0.25 * price_n + 0.25 * cap_n
    legacy_term = math.pow(legacy_base, 1 + growth_n)

    raw_score = 10 * (legacy_term + momentum_term + growth_term)
    upper_bounded = min(raw_score, 10)
    return max(0, upper_bounded)


##### We will now use this function to create the final dataset, which pairs each idea (the company description) with its corresponding score.

In [9]:
import os
import pandas as pd
os.makedirs("Data", exist_ok=True)

# Load the enriched company data written by the collection step
input_filepath = 'Data/company_stock_prices.csv'
df = pd.read_csv(input_filepath)

# Coerce the scoring inputs to numbers; anything unparsable or missing becomes 0
numeric_columns = ['StockPrice', 'StockPriceDevelopment', 'MarketCap', 'Volume']
df[numeric_columns] = (
    df[numeric_columns]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Score every company with Formula 3
scores = df.apply(
    lambda company: alternative_score_3(
        company['StockPrice'],
        company['StockPriceDevelopment'],
        company['MarketCap'],
        company['Volume'],
    ),
    axis=1,
)

# Assemble the output columns: a running counter, the idea text and its score
output_data = {
    'index': range(len(df)),
    'Idea': df['Idea'],
    'Score': scores,
}
output_df = pd.DataFrame(output_data)

# Write the final training dataset to disk
output_filepath = 'Data/final_dataset.csv'
output_df.to_csv(output_filepath, index=False)

print(f"Data collection complete. Final CSV file saved in '{output_filepath}'")

Data collection complete. Final CSV file saved in 'Data/final_dataset.csv'
