In [1]:
import logging
import os
from datetime import datetime, timedelta

import mlflow
import pandas as pd
import yfinance as yf
from prefect import flow, task
from prefect.artifacts import create_table_artifact, create_markdown_artifact
from prefect.assets import materialize

# Ticker symbols used as the default `stock_list` of load_inference_data.
STOCK_LIST = ['AAPL', 'GOOGL', 'MSFT', 'NVDA', 'AMZN', 'META']
# Directory paths for raw and processed inference data (relative to the working directory).
INFER_RAW_DIR = "data/inference/raw"
INFER_PROCESSED_DIR = "data/inference/processed"
# File name of a joblib-serialized encoder — presumably fitted at training time; confirm.
ENCODER_PATH = "encoder.joblib"
# Read the clock once so TODAY and YESTERDAY are always exactly one day apart,
# even if this cell happens to run across midnight.
_NOW = datetime.today()
TODAY = _NOW.strftime("%Y-%m-%d")
YESTERDAY = (_NOW - timedelta(days=1)).strftime("%Y-%m-%d")

def load_inference_data(stock_list=STOCK_LIST, day=TODAY):
    """Download raw price data for inference, publish Prefect artifacts, export CSVs.

    Args:
        stock_list: Ticker symbols to download. Defaults to ``STOCK_LIST``.
        day: Value passed to ``yf.download`` as ``start``. Defaults to ``TODAY``,
            which is bound once when this function is defined.

    Returns:
        dict mapping each ticker to its DataFrame (ticker column level removed,
        index reset into a column). An empty dict when ``stock_list`` is empty.

    Side effects:
        - Creates a Prefect table artifact with the first five rows of the
          first ticker and a markdown artifact summarizing the download.
        - Writes ``<ticker>.csv`` for every ticker into ``INFER_RAW_DIR``.
    """
    log = logging.getLogger(__name__)

    # Nothing to download: return early instead of failing later on stock_list[0].
    if not stock_list:
        log.warning("No tickers requested; skipping download.")
        return {}

    log.info(f"Loading stock data at {day} for {len(stock_list)} stocks.")
    df_all = yf.download(stock_list, start=day, group_by="ticker", auto_adjust=True)
    all_data = {}

    for i, ticker in enumerate(stock_list):
        # Select this ticker's columns from the ticker-grouped download.
        df = df_all.xs(ticker, axis=1, level="Ticker")
        df.columns.name = None
        df = df.reset_index()
        all_data[ticker] = df

        if i == 0:
            # Only the first ticker is previewed as a table artifact. Dates are
            # converted to strings on a copy so the returned frame is untouched.
            preview = df.head().copy()
            preview["Date"] = preview["Date"].astype(str)
            create_table_artifact(
                key=f"{ticker.lower()}-raw-head",
                table=preview.to_dict(orient="records"),
                description=f"Top 5 rows of raw {ticker} data"
            )

    # The summary reports date range and shape of the first ticker only.
    first_df = all_data[stock_list[0]]
    markdown = f"""
# Raw Stock Data Summary
This document summarizes the raw stock data loaded from Yahoo Finance.
- **Date Range**: {first_df['Date'].min()} to {first_df['Date'].max()}
- **Number of Stocks**: {len(stock_list)}
- **Stocks Loaded**: {', '.join(stock_list)}
- **Shape of Data**: {first_df.shape}
"""
    create_markdown_artifact(
        key="raw-data-summary",
        markdown=markdown,
        description="Summary of raw stock data loaded from Yahoo Finance."
    )

    # Export one CSV per ticker into the raw inference directory.
    os.makedirs(INFER_RAW_DIR, exist_ok=True)
    for ticker, df in all_data.items():
        path = os.path.join(INFER_RAW_DIR, f"{ticker}.csv")
        df.to_csv(path, index=False)
        log.info(f"Exported {ticker} data to {path}")
    return all_data

In [5]:
# SECURITY: the tracking-store URI embeds database credentials, so it is read from
# the environment rather than committed in the notebook. MLFLOW_TRACKING_URI is the
# variable MLflow itself consults for the tracking URI. The password that was
# previously hard-coded in this cell should be rotated.
MLFLOW_DB_URI = os.environ.get("MLFLOW_TRACKING_URI")
if not MLFLOW_DB_URI:
    raise RuntimeError(
        "MLFLOW_TRACKING_URI is not set; export the MLflow tracking URI before running this cell."
    )

mlflow.set_tracking_uri(MLFLOW_DB_URI)  # or the URI of your own tracking server

# Run whose artifact location is inspected below.
run_id = 'a99dc45005404d79b5e7e64adc339070'

run = mlflow.get_run(run_id)
print("Artifact URI:", run.info.artifact_uri)

Artifact URI: /mnt/e/wsl_data/github/stock_forecast/mlruns/1/a99dc45005404d79b5e7e64adc339070/artifacts


In [16]:
# Display the experiments returned by the currently configured MLflow tracking backend.
mlflow.search_experiments()

[<Experiment: artifact_location='s3://zoomcamps-bucket/mlflow/zoomcamps-2', creation_time=1753287540043, experiment_id='7', last_update_time=1753287540043, lifecycle_stage='active', name='zoomcamps-2', tags={}>,
 <Experiment: artifact_location='s3://zoomcamps-bucket/mlflow/zoomcamps-1', creation_time=1753287268295, experiment_id='6', last_update_time=1753287268295, lifecycle_stage='active', name='zoomcamps-1', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1753278376119, experiment_id='0', last_update_time=1753278376119, lifecycle_stage='active', name='Default', tags={}>]

In [11]:
import mlflow
from urllib.parse import urlparse

def create_versioned_experiment(base_name="stock-prediction", artifact_root="s3://zoomcamps-bucket/mlflow"):
    """Create the next ``<base_name>-<N>`` MLflow experiment and return its name.

    N is one more than the highest numeric suffix among existing experiments
    named ``<base_name>-<digits>``; it is 1 when none exist.

    Args:
        base_name: Prefix shared by the versioned experiments.
        artifact_root: URI under which the experiment's artifact location
            ``<artifact_root>/<base_name>-<N>`` is placed. Defaults to the
            S3 location this notebook has always used.

    Returns:
        The name of the newly created experiment.
    """
    from mlflow.entities import ViewType

    # Include soft-deleted experiments: MLflow refuses to create an experiment
    # whose name is still held by a deleted one, so their versions must count.
    existing_experiments = mlflow.search_experiments(view_type=ViewType.ALL)

    prefix = f"{base_name}-"
    max_version = 0
    for exp in existing_experiments:
        if exp.name.startswith(prefix):
            suffix = exp.name[len(prefix):]  # the part after "<base_name>-"
            if suffix.isdigit():
                max_version = max(max_version, int(suffix))

    new_exp_name = f"{base_name}-{max_version + 1}"

    # Artifact location for the new experiment.
    artifact_location = f"{artifact_root.rstrip('/')}/{new_exp_name}"

    # Create the experiment.
    experiment_id = mlflow.create_experiment(
        name=new_exp_name,
        artifact_location=artifact_location
    )

    print(f"✅ Created experiment: {new_exp_name} (id: {experiment_id})")
    return new_exp_name

# Side effect: every run of this cell creates a new experiment in the tracking store.
create_versioned_experiment()

✅ Created experiment: stock-prediction-2 (id: 4)


'stock-prediction-2'

In [17]:
def get_latest_versioned_experiment(base_name="zoomcamps"):
    """Return the name of the highest-numbered ``<base_name>-<N>`` experiment.

    Only experiments returned by ``mlflow.search_experiments()`` whose name is
    ``base_name`` followed by ``-`` and digits are considered. Returns ``None``
    when no such experiment exists.
    """
    marker = f"{base_name}-"
    offset = len(marker)

    numbered = [
        (int(exp.name[offset:]), exp.name)
        for exp in mlflow.search_experiments()
        if exp.name.startswith(marker) and exp.name[offset:].isdigit()
    ]

    # max() keeps the first maximal entry, so equal versions resolve to the
    # experiment that appears first in the search results.
    _, winner = max(numbered, key=lambda pair: pair[0], default=(-1, None))
    return winner

# Look up the newest experiment for the default base name "zoomcamps".
get_latest_versioned_experiment()

'zoomcamps-2'

In [51]:
# NOTE(review): `stock` is not defined in any visible cell — presumably a yfinance
# Ticker object created in a removed or later cell; define it before this cell.
stock.quarterly_income_stmt.shape

(43, 7)

In [11]:
# Row labels (line items) of the quarterly income statement.
stock.quarterly_income_stmt.index

Index(['Tax Effect Of Unusual Items', 'Tax Rate For Calcs',
       'Normalized EBITDA', 'Total Unusual Items',
       'Total Unusual Items Excluding Goodwill',
       'Net Income From Continuing Operation Net Minority Interest',
       'Reconciled Depreciation', 'Reconciled Cost Of Revenue', 'EBITDA',
       'EBIT', 'Net Interest Income', 'Interest Expense', 'Interest Income',
       'Normalized Income',
       'Net Income From Continuing And Discontinued Operation',
       'Total Expenses', 'Total Operating Income As Reported',
       'Diluted Average Shares', 'Basic Average Shares', 'Diluted EPS',
       'Basic EPS', 'Diluted NI Availto Com Stockholders',
       'Net Income Common Stockholders', 'Net Income',
       'Net Income Including Noncontrolling Interests',
       'Net Income Continuous Operations', 'Tax Provision', 'Pretax Income',
       'Other Income Expense', 'Other Non Operating Income Expenses',
       'Special Income Charges', 'Write Off', 'Gain On Sale Of Security',
  

In [12]:
# Upcoming dividend/earnings dates and analyst earnings/revenue estimates (explained in the notes below).
stock.calendar

{'Dividend Date': datetime.date(2025, 3, 13),
 'Ex-Dividend Date': datetime.date(2025, 2, 20),
 'Earnings Date': [datetime.date(2025, 4, 24), datetime.date(2025, 4, 29)],
 'Earnings High': 3.56,
 'Earnings Low': 3.16,
 'Earnings Average': 3.22615,
 'Revenue High': 69897682310,
 'Revenue Low': 67140000000,
 'Revenue Average': 68534440260}

- Dividend Date: Ngày chi trả cổ tức tiếp theo.
- Ex-Dividend Date: Ngày giao dịch không hưởng quyền cổ tức.
- Earnings Date: Khoảng thời gian công ty dự kiến công bố báo cáo thu nhập. Đây là thời điểm quan trọng vì giá cổ phiếu thường biến động mạnh sau khi công ty công bố lợi nhuận.

- Earnings High: Dự báo EPS (Lợi nhuận trên mỗi cổ phiếu) cao nhất từ các nhà phân tích.
- Earnings Low: Dự báo EPS thấp nhất.
- Earnings Average: Trung bình dự báo EPS.

- Revenue High: Doanh thu dự kiến cao nhất.
- Revenue Low: Doanh thu dự kiến thấp nhất.
- Revenue Average: Trung bình dự báo doanh thu.

In [49]:
# stock.info

In [9]:
# Analyst price targets: current price plus high/low/mean/median targets (explained in the notes below).
stock.analyst_price_targets

{'current': 396.99,
 'high': 650.0,
 'low': 420.0,
 'mean': 508.051,
 'median': 500.0}

- current: Giá hiện tại của cổ phiếu trên thị trường.
- high: Mức giá cao nhất mà một số nhà phân tích dự đoán cổ phiếu có thể đạt được.
- low: Mức giá thấp nhất mà một số nhà phân tích dự đoán.
- mean: Mức giá trung bình từ dự báo của nhiều nhà phân tích.
- median: Mức giá trung vị trong dự báo của các nhà phân tích.
- Lưu ý: các mức giá mục tiêu trên là dự báo cho khoảng 6–12 tháng trong tương lai.

In [13]:
# EPS estimates per period (0q, +1q, 0y, +1y) now versus 7/30/60/90 days ago.
stock.eps_trend

Unnamed: 0_level_0,current,7daysAgo,30daysAgo,60daysAgo,90daysAgo
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0q,3.21293,3.21472,3.16529,3.17325,3.17383
+1q,3.33131,3.33346,3.29005,3.29549,3.2934
0y,13.15656,13.15656,13.02874,13.04388,13.08535
+1y,15.07137,15.07137,15.06393,15.08038,15.10941
