In [None]:
import re
import time
from datetime import date
from io import StringIO

import gdown
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
import requests
import yfinance as yf

In [2]:
pl.Config.set_tbl_rows(100)

polars.config.Config

## Question 1

In [None]:
def get_withdrawn_ipos() -> pd.DataFrame:
    url = "https://stockanalysis.com/ipos/withdrawn/"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/58.0.3029.110 Safari/537.3"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Wrap HTML text in StringIO to avoid deprecation warning
        # "Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object."
        html_io = StringIO(response.text)
        tables = pd.read_html(html_io)

        if not tables:
            raise ValueError("No tables found.")

        return tables[0]

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    except ValueError as ve:
        print(f"Data error: {ve}")
    except Exception as ex:
        print(f"Unexpected error: {ex}")

    return pd.DataFrame()

In [4]:
withdrawn_ipo_df = pl.from_pandas(get_withdrawn_ipos())

In [None]:
def classify_company(name):
    if name is None:
        return "Other"

    cleaned_name = re.sub(r"[^\w\s]", "", name)
    words = cleaned_name.lower().split()  # Normalize text and split into words

    if "acquisition" in words and ("corp" in words or "corporation" in words):
        return "Acq.Corp"
    elif "inc" in words or "incorporated" in words:
        return "Inc"
    elif "group" in words:
        return "Group"
    elif "holdings" in words:
        return "Holdings"
    elif "ltd" in words or "limited" in words:
        return "Ltd"
    else:
        return "Other"


In [None]:
withdrawn_ipo_df = withdrawn_ipo_df.with_columns(
    pl.col("Company Name")
    .map_elements(classify_company, return_dtype=pl.String)
    .alias("Company Class")
)

In [7]:
def parse_avg_price(price):
    if price == "-" or price is None:
        return None
    prices = re.findall(r"\d+\.\d+", price)  # Extract numerical values
    prices = list(map(float, prices))  # Convert to float
    return sum(prices) / len(prices) if prices else None  # Compute average


In [None]:
withdrawn_ipo_df = withdrawn_ipo_df.with_columns(
    pl.col("Price Range")
    .map_elements(parse_avg_price, return_dtype=pl.Float64)
    .alias("Avg. Price")
)


In [None]:
withdrawn_ipo_df = withdrawn_ipo_df.with_columns(
    pl.col("Shares Offered").cast(pl.Int64, strict=False)
)

In [10]:
withdrawn_ipo_df = withdrawn_ipo_df.with_columns(
    (pl.col("Shares Offered") * pl.col("Avg. Price")).alias("Withdrawn Value")
)

In [11]:
withdrawn_ipo_df.group_by("Company Class").agg(
    pl.col("Withdrawn Value").sum().alias("Total Withdrawn Value")
).sort("Total Withdrawn Value", descending=True)


Company Class,Total Withdrawn Value
str,f64
"""Acq.Corp""",4021000000.0
"""Inc""",2257200000.0
"""Other""",767919999.0
"""Ltd""",321730000.0
"""Holdings""",303000000.0
"""Group""",33787500.0


## Question 2

In [None]:
def get_ipos_by_year(year: int) -> pd.DataFrame:
    """
    Fetch IPO data for the given year from stockanalysis.com.
    """
    url = f"https://stockanalysis.com/ipos/{year}/"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/58.0.3029.110 Safari/537.3"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Wrap HTML text in StringIO to avoid deprecation warning
        # "Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object."
        html_io = StringIO(response.text)
        tables = pd.read_html(html_io)

        if not tables:
            raise ValueError(f"No tables found for year {year}.")

        return tables[0]

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    except ValueError as ve:
        print(f"Data error: {ve}")
    except Exception as ex:
        print(f"Unexpected error: {ex}")

    return pd.DataFrame()

In [13]:
ipo_df = pl.from_pandas(get_ipos_by_year(2024))

In [None]:
ipo_df = ipo_df.with_columns(pl.col("IPO Date").str.to_date(format="%b %d, %Y")).filter(
    pl.col("IPO Date") < date(2024, 6, 1), ~pl.col("IPO Price").str.contains("-")
)

In [16]:
ipo_df.head()

IPO Date,Symbol,Company Name,IPO Price,Current,Return
date,str,str,str,str,str
2024-05-23,"""BOW""","""Bowhead Specialty Holdings Inc…","""$17.00""","""$36.49""","""114.65%"""
2024-05-17,"""HDL""","""Super Hi International Holding…","""$19.56""","""$18.70""","""-4.40%"""
2024-05-17,"""RFAI""","""RF Acquisition Corp II""","""$10.00""","""$10.60""","""6.00%"""
2024-05-15,"""JDZG""","""JIADE Limited""","""$4.00""","""$0.26""","""-93.40%"""
2024-05-15,"""RAY""","""Raytech Holding Limited""","""$4.00""","""$1.23""","""-69.25%"""


In [17]:
tickers = ipo_df["Symbol"].to_list()

In [None]:
stocks_df_list = []
for ticker in tickers:
    ticker_obj = yf.Ticker(ticker)
    historyPrices = ticker_obj.history(period="max", interval="1d")
    df = pl.from_pandas(historyPrices, include_index=True)
    df = df.with_columns(pl.col("Date").cast(pl.Date))
    df = df.with_columns(
        pl.col("Date").dt.year().alias("Year"),
        pl.col("Date").dt.month().alias("Month"),
        pl.col("Date").dt.day().alias("Day"),
        pl.col("Date").dt.weekday().alias("Weekday"),
    )
    expressions = [
        (pl.col("Close") / pl.col("Close").shift(i)).alias(f"growth_{i}d")
        for i in [1, 3, 7, 30, 90, 252, 365]
    ]
    df = df.with_columns(expressions)
    df = df.with_columns(
        (pl.col("Close").shift(-5) / pl.col("Close")).alias("growth_future_30d")
    )
    expressions = [
        (pl.col("Close").shift(-21 * i) / pl.col("Close")).alias(f"future_growth_{i}m")
        for i in range(1, 13, 1)
    ]
    df = df.with_columns(expressions)
    df = df.with_columns(
        pl.col("Close").rolling_mean(window_size=10).alias("SMA10"),
        pl.col("Close").rolling_mean(window_size=20).alias("SMA20"),
    )
    df = df.with_columns(
        (pl.col("SMA10") > pl.col("SMA20"))
        .cast(pl.Int8)
        .alias("growing_moving_average")
    )
    df = df.with_columns(
        ((pl.col("High") - pl.col("Low")) / pl.col("Close")).alias(
            "high_minus_low_relative"
        )
    )
    df = df.with_columns(
        (pl.col("Close").rolling_std(window_size=30) * np.sqrt(252)).alias("volatility")
    )
    df = df.with_columns(
        (pl.col("growth_future_30d") > 1)
        .cast(pl.Int8)
        .alias("is_positive_growth_30d_future")
    )
    df = df.with_columns(
        ((pl.col("growth_252d") - 0.045) / pl.col("volatility")).alias("Sharpe")
    )
    df = df.with_columns(ticker=pl.lit(ticker))
    df = df.with_columns(min_date=pl.col("Date").min())
    stocks_df_list.append(df)

    time.sleep(1)

In [19]:
stocks_df = pl.concat(stocks_df_list)

In [None]:
stocks_df_filtered = stocks_df.filter(pl.col("Date") == date(2025, 6, 6))

In [None]:
stocks_df_filtered.select(pl.col(["Sharpe"])).filter(
    pl.col("Sharpe").is_not_null()
).select(pl.col("Sharpe").median())

Sharpe
f64
0.083768


In [None]:
stocks_df_filtered.select(pl.col(["ticker", "Sharpe"])).drop_nulls().sort(
    by="Sharpe", descending=True
).head(10)

ticker,Sharpe
str,f64
"""BKHA""",2.835668
"""JVSA""",2.041531
"""LEGT""",1.940267
"""IBAC""",1.637119
"""HLXB""",1.123493
"""MNDR""",0.974234
"""DYCQ""",0.969321
"""INTJ""",0.744512
"""JL""",0.566222
"""TRSG""",0.51808


In [None]:
stocks_df_filtered.select(pl.col(["ticker", "growth_252d"])).drop_nulls().sort(
    by="growth_252d", descending=True
).head(10)

ticker,growth_252d
str,f64
"""JL""",8.097413
"""ROMA""",6.156406
"""UMAC""",4.966533
"""NNE""",4.655224
"""RBRK""",3.184065
"""AHR""",2.483097
"""AS""",2.478203
"""MRX""",2.300384
"""RDDT""",2.225505
"""MTEN""",2.210432


## Question 3

In [None]:
stocks_df.filter(pl.col("Date") == pl.col("min_date")).select(
    cs.starts_with("future_growth_")
).describe()[2].drop("statistic").unpivot(
    variable_name="horizon", value_name="growth"
).sort("growth", descending=True)

horizon,growth
str,f64
"""future_growth_2m""",0.940544
"""future_growth_1m""",0.927259
"""future_growth_10m""",0.917945
"""future_growth_12m""",0.900861
"""future_growth_11m""",0.882534
"""future_growth_9m""",0.881779
"""future_growth_6m""",0.864185
"""future_growth_7m""",0.847149
"""future_growth_3m""",0.833824
"""future_growth_8m""",0.832983


## Question 4

In [None]:
file_id = "1grCTCzMZKY5sJRtdbLVCXg8JXA8VPyg-"
gdown.download(f"https://drive.google.com/uc?id={file_id}", "data.parquet", quiet=False)
df_polars = pl.read_parquet("data.parquet", use_pyarrow=True).with_columns(
    pl.col(["Month", "Date", "Quarter"]).cast(pl.Date)
)


Downloading...
From (original): https://drive.google.com/uc?id=1grCTCzMZKY5sJRtdbLVCXg8JXA8VPyg-
From (redirected): https://drive.google.com/uc?id=1grCTCzMZKY5sJRtdbLVCXg8JXA8VPyg-&confirm=t&uuid=188101ac-c605-4758-86e1-d5dc4731b14d
To: c:\Users\kerim\github_repo\stock-markets-analytics-zoomcamp\homeworks\2025\02-dataframe-analysis\data.parquet
100%|██████████| 130M/130M [00:09<00:00, 14.1MB/s] 


In [None]:
rsi_threshold = 25
selected_df = df_polars.filter(
    (pl.col("rsi") < rsi_threshold)
    & (pl.col("Date") >= date(2000, 1, 1))
    & (pl.col("Date") <= date(2025, 6, 1))
)


In [None]:
result = selected_df.select(
    (1000 * (pl.col("growth_future_30d") - 1)).sum().alias("net_income")
)
print(f"Net income is ${result.item():.2f}")

Net income is $24295.52
