# Install Packages

In [0]:
!pip install ta

# Import Packages

In [0]:
import sys
sys.path.append("/Workspace/Shared/lib/")
import os
import logging
import time
import requests
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import expr
from params import get_env, get_catalog, get_schema, get_table
from ta import * # add_all_ta_features
from ta.utils import dropna
import ta

# Logging

In [0]:
# Root logger: INFO and above, written to stdout with timestamp, logger name
# and level. force=True is passed so this configuration is applied even if
# logging was already configured earlier in the session.
logging.basicConfig(
    force=True,
    stream=sys.stdout,
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Params

In [0]:
# Resolve env, catalog suffix, catalog, schema and table through the helpers
# imported from `params`.
env, catalog_suffix = get_env()
catalog, schema, table = get_catalog(), get_schema(), get_table()

# Echo every resolved setting as "name = value".
for _label, _value in (
    ("env", env),
    ("catalog_suffix", catalog_suffix),
    ("catalog", catalog),
    ("schema", schema),
    ("table", table),
):
    print(f"{_label} = {_value}")

# Single Example

In [0]:
# Pull the NVDA rows of the pricing table, ordered by date, and convert them
# to a pandas DataFrame on the driver.
df_nvda = spark.sql(f"select * FROM featlib{catalog_suffix}.components.pricing where act_symbol = 'NVDA' order by date")
df_nvda_pd = df_nvda.toPandas()

# Add all `ta` features WITHOUT filling NaN values (fillna=False), then drop
# the raw OHLCV input columns so only the ID columns and features remain.
# The function is referenced through the `ta` module so its origin is explicit
# instead of depending on `from ta import *`.
df_nvda_pd_feat = ta.add_all_ta_features(
    df_nvda_pd,
    open="open",
    high="high",
    low="low",
    close="close",
    volume="volume",
    fillna=False,
).drop(['open', 'high', 'low', 'close', 'volume'], axis=1)

# Use the notebook's display() function (as the following cells do) rather
# than calling .display() as a method on a pandas DataFrame.
display(df_nvda_pd_feat)

In [0]:
# Reshape the wide NVDA feature frame to long format: `date` and `act_symbol`
# stay as identifiers, every other column becomes an (id, values) row.
df_nvda_pd_feat_melted = df_nvda_pd_feat.melt(
    id_vars=["date", "act_symbol"],
    var_name="id",
    value_name="values",
)

display(df_nvda_pd_feat_melted)

In [0]:
# Turn the pandas NVDA feature frame into a Spark DataFrame and show it.
df_nvda_pd_feat_spark = spark.createDataFrame(df_nvda_pd_feat)

display(df_nvda_pd_feat_spark)

In [0]:
# Render the Spark schema of the NVDA feature frame as "name type, name type, ...".
schema_str = ", ".join(
    f"{field.name} {field.dataType.simpleString()}"
    for field in df_nvda_pd_feat_spark.schema.fields
)

# Wrap it as a ready-to-paste `schema="..."` keyword argument. It is stored
# under its own name: assigning it to `schema` would overwrite the value
# obtained from get_schema() in the Params cell.
schema_kwarg = f"schema=\"{schema_str}\""
schema_kwarg

In [0]:
# Unpivot the wide NVDA feature frame into (date, act_symbol, id, value) rows.

# 'date' and 'act_symbol' are the ID columns; every other column is unpivoted.
id_columns = ["date", "act_symbol"]
columns_to_unpivot = [col for col in df_nvda_pd_feat_spark.columns if col not in id_columns]

# Build the argument list for Spark SQL's stack():
#   stack(n, 'col1', `col1`, 'col2', `col2`, ...)
# The quoted string is the label emitted in `id`; the backtick-quoted
# identifier is the column whose value goes into `value`. Backticks keep the
# expression valid even if a column name is not a plain SQL identifier.
stack_expression = ", ".join([f"'{col}', `{col}`" for col in columns_to_unpivot])
num_columns = len(columns_to_unpivot)

# Apply the stack function
df_nvda_pd_feat_spark_pivoted = df_nvda_pd_feat_spark.selectExpr(
    *id_columns,
    f"stack({num_columns}, {stack_expression}) as (id, value)"
)

display(df_nvda_pd_feat_spark_pivoted)

# Create Features

In [0]:
# Full pricing table for every symbol, ordered by symbol and then date.
query = f"select * FROM featlib{catalog_suffix}.components.pricing order by act_symbol, date"

# Spark DataFrame over the query result; reused by the applyInPandas cells below.
df = spark.sql(query)

# pandas copy of the same data on the driver; used by the pandas groupby cell below.
df_pd = df.toPandas()

In [0]:
# Show the pandas copy of the pricing table created in the previous cell.
df_pd

In [0]:
def process_symbol_group(group):
    """Compute Williams %R (14-row lookback) for one symbol's rows.

    Args:
        group: pandas DataFrame holding the rows of a single ``act_symbol``
            group; must contain ``high``, ``low`` and ``close`` columns.

    Returns:
        The Series produced by ``WilliamsRIndicator(...).williams_r()`` for
        this group only.
    """
    # Read the prices from `group`, not from the global `df_pd`: using the
    # global would compute the indicator over the whole table once per symbol.
    return ta.momentum.WilliamsRIndicator(
        high=group["high"],
        low=group["low"],
        close=group["close"],
        lbp=14,
        fillna=False
    ).williams_r()

# Run process_symbol_group once per act_symbol group and combine the results;
# no minimum-row filter is applied to the groups here.
_wr_by_symbol = df_pd.groupby("act_symbol", group_keys=False).apply(process_symbol_group)

# Discard the resulting index in favour of a fresh 0..n-1 index.
df_pd_wr = _wr_by_symbol.reset_index(drop=True)

df_pd_wr

In [0]:
from pyspark.sql.functions import pandas_udf
import pandas as pd
import ta

def process_symbol_group(group: pd.DataFrame) -> pd.DataFrame:
    """Compute Williams %R (14-row lookback) for one symbol's rows.

    Intended for ``groupby("act_symbol").applyInPandas``.

    Args:
        group: rows of a single symbol; must contain ``date``, ``act_symbol``,
            ``high``, ``low`` and ``close`` columns.

    Returns:
        A new DataFrame with columns ``date``, ``act_symbol`` and
        ``williams_r``, sorted by ``date``. The input frame is not modified.
    """
    # Williams %R is a rolling-window indicator, so row order matters. Sort
    # explicitly rather than relying on the order in which Spark delivers the
    # group's rows, and reset to a clean 0..n-1 index.
    ordered = group.sort_values("date").reset_index(drop=True)

    williams_r = ta.momentum.WilliamsRIndicator(
        high=ordered["high"],
        low=ordered["low"],
        close=ordered["close"],
        lbp=14,
        fillna=False
    ).williams_r()

    # assign() returns a new frame instead of adding a column to the input.
    return ordered.assign(williams_r=williams_r)[['date', 'act_symbol', 'williams_r']]

# Run process_symbol_group once per act_symbol group on Spark; the returned
# pandas frames must match this output schema.
_wr_output_schema = "date date, act_symbol string, williams_r double"
df_spark_wr = df.groupby("act_symbol").applyInPandas(process_symbol_group, schema=_wr_output_schema)

display(df_spark_wr)

In [0]:
from pyspark.sql.functions import pandas_udf
import pandas as pd
import ta

def process_symbol_group(group: pd.DataFrame) -> pd.DataFrame:
    """Compute every `ta` feature for one symbol's rows.

    Intended for ``groupby("act_symbol").applyInPandas``.

    Args:
        group: rows of a single symbol; must contain ``date``, ``act_symbol``,
            ``open``, ``high``, ``low``, ``close`` and ``volume`` columns.

    Returns:
        A DataFrame with ``date``, ``act_symbol`` and the feature columns added
        by ``ta.add_all_ta_features``; the raw OHLCV columns are dropped. Rows
        are sorted by ``date``. The input frame is not modified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    price_columns = ['open', 'high', 'low', 'close', 'volume']

    # Select the inputs by NAME. Overwriting `group.columns` positionally would
    # silently relabel the data if the table's column order ever differed.
    # Sort by date because the indicators are order-dependent and the order in
    # which Spark delivers a group's rows should not be relied on.
    ordered = (
        group[["date", "act_symbol", *price_columns]]
        .sort_values("date")
        .reset_index(drop=True)
    )

    return ta.add_all_ta_features(
        ordered, open="open", high="high", low="low", close="close", volume="volume", fillna=False
    ).drop(price_columns, axis=1)

# Output schema for the per-symbol TA feature frames: the two ID columns
# followed by every feature column, all typed double. It is written as adjacent
# string literals inside parentheses so the long DDL string can span several
# lines (a plain quoted string cannot contain a line break).
ta_output_schema = (
    "date date, act_symbol string, "
    # volume
    "volume_adi double, volume_obv double, volume_cmf double, volume_fi double, volume_em double, "
    "volume_sma_em double, volume_vpt double, volume_vwap double, volume_mfi double, volume_nvi double, "
    # volatility
    "volatility_bbm double, volatility_bbh double, volatility_bbl double, volatility_bbw double, "
    "volatility_bbp double, volatility_bbhi double, volatility_bbli double, "
    "volatility_kcc double, volatility_kch double, volatility_kcl double, volatility_kcw double, "
    "volatility_kcp double, volatility_kchi double, volatility_kcli double, "
    "volatility_dcl double, volatility_dch double, volatility_dcm double, volatility_dcw double, "
    "volatility_dcp double, volatility_atr double, volatility_ui double, "
    # trend
    "trend_macd double, trend_macd_signal double, trend_macd_diff double, "
    "trend_sma_fast double, trend_sma_slow double, trend_ema_fast double, trend_ema_slow double, "
    "trend_vortex_ind_pos double, trend_vortex_ind_neg double, trend_vortex_ind_diff double, "
    "trend_trix double, trend_mass_index double, trend_dpo double, "
    "trend_kst double, trend_kst_sig double, trend_kst_diff double, "
    "trend_ichimoku_conv double, trend_ichimoku_base double, trend_ichimoku_a double, trend_ichimoku_b double, "
    "trend_stc double, trend_adx double, trend_adx_pos double, trend_adx_neg double, trend_cci double, "
    "trend_visual_ichimoku_a double, trend_visual_ichimoku_b double, "
    "trend_aroon_up double, trend_aroon_down double, trend_aroon_ind double, "
    "trend_psar_up double, trend_psar_down double, trend_psar_up_indicator double, trend_psar_down_indicator double, "
    # momentum
    "momentum_rsi double, momentum_stoch_rsi double, momentum_stoch_rsi_k double, momentum_stoch_rsi_d double, "
    "momentum_tsi double, momentum_uo double, momentum_stoch double, momentum_stoch_signal double, "
    "momentum_wr double, momentum_ao double, momentum_roc double, "
    "momentum_ppo double, momentum_ppo_signal double, momentum_ppo_hist double, "
    "momentum_pvo double, momentum_pvo_signal double, momentum_pvo_hist double, momentum_kama double, "
    # others
    "others_dr double, others_dlr double, others_cr double"
)

# Run process_symbol_group once per act_symbol group on Spark.
df_spark_ta = df.groupby("act_symbol").applyInPandas(
    process_symbol_group,
    schema=ta_output_schema
)
display(df_spark_ta)

In [0]:
# Trigger a Spark job over the feature frame without pulling every row back to
# the driver: collect() would return the entire wide table as a list of Row
# objects and print it as the cell output.
# NOTE(review): presumably this cell exists to surface errors from the grouped
# pandas function; switch to display(df_spark_ta.limit(n)) if rows are needed.
df_spark_ta.count()

In [0]:
# Keep only the NVDA rows of the Spark Williams %R result.
df_nvda = df_spark_wr.where(df_spark_wr.act_symbol == "NVDA")

display(df_nvda)

In [0]:
# Inner-join the Spark-computed NVDA Williams %R rows with the pandas-computed
# NVDA feature frame on (date, act_symbol) so the values sit side by side.
_join_keys = ["date", "act_symbol"]
comparison_df = df_nvda.join(df_nvda_pd_feat_spark, on=_join_keys, how="inner")

display(comparison_df)

In [0]:
# Persist the wide (one column per feature) frame.
# - The catalog is derived from `catalog_suffix`, like every other table
#   reference in this notebook, instead of being hardcoded to featlib_dev.
# - mode("overwrite") makes the cell re-runnable; the default mode errors when
#   the table already exists.
# - NOTE(review): written to `ta_wide` rather than `ta_raw`, because the last
#   cell of this notebook overwrites `features.ta_raw` with the unpivoted
#   (date, act_symbol, id, value) frame, whose schema differs. Confirm the name.
df_spark_ta.write.mode("overwrite").saveAsTable(f"featlib{catalog_suffix}.features.ta_wide")

In [0]:
# Show the pandas Williams %R result computed in the groupby/apply cell above.
df_pd_wr

In [0]:
# Wide feature frame used by the Unpivot section.
# `df_pd_feat` is only defined in commented-out code, so converting it would
# raise NameError on a fresh run. The same content (date, act_symbol and one
# column per TA feature) already exists as the Spark DataFrame `df_spark_ta`
# produced by the applyInPandas cell above, so that frame is reused.
df_feat = df_spark_ta

# Display
df_feat.display()

# Unpivot

In [0]:
# Unpivot the wide feature frame into (date, act_symbol, id, value) rows.

# 'date' and 'act_symbol' are the ID columns; every other column is unpivoted.
id_columns = ["date", "act_symbol"]
columns_to_unpivot = [col for col in df_feat.columns if col not in id_columns]

# Build the argument list for Spark SQL's stack():
#   stack(n, 'col1', `col1`, 'col2', `col2`, ...)
# The quoted string is the label emitted in `id`; the backtick-quoted
# identifier is the column whose value goes into `value`. Backticks keep the
# expression valid even if a column name is not a plain SQL identifier.
stack_expression = ", ".join([f"'{col}', `{col}`" for col in columns_to_unpivot])
num_columns = len(columns_to_unpivot)

# Apply the stack function
df_feat_unpivot = df_feat.selectExpr(
    *id_columns,
    f"stack({num_columns}, {stack_expression}) as (id, value)"
)

# Display
df_feat_unpivot.display()

In [0]:
# Overwrite the environment-specific features.ta_raw table with the unpivoted frame.
_target_table = f"featlib{catalog_suffix}.features.ta_raw"
df_feat_unpivot.write.mode("overwrite").saveAsTable(_target_table)