In [1]:
import polars as pl
import numpy as np
import pandas as pd
from mars.analysis.profiler import MarsDataProfiler
from mars.utils.logger import set_log_level

# Set the log level
set_log_level("INFO")

def get_demo_data(rows: int = 10000, seed: int = 1234) -> "pl.DataFrame":
    """Build a synthetic demo dataset containing deliberately "dirty" features.

    The frame is used to check the profiler's red/green alerting logic. A
    private legacy ``RandomState`` is used instead of ``np.random.seed`` so the
    process-wide NumPy random state is left untouched; for a given seed it
    yields the same values as seeding the global generator did.

    Columns:
        month: one of "202401", "202402", "202403", "202404", chosen at random.
        good_feature: normal(100, 10) -- the clean reference feature.
        feat_high_missing: normal(0, 1) with roughly 60% of the rows
            overwritten by the sentinel -999.
        feat_quasi_constant: 0.0 everywhere except roughly 0.1% of rows set to 1.0.
        feat_user_id: one distinct string per row ("user_0", "user_1", ...).
        feat_unstable: normal(100, 5), multiplied by 10 for rows whose month
            is "202404".

    Args:
        rows: Number of rows to generate. Defaults to 10000.
        seed: Seed for the random generator. Defaults to 1234.

    Returns:
        A polars DataFrame with ``rows`` rows and the six columns listed above.
    """
    # Local generator: same stream as np.random.seed(seed) + np.random.*,
    # but without the global side effect. The order of the draws below must
    # not change, otherwise the generated values change.
    rng = np.random.RandomState(seed)

    # 1. Base data
    data = {
        "month": rng.choice(["202401", "202402", "202403", "202404"], size=rows),
        "good_feature": rng.normal(100, 10, size=rows),  # clean feature
    }

    # 2. Inject dirty data (exercises the DQ alerts)

    # [Case A] High missing rate (> 50%) -> the missing-rate view should be all red
    bad_missing = rng.normal(0, 1, size=rows)
    bad_missing[rng.rand(rows) < 0.6] = -999  # -999 is the "missing" sentinel
    data["feat_high_missing"] = bad_missing

    # [Case B] Quasi-constant (top-1 ratio > 99%) -> the top-1 view should be all red
    # Simulates a column where ~99.9% of rows are 0 and only ~0.1% are 1.
    quasi = np.zeros(rows)
    quasi[rng.rand(rows) < 0.001] = 1
    data["feat_quasi_constant"] = quasi

    # [Case C] High cardinality (unique rate = 100%) -> the unique view should be all blue
    # Simulates a user-ID column.
    data["feat_user_id"] = [f"user_{i}" for i in range(rows)]

    # 3. Inject an unstable feature (exercises the stats alerts)

    # [Case D] Violent jump (stability / CV blows up) -> the stats view should show a long red bar
    # First three months have mean ~100; the fourth month jumps to ~1000.
    unstable = rng.normal(100, 5, size=rows)
    mask_apr = data["month"] == "202404"  # element-wise comparison on the NumPy array
    unstable[mask_apr] *= 10
    data["feat_unstable"] = unstable

    return pl.DataFrame(data)

# Generate the demo data
# NOTE(review): the non-ASCII text in the two messages below looks mis-encoded
# (mojibake) in this copy of the notebook -- verify the file's encoding.
df = get_demo_data()
print(f"‚úÖ Demo Data Ready: {df.shape}")
print("ÂåÖÂê´ÊµãËØïÂú∫ÊôØ: Ê≠£Â∏∏, È´òÁº∫Â§±, ÂáÜÂ∏∏Èáè(Top1), È´òÂü∫Êï∞(ID), ÂâßÁÉàÊäñÂä®(Unstable)")
# Last expression of the cell: show the first rows of the frame.
df.head()

‚úÖ Demo Data Ready: (10000, 6)
ÂåÖÂê´ÊµãËØïÂú∫ÊôØ: Ê≠£Â∏∏, È´òÁº∫Â§±, ÂáÜÂ∏∏Èáè(Top1), È´òÂü∫Êï∞(ID), ÂâßÁÉàÊäñÂä®(Unstable)


month,good_feature,feat_high_missing,feat_quasi_constant,feat_user_id,feat_unstable
str,f64,f64,f64,str,f64
"""202404""",95.73776,-0.051205,0.0,"""user_0""",950.957766
"""202404""",109.295043,-999.0,0.0,"""user_1""",983.240373
"""202403""",104.267144,-0.809159,0.0,"""user_2""",101.073494
"""202402""",91.648176,0.277759,0.0,"""user_3""",103.256861
"""202401""",110.037528,1.419918,0.0,"""user_4""",98.321816


In [2]:
# Initialize the profiler (declare -999 as a missing value)
# NOTE(review): the overview output further down still reports min = -999 and
# mean ~ -597 for feat_high_missing, so the sentinel appears to be counted as
# missing but not excluded from the summary statistics -- confirm this is intended.
profiler = MarsDataProfiler(df, custom_missing_values=[-999])

# Run the profile grouped by month.
# We mainly care about 'mean' (mean drift) and 'max' (extreme-value jumps).
report = profiler.generate_profile(profile_by="month")

[32m[MARS] 2026-01-10 00:50:18 - INFO - Starting profiling (Group: month, Sparkline: True)...[0m
[32m[MARS] 2026-01-10 00:50:18 - INFO - Profile generated successfully.[0m
[32m[MARS] 2026-01-10 00:50:18 - INFO - ‚è±Ô∏è [MarsDataProfiler.generate_profile] finished in 0.0208s[0m


In [3]:
# Bare expression: let the notebook render the report object as the cell output.
report

In [4]:
# Per-feature overview: dtype, distribution sparkline, DQ rates and summary statistics.
report.show_overview()

feature,dtype,distribution,missing_rate,zeros_rate,unique_rate,top1_ratio,mean,std,min,max,median,p25,p75
feat_high_missing,Float64,__‚ñÉ‚ñÜ‚ñà‚ñÖ‚ñÇ_,59.77%,0.00%,40.24%,59.77%,-597.09,489.91,-999.0,3.28,-999.0,-999.0,-0.28
feat_quasi_constant,Float64,‚ñà_______,0.00%,99.88%,0.02%,99.88%,0.0,0.03,0.0,1.0,0.0,0.0,0.0
feat_unstable,Float64,‚ñà_____‚ñÇ_,0.00%,0.00%,100.00%,0.01%,327.97,392.52,81.62,1166.73,102.13,97.88,887.05
good_feature,Float64,__‚ñÉ‚ñá‚ñà‚ñÑ__,0.00%,0.00%,100.00%,0.01%,100.08,10.01,61.41,136.67,100.12,93.34,106.85
feat_user_id,String,,0.00%,0.00%,100.00%,0.01%,,,,,,,
month,String,,0.00%,0.00%,0.04%,25.31%,,,,,,,


In [5]:
# Missing rate per feature, one column per month plus an overall total.
report.show_dq("missing")

feature,dtype,202401,202402,202403,202404,total
feat_high_missing,Float64,60.67%,58.39%,60.43%,59.62%,59.77%
feat_quasi_constant,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
feat_unstable,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
good_feature,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
feat_user_id,String,0.00%,0.00%,0.00%,0.00%,0.00%


In [6]:
# Share of zero values per feature, one column per month plus an overall total.
report.show_dq("zeros")

feature,dtype,202401,202402,202403,202404,total
feat_high_missing,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
feat_quasi_constant,Float64,99.96%,99.92%,99.76%,99.88%,99.88%
feat_unstable,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
good_feature,Float64,0.00%,0.00%,0.00%,0.00%,0.00%
feat_user_id,String,0.00%,0.00%,0.00%,0.00%,0.00%


In [7]:
# Unique-value rate per feature, one column per month plus an overall total.
report.show_dq("unique")

feature,dtype,202401,202402,202403,202404,total
feat_high_missing,Float64,39.37%,41.65%,39.61%,40.42%,40.24%
feat_quasi_constant,Float64,0.08%,0.08%,0.08%,0.08%,0.02%
feat_unstable,Float64,100.00%,100.00%,100.00%,100.00%,100.00%
good_feature,Float64,100.00%,100.00%,100.00%,100.00%,100.00%
feat_user_id,String,100.00%,100.00%,100.00%,100.00%,100.00%


In [8]:
# Share of the most frequent value (top-1 ratio) per feature, by month plus total.
report.show_dq("top1")

feature,dtype,202401,202402,202403,202404,total
feat_high_missing,Float64,60.67%,58.39%,60.43%,59.62%,59.77%
feat_quasi_constant,Float64,99.96%,99.92%,99.76%,99.88%,99.88%
feat_unstable,Float64,0.04%,0.04%,0.04%,0.04%,0.01%
good_feature,Float64,0.04%,0.04%,0.04%,0.04%,0.01%
feat_user_id,String,0.04%,0.04%,0.04%,0.04%,0.01%


In [9]:
# Monthly mean per feature; the output also carries group_var / group_cv columns
# (presumably the variance and coefficient of variation across months -- confirm).
report.show_trend("mean")

feature,dtype,202401,202402,202403,202404,total,group_var,group_cv
feat_high_missing,Float64,-606.08,-583.33,-603.73,-595.59,-597.09,105.5214,0.0172
feat_quasi_constant,Float64,0.0,0.0,0.0,0.0,0.0,0.0,0.7247
feat_unstable,Float64,100.01,99.98,99.97,1000.76,327.97,202850.0319,1.385
good_feature,Float64,100.05,100.26,99.71,100.27,100.08,0.0691,0.0026
feat_user_id,String,,,,,,0.0,0.0


In [10]:
# Monthly maximum per feature, with the same group_var / group_cv columns.
report.show_trend("max")

feature,dtype,202401,202402,202403,202404,total,group_var,group_cv
feat_high_missing,Float64,2.9,3.28,2.97,2.77,3.28,0.0479,0.0735
feat_quasi_constant,Float64,1.0,1.0,1.0,1.0,1.0,0.0,0.0
feat_unstable,Float64,117.74,118.94,116.04,1166.73,1166.73,275184.99,1.381
good_feature,Float64,132.88,136.67,130.01,129.28,136.67,11.2464,0.0254
feat_user_id,String,,,,,,0.0,0.0


In [11]:
# Monthly minimum per feature, with the same group_var / group_cv columns.
report.show_trend("min")

feature,dtype,202401,202402,202403,202404,total,group_var,group_cv
feat_high_missing,Float64,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0
feat_quasi_constant,Float64,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feat_unstable,Float64,81.62,83.3,82.32,802.13,81.62,129499.179,1.3717
good_feature,Float64,64.01,61.41,65.65,61.82,61.41,3.9172,0.0313
feat_user_id,String,,,,,,0.0,0.0


In [12]:
# Export the report to an Excel file; the relative path resolves against the
# current working directory.
report.write_excel("demo_report.xlsx")

[32m[MARS] 2026-01-10 00:50:18 - INFO - üìä Exporting to demo_report.xlsx...[0m
[32m[MARS] 2026-01-10 00:50:18 - INFO - ‚úÖ Done.[0m


In [13]:
# Fetch the raw profile data; the captured output starts with "(" followed by a
# 6 x 14 frame, so this appears to return a tuple of frames -- confirm.
report.get_profile_data()

(shape: (6, 14)
 ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
 ‚îÇ feature    ‚îÜ dtype   ‚îÜ distribut ‚îÜ missing_r ‚îÜ ‚Ä¶ ‚îÜ max       ‚îÜ median    ‚îÜ p25       ‚îÜ p75       ‚îÇ
 ‚îÇ ---        ‚îÜ ---     ‚îÜ ion       ‚îÜ ate       ‚îÜ   ‚îÜ ---       ‚îÜ ---       ‚îÜ ---       ‚îÜ ---       ‚îÇ
 ‚îÇ str        ‚îÜ str     ‚îÜ ---       ‚îÜ ---       ‚îÜ   ‚îÜ f64       ‚îÜ f64       ‚îÜ f64       ‚îÜ f64       ‚îÇ
 ‚îÇ            ‚îÜ         ‚îÜ str       ‚îÜ f64       ‚îÜ   ‚îÜ           ‚îÜ           ‚îÜ           ‚îÜ           ‚îÇ
 ‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï

In [14]:
# Pseudo-code sketch of the feature-screening rules
def filter_bad_features(
    df,
    top1_ratio=0.96,
    unique_rate=1.0,
    is_string_type=True,
    top1_threshold=0.95,
    unique_threshold=0.95,
):
    """Decide whether a feature should be dropped, from its profile metrics.

    The metrics used to be hard-coded inside the body, which made the first
    rule always fire and left the remaining branches unreachable. They are now
    keyword parameters whose defaults reproduce the original result, so
    ``filter_bad_features(df)`` still returns ``"DROP (Quasi-Constant)"``.

    Args:
        df: Not used by the rules; kept so existing ``filter_bad_features(df)``
            calls keep working.
        top1_ratio: Share of rows taken by the most frequent value.
        unique_rate: Share of distinct values in the column.
        is_string_type: Whether the column is a string column.
        top1_threshold: A ``top1_ratio`` strictly above this drops the column.
        unique_threshold: A ``unique_rate`` strictly above this drops a
            string column.

    Returns:
        ``"DROP (Quasi-Constant)"``, ``"DROP (ID Column)"`` or ``"KEEP"``.
    """
    # 1. Drop "boring" columns (quasi-constant).
    # If the top-1 ratio is too high, drop it regardless of the column type.
    if top1_ratio > top1_threshold:
        return "DROP (Quasi-Constant)"

    # 2. Drop "messy" ID-like columns (high cardinality).
    # Only when the column is a string AND almost every value is distinct.
    # (A high unique rate on a numeric column is a good thing -- never drop it.)
    if is_string_type and unique_rate > unique_threshold:
        return "DROP (ID Column)"

    return "KEEP"