## 1. Import

In [58]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import plotly.express as px
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [59]:
# 페어쌍으로 묶인 아이템 시각화 비교 함수
def visualize_(df, pairs, ref_item, target_col=None):
    if target_col is None:
        target_col = np.append(["ym",ref_item],pairs[pairs["leading_item_id"] == ref_item]["following_item_id"].values)
    uss_df = df[target_col].copy()
    uss_long = uss_df.melt(id_vars="ym", var_name="item", value_name="value")
    bench = (
        uss_long.loc[uss_long["item"] == ref_item, ["ym", "value"]]
                .rename(columns={"value": "bench_value"})
    )
    dfc = uss_long.merge(bench, on="ym", how="left")
    plot_df = pd.concat(
        [
            dfc.assign(series="self",  val=dfc["value"]),
            dfc.assign(series=f"benchmark: {ref_item}", val=dfc["bench_value"])
        ],
        ignore_index=True
    )
    fig = px.line(
        plot_df,
        x="ym",
        y="val",
        color="series",
        facet_col="item",
        facet_col_wrap=6,
        facet_row_spacing=0.02,
        color_discrete_map={
            "self": "#2962FF",                 # 각 아이템(자기 자신)
            f"benchmark: {ref_item}": "#9E9E9E"  # 기준 라인(회색)
        }
    )

    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]),)
    fig.update_traces(opacity=0.95)
    # fig.update_layout(width=1800, height=1400, showlegend=True)
    fig.update_layout(showlegend=True,)

    fig.show()
    
# 단순 페어쌍 갯수 변화 함수
def count_p(pairs1,pairs2):
    count = {}
    plus = 0
    neg = 0
    pairs = np.union1d(pairs1["leading_item_id"].unique(), pairs2["leading_item_id"].unique())
    for i in pairs:
        target_col = np.append(["ym",i],pairs1[pairs1["leading_item_id"] == i]["following_item_id"].values)
        target_col_sub = np.append(["ym",i],pairs2[pairs2["leading_item_id"] == i]["following_item_id"].values)
        temp_inter = np.intersect1d(target_col,target_col_sub,)
        temp_tar = np.setdiff1d(target_col,target_col_sub,)
        temp_tar = np.append(["ym",i],temp_tar)
        temp_tar_sub = np.setdiff1d(target_col_sub,target_col,)
        temp_tar_sub = np.append(["ym",i],temp_tar_sub)
        count[i] = [len(target_col)-2,len(target_col_sub)-2, len(temp_inter)-2, len(temp_tar)-2,len(temp_tar_sub)-2]
        plus += len(temp_tar)-2
        neg += len(temp_tar_sub)-2
    return count, plus, neg, plus - neg

## 2. 데이터 전처리

In [60]:
train = pd.read_csv('./train.csv')

# year, month, item_id 기준으로 value 합산 (seq만 다르다면 value 합산)
monthly = (
    train
    .groupby(["item_id", "year", "month"], as_index=False)
    .sum()
)

# year, month를 하나의 키(ym)로 묶기
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)
monthly["ym"] = monthly["ym"].dt.strftime("%Y-%m")

# item_id × ym 피벗 (월별 총 무역량 매트릭스 생성)
pivot = (
    monthly
    .pivot(index="item_id", columns="ym", values="value")
    .fillna(0.0)
)

pivot.head()

ym,2022-01,2022-02,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,...,2024-10,2024-11,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,14276.0,52347.0,53549.0,0.0,26997.0,84489.0,0.0,0.0,0.0,0.0,...,428725.0,144248.0,26507.0,25691.0,25805.0,0.0,38441.0,0.0,441275.0,533478.0
AHMDUILJ,242705.0,120847.0,197317.0,126142.0,71730.0,149138.0,186617.0,169995.0,140547.0,89292.0,...,123085.0,143451.0,78649.0,125098.0,80404.0,157401.0,115509.0,127473.0,89479.0,101317.0
ANWUJOKX,0.0,0.0,0.0,63580.0,81670.0,26424.0,8470.0,0.0,0.0,80475.0,...,0.0,0.0,0.0,27980.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,383999.0,512813.0,217064.0,470398.0,539873.0,582317.0,759980.0,216019.0,537693.0,205326.0,...,683581.0,2147.0,0.0,25013.0,77.0,20741.0,2403.0,3543.0,32430.0,40608.0
ATLDMDBO,143097177.0,103568323.0,118403737.0,121873741.0,115024617.0,65716075.0,146216818.0,97552978.0,72341427.0,87454167.0,...,60276050.0,30160198.0,42613728.0,64451013.0,38667429.0,29354408.0,42450439.0,37136720.0,32181798.0,57090235.0


In [61]:
# year, month, item_id 기준으로 value 합산 (seq만 다르다면 value 합산)
monthly_w = (
    train
    .groupby(["item_id", "year", "month"], as_index=False)
    .sum()
)

# year, month를 하나의 키(ym)로 묶기
monthly_w["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly_w["month"].astype(str).str.zfill(2)
)
monthly_w["ym"] = monthly_w["ym"].dt.strftime("%Y-%m")

# item_id × ym 피벗 (월별 총 무역량 매트릭스 생성)
pivot_w = (
    monthly_w
    .pivot(index="item_id", columns="ym", values="weight")
    .fillna(0.0)
)

pivot_w.head()

ym,2022-01,2022-02,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,...,2024-10,2024-11,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,17625.0,67983.0,69544.0,0.0,34173.0,103666.0,0.0,0.0,0.0,0.0,...,786651.0,249144.0,33133.0,32937.0,33083.0,0.0,49050.0,0.0,865246.0,1046036.0
AHMDUILJ,100990.0,43444.0,64113.0,42637.0,21468.0,59424.0,61587.0,63625.0,61245.0,20382.0,...,42986.0,43763.0,24379.0,62351.0,23521.0,43332.0,44913.0,44035.0,25574.0,34463.0
ANWUJOKX,0.0,0.0,0.0,89967.0,118992.0,41649.0,13888.0,0.0,0.0,119940.0,...,0.0,0.0,0.0,37211.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,50193.0,81429.0,43310.0,62505.0,84680.0,37425.0,114600.0,39305.0,104865.0,43123.0,...,118952.0,698.0,0.0,1907.0,11.0,2777.0,347.0,335.0,4974.0,6314.0
ATLDMDBO,163308448.0,113468029.0,131798388.0,118641599.0,106301802.0,63769133.0,148292927.0,101468186.0,77986006.0,94320028.0,...,143545801.0,70368609.0,99495350.0,153804927.0,93762902.0,76888377.0,119375444.0,112349280.0,95457203.0,165713328.0


In [380]:
train

Unnamed: 0,item_id,year,month,seq,type,hs4,weight,quantity,value,hs4_tag
0,DEWLVASR,2022,1,1.0,1,3038,14858.0,0.0,32688.0,분류 불가
1,ELQGMQWE,2022,1,1.0,1,2002,62195.0,0.0,110617.0,조제ㆍ보존처리한 토마토
2,AHMDUILJ,2022,1,1.0,1,2102,18426.0,0.0,72766.0,효모ㆍ단세포 미생물(죽은 것 한정)ㆍ조제한 베이킹 파우더
3,XIPPENFQ,2022,1,1.0,1,2501,20426.0,0.0,11172.0,소금ㆍ순염화나트륨ㆍ바닷물
4,FTSVTTSR,2022,1,1.0,1,2529,248000.0,0.0,143004.0,장석ㆍ백류석ㆍ하석ㆍ하석 섬장암ㆍ형석
...,...,...,...,...,...,...,...,...,...,...
10831,XIFHSOWQ,2025,7,3.0,1,8708,352.0,0.0,12937.0,부분품ㆍ부속품(제8701호부터 제8705호까지 차량용으로 한정한다)
10832,FITUEHWN,2025,7,3.0,1,8714,655.0,900.0,16054.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10833,UGEQLMXM,2025,7,3.0,1,8714,758.0,0.0,74377.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10834,BLANHGYY,2025,7,3.0,1,9022,345.0,2.0,69720.0,엑스선ㆍ알파선ㆍ베타선ㆍ감마선ㆍ그 밖의 전리선을 사용하는 기기(내과용ㆍ외과용ㆍ치과용ㆍ...


In [62]:
hs4 = pd.read_csv('./hs4_c.csv')
# hs4 = hs4.drop("2024수출액(달러)", axis=1)
hs4.columns = ["index", 'hs4', 'tag']

In [63]:
hs4

Unnamed: 0,index,hs4,tag
0,0,3038,분류 불가
1,1,2002,조제ㆍ보존처리한 토마토
2,2,2102,효모ㆍ단세포 미생물(죽은 것 한정)ㆍ조제한 베이킹 파우더
3,3,2501,소금ㆍ순염화나트륨ㆍ바닷물
4,4,2529,장석ㆍ백류석ㆍ하석ㆍ하석 섬장암ㆍ형석
...,...,...,...
66,1095,3003,의약품(두 가지 이상 성분을 혼합한 치료용ㆍ예방용으로 제3002호ㆍ제3005호ㆍ제3...
67,1584,2612,분류 불가
68,1963,4302,"모피(유연처리ㆍ드레스가공한 것으로, 머리 부분ㆍ꼬리 부분ㆍ발 부분ㆍ그 밖의 조각ㆍ절..."
69,2656,7142,분류 불가


In [64]:
result = []
for i in train["hs4"]:
    temp = hs4[i == hs4["hs4"]]
    if not temp.empty:
        result.append(temp["tag"].values[0])
    else:
        result.append("분류 불가")

In [65]:
train["hs4_tag"] = result

In [66]:
train

Unnamed: 0,item_id,year,month,seq,type,hs4,weight,quantity,value,hs4_tag
0,DEWLVASR,2022,1,1.0,1,3038,14858.0,0.0,32688.0,분류 불가
1,ELQGMQWE,2022,1,1.0,1,2002,62195.0,0.0,110617.0,조제ㆍ보존처리한 토마토
2,AHMDUILJ,2022,1,1.0,1,2102,18426.0,0.0,72766.0,효모ㆍ단세포 미생물(죽은 것 한정)ㆍ조제한 베이킹 파우더
3,XIPPENFQ,2022,1,1.0,1,2501,20426.0,0.0,11172.0,소금ㆍ순염화나트륨ㆍ바닷물
4,FTSVTTSR,2022,1,1.0,1,2529,248000.0,0.0,143004.0,장석ㆍ백류석ㆍ하석ㆍ하석 섬장암ㆍ형석
...,...,...,...,...,...,...,...,...,...,...
10831,XIFHSOWQ,2025,7,3.0,1,8708,352.0,0.0,12937.0,부분품ㆍ부속품(제8701호부터 제8705호까지 차량용으로 한정한다)
10832,FITUEHWN,2025,7,3.0,1,8714,655.0,900.0,16054.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10833,UGEQLMXM,2025,7,3.0,1,8714,758.0,0.0,74377.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10834,BLANHGYY,2025,7,3.0,1,9022,345.0,2.0,69720.0,엑스선ㆍ알파선ㆍ베타선ㆍ감마선ㆍ그 밖의 전리선을 사용하는 기기(내과용ㆍ외과용ㆍ치과용ㆍ...


In [67]:
train

Unnamed: 0,item_id,year,month,seq,type,hs4,weight,quantity,value,hs4_tag
0,DEWLVASR,2022,1,1.0,1,3038,14858.0,0.0,32688.0,분류 불가
1,ELQGMQWE,2022,1,1.0,1,2002,62195.0,0.0,110617.0,조제ㆍ보존처리한 토마토
2,AHMDUILJ,2022,1,1.0,1,2102,18426.0,0.0,72766.0,효모ㆍ단세포 미생물(죽은 것 한정)ㆍ조제한 베이킹 파우더
3,XIPPENFQ,2022,1,1.0,1,2501,20426.0,0.0,11172.0,소금ㆍ순염화나트륨ㆍ바닷물
4,FTSVTTSR,2022,1,1.0,1,2529,248000.0,0.0,143004.0,장석ㆍ백류석ㆍ하석ㆍ하석 섬장암ㆍ형석
...,...,...,...,...,...,...,...,...,...,...
10831,XIFHSOWQ,2025,7,3.0,1,8708,352.0,0.0,12937.0,부분품ㆍ부속품(제8701호부터 제8705호까지 차량용으로 한정한다)
10832,FITUEHWN,2025,7,3.0,1,8714,655.0,900.0,16054.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10833,UGEQLMXM,2025,7,3.0,1,8714,758.0,0.0,74377.0,부분품ㆍ부속품(제8711호부터 제8713호까지 차량용으로 한정한다)
10834,BLANHGYY,2025,7,3.0,1,9022,345.0,2.0,69720.0,엑스선ㆍ알파선ㆍ베타선ㆍ감마선ㆍ그 밖의 전리선을 사용하는 기기(내과용ㆍ외과용ㆍ치과용ㆍ...


In [68]:
train.iloc[train["item_id"].drop_duplicates().index]["hs4_tag"].unique()

array(['분류 불가', '조제ㆍ보존처리한 토마토', '효모ㆍ단세포 미생물(죽은 것 한정)ㆍ조제한 베이킹 파우더',
       '소금ㆍ순염화나트륨ㆍ바닷물', '장석ㆍ백류석ㆍ하석ㆍ하석 섬장암ㆍ형석',
       '알칼리금속ㆍ알칼리토류금속ㆍ희토류금속ㆍ스칸듐ㆍ이트륨ㆍ수은', '황산ㆍ발연황산', '그 밖의 무기산ㆍ무기 비금속 산화물',
       '무수암모니아ㆍ암모니아수',
       '히드라진ㆍ히드록실아민, 이들의 무기염ㆍ그 밖의 무기염기ㆍ금속산화물ㆍ금속수산화물ㆍ금속과산화물',
       '황산염ㆍ명반ㆍ과산화황산염', '탄산염ㆍ과산화탄산염ㆍ탄산암모늄(카르밤산암모늄 함유 한정)',
       '산화금속산염ㆍ과산화금속산염', '희토류금속ㆍ이트륨ㆍ스칸듐ㆍ금속혼합물의 무기ㆍ유기 화합물', '과산화수소',
       '불포화비환식모노카르복시산ㆍ환식모노카르복시산ㆍ할로겐화물ㆍ과산화물ㆍ과산화산ㆍ할로겐화유도체ㆍ술폰화유도체ㆍ니트로화유도체ㆍ니트로소화유도체',
       '의료용품(제4호 물품 한정)', '질소비료(광물성 비료ㆍ화학비료 한정)',
       '착색제ㆍ조제품(제3호 물품 한정)ㆍ루미노퍼로 사용되는 무기물',
       '조제 안료ㆍ조제 유백제ㆍ조제 그림물감ㆍ법랑ㆍ유약ㆍ유약용 슬립ㆍ액체 상태 러스터ㆍ유리 프리트(frit)ㆍ유리[가루ㆍ알갱이ㆍ플레이크 모양 한정]',
       '인쇄용 잉크ㆍ필기용 잉크ㆍ제도용 잉크ㆍ그 밖의 잉크',
       '면도용 제품류ㆍ인체용 탈취제ㆍ목욕용 조제품ㆍ탈모제ㆍ조제향료ㆍ분류되지 않은 화장품, 화장용품ㆍ실내용 조제 탈취제',
       '인조 왁스ㆍ조제 왁스', '로진ㆍ수지산ㆍ로진 스피릿ㆍ로진유ㆍ런검', '반응시작제ㆍ반응촉진제ㆍ촉매 조제품',
       '조제 점결제(주물 주형용, 코어용 한정)ㆍ분류되지 않은 화학품, 화학공업, 연관공업에 따른 조제품(천연물만의 혼합물 포함)',
       '염화비닐 중합체ㆍ할로겐화 올레핀 중합체[일차제품 한정]', '아미노수지ㆍ페놀수지ㆍ폴리우레탄[일차제품 한정]',
       

In [69]:
same_tag = []
for i in train.iloc[train["item_id"].drop_duplicates().index]["hs4_tag"].unique():
    same_tag.append(pivot.reset_index()[pd.merge(pivot,train.iloc[train["item_id"].drop_duplicates().index][["item_id","hs4_tag"]],on="item_id")["hs4_tag"] == i].set_index("item_id").T.columns.to_list())
same_tag

[['DEWLVASR', 'GIKPEWTY', 'JSLXRQOK', 'PLMZALFA', 'RUVXNNVA'],
 ['ELQGMQWE'],
 ['AHMDUILJ'],
 ['XIPPENFQ'],
 ['FTSVTTSR', 'XMKRPGLB'],
 ['BSRMSVTC', 'DJBLNPNC', 'RCBZUSIM', 'SUOYXCHP', 'WQMVCOEM', 'ZKENOUDA'],
 ['DDEXPPXU', 'LLHREMKS', 'YSYHGLQK'],
 ['DNMPSKTB', 'RJGPVEXX', 'VWMBASNE', 'WPQXWHYO'],
 ['ATLDMDBO'],
 ['SAHWCZNH'],
 ['LUENUFGA'],
 ['HXYSSRXE'],
 ['QRKRBYJL'],
 ['FRHNWLNI', 'MBSBZBXA'],
 ['BJALXPFS'],
 ['RAWUKQMJ'],
 ['ZGJXVMNI'],
 ['BTMOEMEP', 'LRVGFDFM'],
 ['UXSPKBJR'],
 ['BEZYMBBT'],
 ['SAAYMURU'],
 ['QJQJSWFU'],
 ['EVBVXETX'],
 ['DUCMGGNW'],
 ['OJIFIHMZ', 'STZDBITS', 'XIIEJNEE'],
 ['FCYBOAXC', 'IGDVVKUD', 'LPHPPJUG', 'PYZMVUWD', 'SNHYOVBM'],
 ['JPBRUTWP'],
 ['LTOYKIML'],
 ['QVLMOEYE'],
 ['JERHKLYW'],
 ['ZXERAXWP'],
 ['LSOIUSXD'],
 ['XUOIQPFL'],
 ['WBLJNPZQ'],
 ['OXKURKXR'],
 ['AXULOHBQ'],
 ['DBWLZWNK'],
 ['NZKBIBNU'],
 ['OKMBFVKS'],
 ['VUAFAIYJ'],
 ['CCLHWFWF'],
 ['OGAFEHLU'],
 ['SDWAYPIK', 'TGOELCAG'],
 ['FQCLOEXA', 'KEUWZRKO', 'UIFPPCLR'],
 ['APQGTRMF'],
 ['ZCELVYQU']

In [70]:
asdqqqqqq = []
for i in same_tag:
    if len(i) > 1:
        asdqqqqqq.append(i)

In [71]:
asdqqqqqq

[['DEWLVASR', 'GIKPEWTY', 'JSLXRQOK', 'PLMZALFA', 'RUVXNNVA'],
 ['FTSVTTSR', 'XMKRPGLB'],
 ['BSRMSVTC', 'DJBLNPNC', 'RCBZUSIM', 'SUOYXCHP', 'WQMVCOEM', 'ZKENOUDA'],
 ['DDEXPPXU', 'LLHREMKS', 'YSYHGLQK'],
 ['DNMPSKTB', 'RJGPVEXX', 'VWMBASNE', 'WPQXWHYO'],
 ['FRHNWLNI', 'MBSBZBXA'],
 ['BTMOEMEP', 'LRVGFDFM'],
 ['OJIFIHMZ', 'STZDBITS', 'XIIEJNEE'],
 ['FCYBOAXC', 'IGDVVKUD', 'LPHPPJUG', 'PYZMVUWD', 'SNHYOVBM'],
 ['SDWAYPIK', 'TGOELCAG'],
 ['FQCLOEXA', 'KEUWZRKO', 'UIFPPCLR'],
 ['GKQIJYDH', 'KFQSHBNH', 'QKXNTIIB', 'RJCAXSGH', 'UQYUIVVR'],
 ['GYHKIVQT', 'ROACSLMG', 'VBYCLTYZ'],
 ['FITUEHWN', 'UGEQLMXM']]

In [72]:
corr_t = pivot.T.corr()
sorted_corr = {}
for i in pivot.T.columns:
    sorted_corr[i]=corr_t[i][abs(corr_t[i]).sort_values(ascending=False) > 0.3]

In [73]:
sorted_corr["ANWUJOKX"]

item_id
ANWUJOKX    1.000000
ATLDMDBO    0.367382
BEZYMBBT    0.378534
BLANHGYY    0.410048
DBWLZWNK    0.438321
GMBFCMIU    0.557591
GYHKIVQT    0.448187
OXKURKXR    0.310884
QRKRBYJL    0.417614
QVLMOEYE    0.562219
ROACSLMG    0.313201
UXSPKBJR    0.483344
VBYCLTYZ    0.485040
ZKENOUDA   -0.306396
Name: ANWUJOKX, dtype: float64

In [381]:
sorted_corr

{'AANGBULD': item_id
 AANGBULD    1.000000
 BEZYMBBT   -0.415233
 DNMPSKTB   -0.337268
 ELQGMQWE    0.350798
 FQCLOEXA    0.400055
 GYHKIVQT   -0.452429
 HXYSSRXE   -0.308311
 JSLXRQOK    0.382309
 KFQSHBNH    0.376338
 NAQIHUKZ    0.348927
 RJGPVEXX    0.425384
 UQYUIVVR    0.420462
 VMAQSTJE   -0.326790
 VUAFAIYJ    0.525307
 WPQXWHYO    0.325317
 XUOIQPFL   -0.358182
 ZGJXVMNI    0.344713
 ZKENOUDA    0.434656
 ZXERAXWP    0.338637
 Name: AANGBULD, dtype: float64,
 'AHMDUILJ': item_id
 AHMDUILJ    1.000000
 BTMOEMEP    0.444717
 IGDVVKUD    0.326140
 LTOYKIML   -0.367153
 XUOIQPFL    0.332079
 Name: AHMDUILJ, dtype: float64,
 'ANWUJOKX': item_id
 ANWUJOKX    1.000000
 ATLDMDBO    0.367382
 BEZYMBBT    0.378534
 BLANHGYY    0.410048
 DBWLZWNK    0.438321
 GMBFCMIU    0.557591
 GYHKIVQT    0.448187
 OXKURKXR    0.310884
 QRKRBYJL    0.417614
 QVLMOEYE    0.562219
 ROACSLMG    0.313201
 UXSPKBJR    0.483344
 VBYCLTYZ    0.485040
 ZKENOUDA   -0.306396
 Name: ANWUJOKX, dtype: float64,
 '

In [75]:
import plotly.express as px

# train의 앞 5개 열만 사용하여 이름 지정
# df = pivot.T.reset_index().iloc[:, 0:5].copy()
df = df_minmax[["ym"]+asdqqqqqq[13]]
# df.columns = ["ym", "AANGBULD", "AHMDUILJ", "ANWUJOKX", "APQGTRMF"]  # R의 structure(names=...)와 동일

# pivot_longer() → melt()
lme_long = df.melt(id_vars="ym", var_name="Metal", value_name="value")

# ggplot(line) → plotly line
fig = px.line(
    lme_long,
    x="ym",
    y="value",
    color="Metal",
    # title="LME : Closing Price of Industrial Metals",
    labels={"date_id": "Date (date_id)", "value": ""}
)

# R의 figsize(16, 6) 비슷하게 크기 조정 (픽셀 단위)
fig.update_layout(width=1100, height=400, legend_title_text="Metal", margin=dict(l=40, r=20, t=60, b=40))

fig.show()

NameError: name 'df_minmax' is not defined

In [None]:
# min_ = 1
# max_ = 0
# for i in monthly["item_id"].unique():
#     temp = np.abs(monthly[monthly["item_id"] == i]["weight"].corr(monthly[monthly["item_id"] == i]["value"]))
#     print(temp)
#     if temp < min_:
#         min_ = temp
#     if temp > max_:

In [None]:
monthly.pivot(index="item_id", columns="ym",values=["value","weight"]).fillna(0.0).T

Unnamed: 0_level_0,item_id,AANGBULD,AHMDUILJ,ANWUJOKX,APQGTRMF,ATLDMDBO,AXULOHBQ,BEZYMBBT,BJALXPFS,BLANHGYY,BSRMSVTC,...,XIFHSOWQ,XIIEJNEE,XIPPENFQ,XMKRPGLB,XUOIQPFL,YSYHGLQK,ZCELVYQU,ZGJXVMNI,ZKENOUDA,ZXERAXWP
Unnamed: 0_level_1,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
value,2022-01,14276.0,242705.0,0.0,383999.0,143097177.0,11429.0,6034752.0,352532.0,0.0,1216269.0,...,7718.0,0.0,108699.0,879207.0,10871069.0,0.0,373859.0,1154724.0,122627.0,10867.0
value,2022-02,52347.0,120847.0,0.0,512813.0,103568323.0,16365.0,6221091.0,218947.0,891.0,156005.0,...,24829.0,81.0,78853.0,1650028.0,9573079.0,543.0,59900.0,1337622.0,38162.0,675.0
value,2022-03,53549.0,197317.0,0.0,217064.0,118403737.0,47307.0,4611537.0,331472.0,0.0,801085.0,...,102019.0,3198210.0,172237.0,1831614.0,8774783.0,766.0,31158.0,1662893.0,3592.0,20548.0
value,2022-04,0.0,126142.0,63580.0,470398.0,121873741.0,59524.0,5202039.0,17480.0,0.0,365605.0,...,33215.0,0.0,89681.0,705567.0,4713153.0,1108.0,594407.0,1561647.0,5613.0,662.0
value,2022-05,26997.0,71730.0,81670.0,539873.0,115024617.0,74828.0,6889516.0,234330.0,23141.0,168725.0,...,82348.0,81175.0,81387.0,1718634.0,4612292.0,859.0,648232.0,1603223.0,33924.0,18949.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weight,2025-03,0.0,43332.0,0.0,2777.0,76888377.0,6678.0,5338.0,27832.0,15.0,901.0,...,544.0,25.0,55115.0,4947732.0,1035737.0,7.0,30027.0,4098.0,1326.0,3507.0
weight,2025-04,49050.0,44913.0,0.0,347.0,119375444.0,9496.0,4594.0,66763.0,15.0,1356.0,...,730.0,50.0,323952.0,1666415.0,1380280.0,0.0,36070.0,4075.0,1562.0,4258.0
weight,2025-05,0.0,44035.0,0.0,335.0,112349280.0,13021.0,4008.0,62149.0,60.0,517.0,...,2881.0,14200.0,127323.0,4474885.0,1134783.0,0.0,9440.0,3906.0,1227.0,4598.0
weight,2025-06,865246.0,25574.0,0.0,4974.0,95457203.0,8195.0,3429.0,121446.0,12.0,30.0,...,748.0,0.0,190036.0,4223478.0,750994.0,5.0,19100.0,4864.0,41324.0,36632.0


In [None]:
import pandas as pd

def aggregate_monthly(df: pd.DataFrame,
                      year_col: str = "year",
                      month_col: str = "month",
                      item_col: str = "item_id",
                      value_col: str = "value",
                      weight_col: str = "weight",
                      quantity_col: str = "quantity",
                      ensure_continuous_months: bool = False) -> pd.DataFrame:
    """
    동일 item_id를 년도별 동일 월로 합산하여
    <ITEM>_value, <ITEM>_weight, <ITEM>_quantity 컬럼으로 와이드 피벗한 결과를 반환합니다.

    Parameters
    ----------
    df : pd.DataFrame
        원본 데이터 (컬럼: item_id, year, month, value, weight, quantity 포함)
    ensure_continuous_months : bool
        True면 데이터의 최소~최대 ym 사이의 모든 달을 채워 결측을 0.0으로 채움

    Returns
    -------
    out : pd.DataFrame
        컬럼: ym, <ITEM>_value, <ITEM>_weight, <ITEM>_quantity ...
    """

    # 1) ym(YYYY-MM) 생성
    df = df.copy()
    df["ym"] = pd.to_datetime(dict(year=df[year_col], month=df[month_col], day=1)).dt.strftime("%Y-%m")

    # 2) ym × item_id별 합산
    agg = (
        df.groupby(["ym", item_col], as_index=False)
          .agg(
              weight_sum=(weight_col, "sum"),
              value_sum=(value_col, "sum"),
              quantity_sum=(quantity_col, "sum")
          )
    )

    # 3) 기준 ym 인덱스 정의
    if ensure_continuous_months:
        # 최소~최대 사이 모든 달 생성
        ym_dates = pd.to_datetime(agg["ym"] + "-01").sort_values().unique()
        start = ym_dates[0]
        end = ym_dates[-1]
        all_months_dt = pd.date_range(start=start, end=end, freq="MS")
        all_months = [d.strftime("%Y-%m") for d in all_months_dt]
    else:
        # 데이터에 존재하는 ym만
        all_months = sorted(agg["ym"].unique())

    # 4) 와이드 피벗 (value/weight/quantity → 각 item별 3컬럼)
    wide = (
        agg.pivot(index="ym", columns=item_col, values=["value_sum", "weight_sum", "quantity_sum"])
           .reindex(all_months)        # 지정한 ym 순서로 정렬
           .sort_index()
    )

    # 5) 결측치 0.0
    wide = wide.fillna(0.0)

    # 6) 컬럼 평탄화: "<ITEM>_value", "<ITEM>_weight", "<ITEM>_quantity"
    flat_cols = []
    for top, item in wide.columns.to_flat_index():
        if top == "value_sum":
            flat_cols.append(f"{item}_value")
        elif top == "weight_sum":
            flat_cols.append(f"{item}_weight")
        elif top == "quantity_sum":
            flat_cols.append(f"{item}_quantity")
        else:
            flat_cols.append(f"{item}_{top}")
    wide.columns = flat_cols

    # 7) 아이템별로 [value → weight → quantity] 순서 정렬
    def _tri_order(cols):
        from collections import defaultdict
        bucket = defaultdict(dict)
        for c in cols:
            if c.endswith("_value"):
                bucket[c[:-6]]["v"] = c
            elif c.endswith("_weight"):
                bucket[c[:-7]]["w"] = c
            elif c.endswith("_quantity"):
                bucket[c[:-9]]["q"] = c
        ordered = []
        for k in sorted(bucket.keys()):  # item_id 알파벳순 정렬 (원하시면 커스텀 가능)
            if "v" in bucket[k]: ordered.append(bucket[k]["v"])
            if "w" in bucket[k]: ordered.append(bucket[k]["w"])
            if "q" in bucket[k]: ordered.append(bucket[k]["q"])
        return ordered

    wide = wide[_tri_order(wide.columns)]

    # 8) ym을 컬럼으로 복원
    out = wide.reset_index()

    return out

In [None]:
out_df = aggregate_monthly(train, ensure_continuous_months=False)

In [57]:
sorted_corr["IGDVVKUD"]

item_id
AHMDUILJ    0.326140
ATLDMDBO    0.489337
DBWLZWNK    0.333462
IGDVVKUD    1.000000
JPBRUTWP    0.426729
OGAFEHLU    0.305873
QRKRBYJL    0.344491
UGEQLMXM    0.656371
VBYCLTYZ    0.326472
XIIEJNEE    0.819554
XUOIQPFL    0.375931
Name: IGDVVKUD, dtype: float64

In [56]:
pivot.T[sorted_corr["IGDVVKUD"].index.to_numpy()].corr()

item_id,AHMDUILJ,ATLDMDBO,DBWLZWNK,IGDVVKUD,JPBRUTWP,OGAFEHLU,QRKRBYJL,UGEQLMXM,VBYCLTYZ,XIIEJNEE,XUOIQPFL
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AHMDUILJ,1.0,0.257107,-0.084639,0.32614,0.10872,0.233859,0.232609,0.122475,0.21595,0.234515,0.332079
ATLDMDBO,0.257107,1.0,0.373402,0.489337,0.517955,0.48074,0.704341,0.402896,0.640324,0.270303,0.707539
DBWLZWNK,-0.084639,0.373402,1.0,0.333462,0.396996,0.35,0.473373,0.368542,0.401759,0.392626,0.25743
IGDVVKUD,0.32614,0.489337,0.333462,1.0,0.426729,0.305873,0.344491,0.656371,0.326472,0.819554,0.375931
JPBRUTWP,0.10872,0.517955,0.396996,0.426729,1.0,0.535416,0.649055,0.459927,0.663127,0.432921,0.420958
OGAFEHLU,0.233859,0.48074,0.35,0.305873,0.535416,1.0,0.534762,0.45896,0.405969,0.39767,0.580928
QRKRBYJL,0.232609,0.704341,0.473373,0.344491,0.649055,0.534762,1.0,0.477197,0.779633,0.365507,0.499827
UGEQLMXM,0.122475,0.402896,0.368542,0.656371,0.459927,0.45896,0.477197,1.0,0.341591,0.705846,0.411413
VBYCLTYZ,0.21595,0.640324,0.401759,0.326472,0.663127,0.405969,0.779633,0.341591,1.0,0.333559,0.34519
XIIEJNEE,0.234515,0.270303,0.392626,0.819554,0.432921,0.39767,0.365507,0.705846,0.333559,1.0,0.341324


In [None]:
[sorted_corr[REF_ITEM].index.to_numpy()]

array(['AANGBULD', 'BEZYMBBT', 'DNMPSKTB', 'ELQGMQWE', 'FQCLOEXA',
       'GYHKIVQT', 'HXYSSRXE', 'JSLXRQOK', 'KFQSHBNH', 'NAQIHUKZ',
       'RJGPVEXX', 'UQYUIVVR', 'VMAQSTJE', 'VUAFAIYJ', 'WPQXWHYO',
       'XUOIQPFL', 'ZGJXVMNI', 'ZKENOUDA', 'ZXERAXWP'], dtype=object)

In [58]:
import plotly.express as px
REF_ITEM = "IGDVVKUD"
# train의 앞 5개 열만 사용하여 이름 지정
target_col = np.append(["ym"],sorted_corr[REF_ITEM].index.to_numpy())
df = df_minmax[target_col]
# df = pivot.T.reset_index()

# df.columns = ["ym", "AANGBULD", "AHMDUILJ", "ANWUJOKX", "APQGTRMF"]  # R의 structure(names=...)와 동일

# pivot_longer() → melt()
lme_long = df.melt(id_vars="ym", var_name="Metal", value_name="value")

# ggplot(line) → plotly line
fig = px.line(
    lme_long,
    x="ym",
    y="value",
    color="Metal",
    # title="LME : Closing Price of Industrial Metals",
    labels={"date_id": "Date (date_id)", "value": ""}
)

# R의 figsize(16, 6) 비슷하게 크기 조정 (픽셀 단위)
fig.update_layout(width=1100, height=400, legend_title_text="Metal", margin=dict(l=40, r=20, t=60, b=40))

fig.show()

In [231]:
df = pivot.T.reset_index()
# scaler = MinMaxScaler()
scaler = StandardScaler()
df_minmax = pd.DataFrame(scaler.fit_transform(df.drop("ym",axis=1)),columns=df.drop("ym",axis=1).columns).set_index(df["ym"]).reset_index()

In [230]:
df = np.log1p(pivot).T.reset_index()
# scaler_log = MinMaxScaler()
scaler_log = StandardScaler()
df_minmax_log = pd.DataFrame(scaler_log.fit_transform(df.drop("ym",axis=1)),columns=df.drop("ym",axis=1).columns).set_index(df["ym"]).reset_index()

In [78]:
df_vw = pivot_w.T.reset_index()
scaler_vw = MinMaxScaler()
# scaler = StandardScaler()
df_minmax_vw = pd.DataFrame(scaler_vw.fit_transform(df_vw.drop("ym",axis=1)),columns=df_vw.drop("ym",axis=1).columns).set_index(df_vw["ym"]).reset_index()

In [8]:
df_minmax_vw

item_id,ym,AANGBULD,AHMDUILJ,ANWUJOKX,APQGTRMF,ATLDMDBO,AXULOHBQ,BEZYMBBT,BJALXPFS,BLANHGYY,...,XIFHSOWQ,XIIEJNEE,XIPPENFQ,XMKRPGLB,XUOIQPFL,YSYHGLQK,ZCELVYQU,ZGJXVMNI,ZKENOUDA,ZXERAXWP
0,2022-01,0.016849,1.0,0.0,0.42196,0.97641,0.003063,0.798254,0.336436,0.0,...,0.0,0.0,0.09118,0.338734,1.0,0.0,0.380084,0.0,0.00392,0.058549
1,2022-02,0.064991,0.41821,0.0,0.684553,0.487511,0.0,0.687247,0.635654,0.0288,...,0.01824,0.0,0.198996,0.803,0.823894,0.119403,0.053349,0.090703,0.002396,0.001038
2,2022-03,0.066483,0.627174,0.0,0.364096,0.667319,0.019964,0.427814,1.0,0.0,...,0.01338,1.0,0.497292,0.871618,0.69516,0.283582,0.317165,0.133906,7.3e-05,0.119883
3,2022-04,0.0,0.410051,0.7501,0.525464,0.53826,0.066421,0.569068,0.003147,0.0,...,0.00098,0.0,0.0325,0.23651,0.197199,0.343284,1.0,0.143859,0.000266,5.5e-05
4,2022-05,0.032669,0.196033,0.992096,0.711884,0.417215,0.056587,0.883536,0.529436,0.0496,...,0.01291,0.001809,0.046846,0.775913,0.144324,0.253731,0.849306,0.07962,0.00196,0.076108
5,2022-06,0.099104,0.579768,0.347249,0.314623,0.0,0.022382,0.666823,0.028241,0.0032,...,0.00194,2e-05,0.049066,0.877083,0.041384,0.38806,0.268768,0.158109,0.004743,0.001748
6,2022-07,0.0,0.601636,0.115791,0.963414,0.829118,0.050004,0.951824,0.116221,0.0416,...,0.019917,0.0,0.110996,0.132987,0.420674,0.149254,0.233553,0.204931,0.485239,0.100959
7,2022-08,0.0,0.62224,0.0,0.330427,0.369801,0.083403,0.545058,0.075742,0.008,...,0.004895,0.0,0.19644,0.27006,0.310957,0.164179,0.0,0.116716,0.48686,0.002376
8,2022-09,0.0,0.598178,0.0,0.881574,0.139457,0.461456,0.86358,0.272956,0.008,...,0.001049,0.0,0.399991,0.287922,0.265441,0.0,0.322727,0.129156,0.02432,0.010131
9,2022-10,0.0,0.185053,1.0,0.362524,0.299683,1.0,0.94855,0.767562,0.0752,...,0.028885,0.0,0.150555,0.192424,0.165787,0.447761,0.064964,0.109251,0.002347,0.051312


In [None]:
import plotly.express as px

uss_df = df_minmax_log.copy()

# pivot_longer → melt
uss_long = uss_df.melt(id_vars="ym", var_name="item", value_name="value")

# ggplot(line) + facet_wrap(~ Stock, ncol=6) → plotly line + facet_col_wrap
fig = px.line(
    uss_long,
    x="ym",
    y="value",
    color="item",
    facet_col="item",
    facet_col_wrap=6,
    facet_row_spacing=0.02,   # ★ 세로 간격 축소
    color_discrete_sequence=["#2962FF"],
)

# 패싯 라벨 정리 & 범례 제거 & 레이아웃 크기 조정
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(width=1800, height=2100, showlegend=False)

# 주의: y축을 0~150으로 제한해 대부분의 티커 가독성을 높였습니다. 고가 티커는 전체 범위가 보이지 않을 수 있습니다
fig.show()

In [None]:
import plotly.express as px

uss_df = df_minmax.copy()

# # R의 structure(names = c("date_id", uss_tickers))와 동일
# uss_df.columns = ["date_id"] + list(uss_tickers)

# pivot_longer → melt
uss_long = uss_df.melt(id_vars="ym", var_name="item", value_name="value")

# ggplot(line) + facet_wrap(~ Stock, ncol=6) → plotly line + facet_col_wrap
fig = px.line(
    uss_long,
    x="ym",
    y="value",
    color="item",
    facet_col="item",
    facet_col_wrap=6,
    facet_row_spacing=0.02,   # ★ 세로 간격 축소
    color_discrete_sequence=["#2962FF"],
)

# 패싯 라벨 정리 & 범례 제거 & 레이아웃 크기 조정
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(width=1800, height=2100, showlegend=False)

# 주의: y축을 0~150으로 제한해 대부분의 티커 가독성을 높였습니다. 고가 티커는 전체 범위가 보이지 않을 수 있습니다
fig.show()

In [119]:
import pandas as pd
import plotly.express as px

# 0) 공통 전제: 두 DF 모두 첫 컬럼이 'ym', 나머지가 item 컬럼들
#    (예: df_minmax.columns = ['ym', 'A', 'B', ...])
#    x축(ym) 월 구성이 같다고 가정

# 1) 길게 만들기 + 출처(series) 라벨 부여
long_a = (
    df_minmax
      .melt(id_vars="ym", var_name="item", value_name="value")
      .assign(series="minmax")
)
long_b = (
    df_minmax_log
      .melt(id_vars="ym", var_name="item", value_name="value")
      .assign(series="minmax_vw")
)

# 2) (선택) 공통 item만 사용하고 싶다면 inner merge 대신 교집합 필터
common_items = sorted(set(long_a["item"]).intersection(set(long_b["item"])))
long_a = long_a[long_a["item"].isin(common_items)]
long_b = long_b[long_b["item"].isin(common_items)]

# 3) 합치기
plot_df = pd.concat([long_a, long_b], ignore_index=True)

# 4) 라인 플롯: 패싯은 item, 색상/스타일은 series(두 개 라인)
fig = px.line(
    plot_df,
    x="ym",
    y="value",
    color="series",           # 각 패싯에 'minmax' vs 'minmax_vw'
    line_dash="series",       # 두 라인의 dash를 달리해서 더 잘 구분
    facet_col="item",
    facet_col_wrap=6,
    facet_row_spacing=0.02,
    # 원하시는 색이 있다면 아래 맵을 쓰세요 (없으면 Plotly 기본 팔레트 사용)
    color_discrete_map={
        "minmax": "#2962FF",
        "minmax_vw": "#3B3A3A",
    },
    category_orders={"series": ["minmax", "minmax_vw"]},
)

# 5) 패싯 제목 간소화(=뒤 텍스트만 남기기) + 레이아웃
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(width=1800, height=2100, legend_title_text="Series", showlegend=True)

# 6) (선택) y축 범위 고정
#    두 시계열이 동일 스케일이라면 아래처럼 고정 가능
# for ax in fig.layout:
#     if ax.startswith("yaxis"):
#         fig.layout[ax].update(range=[0, 150])

# 7) (선택) hover 깔끔하게
fig.update_traces(hovertemplate="ym=%{x}<br>%{legendgroup}: %{y:.2f}")

fig.show()

## 3. 공행성쌍 탐색
- 각 (A, B) 쌍에 대해 lag = 1 ~ max_lag까지 Pearson 상관계수 계산
- 절댓값이 가장 큰 상관계수와 lag를 선택
- |corr| >= corr_threshold이면 A→B 공행성 있다고 판단

In [79]:
def safe_corr(x, y):
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])

def find_comovement_pairs(pivot, max_lag=6, min_nonzero=12, corr_threshold=0.4):
    '''
    pivot = 상관관계를 비교할 column 리스트
    max_lag = 지연일
    min_nonzero = 무역량이 존재하는 달의 최소 수치
    corr_threshold = 상관계수 임계값
    '''
    items = pivot.index.to_list()                           # item_id 값 추출
    months = pivot.columns.to_list()                        # 시간 정보 추출
    n_months = len(months)                                  # 시간 정보 최대 길이
    results = []                                            # 결과 저장 용 빈 리스트
    
    for i, leader in tqdm(enumerate(items)):                # 단일 item_id 별 반복
        x = pivot.loc[leader].values.astype(float)          # 단일 item 시간 별 무역량
        if np.count_nonzero(x) < min_nonzero:               # 무역량이 존재하는 달의 총합 수가 지정한 최소 수치(min_nonzero)를 넘는 지 판별
            continue

        for follower in items:                              # target_item을 제외한 다른 item과 corr 비교
            if follower == leader:
                continue

            y = pivot.loc[follower].values.astype(float)    # 비교할 item의 무역량 최소 수치 판별
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_lag = None                                 # corr이 제일 높게 나오는 지연일
            best_corr = 0.0                                 # 제일 높은 corr

            # lag = 1 ~ max_lag 탐색
            for lag in range(1, max_lag + 1):               # 1일 차이부터 max_lag까지 corr 비교
                if n_months <= lag:                         # 아마도 max_lag 값 오입력 할 경우 예외 처리
                    print("이게 통과하는 경우가 있어?")
                    continue
                corr = safe_corr(x[:-lag], y[lag:]) 
                if abs(corr) > abs(best_corr):
                    best_corr = corr
                    best_lag = lag

            # 임계값 이상이면 공행성쌍으로 채택
            if best_lag is not None and abs(best_corr) >= corr_threshold:
                results.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr,
                })

    pairs = pd.DataFrame(results)                           # 반환을 위한 데이터프레임화
    return pairs

def find_comovement_multi_pairs(pivot_v, pivot_w, max_lag=6, min_nonzero=12, corr_threshold=0.4):
    '''
    pivot = 상관관계를 비교할 column 리스트
    max_lag = 지연일
    min_nonzero = 무역량이 존재하는 달의 최소 수치
    corr_threshold = 상관계수 임계값
    '''
    items = pivot_v.index.to_list()                           # item_id 값 추출
    months = pivot_v.columns.to_list()                        # 시간 정보 추출
    n_months = len(months)                                  # 시간 정보 최대 길이
    results = []                                            # 결과 저장 용 빈 리스트
    
    for i, leader in tqdm(enumerate(items)):                # 단일 item_id 별 반복
        x = pivot_v.loc[leader].values.astype(float)          # 단일 item 시간 별 무역량
        x_w = pivot_w.loc[leader].values.astype(float) 
        if np.count_nonzero(x) < min_nonzero:               # 무역량이 존재하는 달의 총합 수가 지정한 최소 수치(min_nonzero)를 넘는 지 판별
            continue

        for follower in items:                              # target_item을 제외한 다른 item과 corr 비교
            if follower == leader:
                continue

            y = pivot_v.loc[follower].values.astype(float)    # 비교할 item의 무역량 최소 수치 판별
            y_w = pivot_w.loc[follower].values.astype(float)
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_lag = None                                 # corr이 제일 높게 나오는 지연일
            best_corr = 0.0                                 # 제일 높은 corr

            # lag = 1 ~ max_lag 탐색
            for lag in range(1, max_lag + 1):               # 1일 차이부터 max_lag까지 corr 비교
                if n_months <= lag:                         # 아마도 max_lag 값 오입력 할 경우 예외 처리
                    print("이게 통과하는 경우가 있어?")
                    continue
                # corr = safe_corr(x[:-lag], y[lag:]) 
                corr_v = safe_corr(x[:-lag], y[lag:])
                corr_w = safe_corr(x_w[:-lag], y_w[lag:])
                corr = (corr_v + corr_w)/2
                # corr = np.array([corr_v, corr_w, (corr_v + corr_w)/2])
                # print(corr)
                # if np.argmax(np.abs(corr)) == 1:
                #     print(corr)
                # corr = corr[np.argmax(np.abs(corr))]
                if abs(corr) > abs(best_corr):
                    best_corr = corr
                    best_lag = lag

            # 임계값 이상이면 공행성쌍으로 채택
            if best_lag is not None and abs(best_corr) >= corr_threshold:
                results.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr,
                })

    pairs = pd.DataFrame(results)                           # 반환을 위한 데이터프레임화
    return pairs

pairs2 = find_comovement_pairs(pivot, max_lag=12, corr_threshold=0.7)
# pairs2 = find_comovement_pairs(pivot, max_lag=6, min_nonzero=29, corr_threshold=0.3)
print("탐색된 공행성쌍 수:", len(pairs2))
pairs2.head()

100it [00:12,  8.01it/s]

탐색된 공행성쌍 수: 101





Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr
0,ATLDMDBO,AXULOHBQ,7,0.750089
1,ATLDMDBO,DNMPSKTB,8,0.779126
2,ATLDMDBO,GYHKIVQT,7,0.759125
3,ATLDMDBO,QRKRBYJL,1,0.817712
4,ATLDMDBO,QVLMOEYE,3,0.703682


In [419]:
import numpy as np
import pandas as pd
from scipy import stats

# -----------------------------
# 0) 하이퍼파라미터
# -----------------------------
MAX_LAG = 6                 # 탐색할 최대 지연월
MIN_NONZERO = 12            # 각 시계열에서 0이 아닌 달 최소 개수
MIN_OVERLAP = 24            # 랙 적용 후 실제 겹치는 구간 최소 길이
CORR_THRESH = 0.40          # 상관계수 절대값 임계치
ALPHA = 0.05                # 유의수준(신뢰구간/검정용)
USE_SPEARMAN = False        # True면 Spearman, False면 Pearson

# -----------------------------
# 1) 전처리 유틸
# -----------------------------
def winsorize(a, q=0.01):
    lo, hi = np.quantile(a[np.isfinite(a)], [q, 1-q])
    return np.clip(a, lo, hi)

def transform_series(v, mode="log1p-pct", winsor=0.01, zscore=True):
    """
    mode:
      - "level"        : 원자료
      - "log1p"        : log1p 변환
      - "log1p-pct"    : log1p 후 pct_change
      - "diff"         : 1차 차분
      - "diff12"       : 12차 계절차분
    """
    x = v.astype(float).copy()
    if mode == "log1p":
        x = np.log1p(x)
    elif mode == "log1p-pct":
        x = np.log1p(x)
        x = pd.Series(x).pct_change().to_numpy()
    elif mode == "diff":
        x = pd.Series(x).diff().to_numpy()
    elif mode == "diff12":
        x = pd.Series(x).diff(12).to_numpy()
    # 윈저라이즈
    if winsor:
        x = winsorize(x, q=winsor)
    # 표준화
    if zscore:
        m, s = np.nanmean(x), np.nanstd(x)
        if s > 0:
            x = (x - m) / s
    # 초기 변환으로 생긴 NaN은 제거 대신 0으로 채우되, 겹침 판단은 별도로 함
    x = np.where(np.isfinite(x), x, 0.0)
    return x

def lag_overlap_corr(x, y, lag, spearman=False):
    """
    lag>0: x가 선행(leader), y가 lag만큼 후행(follower)
    """
    if lag <= 0 or lag >= len(x):
        return np.nan, 0
    xw = x[:-lag]
    yw = y[lag:]
    # 실제 겹치는 유효 길이(둘 다 finite)
    mask = np.isfinite(xw) & np.isfinite(yw)
    xw = xw[mask]; yw = yw[mask]
    n = len(xw)
    if n < 2:
        return np.nan, 0
    if spearman:
        r, _ = stats.spearmanr(xw, yw)
    else:
        # 표준편차 0 방지
        if np.std(xw) == 0 or np.std(yw) == 0:
            return 0.0, n
        r = float(np.corrcoef(xw, yw)[0, 1])
    return r, n

def ar1_lag1_autocorr(x):
    """간단한 AR(1) 계수 추정(자유도 최소화)."""
    x = pd.Series(x).dropna().to_numpy()
    if len(x) < 3:
        return 0.0
    x1, x2 = x[:-1], x[1:]
    if np.std(x1)==0 or np.std(x2)==0:
        return 0.0
    return float(np.corrcoef(x1, x2)[0, 1])

def fisher_ci(r, n_eff, alpha=0.05):
    """Fisher z 변환 기반 신뢰구간."""
    r = np.clip(r, -0.999999, 0.999999)
    if n_eff <= 3:
        return -1.0, 1.0
    z = np.arctanh(r)
    se = 1/np.sqrt(n_eff-3)
    zc = stats.norm.ppf(1-alpha/2)
    lo, hi = np.tanh(z - zc*se), np.tanh(z + zc*se)
    return lo, hi

def effective_sample_size(n, rho_x1, rho_y1):
    """자기상관 보정된 유효 표본 크기(N_eff)의 근사."""
    num = n*(1 - rho_x1)*(1 - rho_y1)
    den = (1 + rho_x1)*(1 + rho_y1)
    if den <= 0:  # 수치 안정화
        den = 1e-6
    return max(num/den, 2)

# -----------------------------
# 2) 메인: 공행성(lead–lag) 페어 탐색
# -----------------------------
def find_leadlag_pairs(
    pivot, max_lag=MAX_LAG, min_nonzero=MIN_NONZERO, min_overlap=MIN_OVERLAP,
    corr_threshold=CORR_THRESH, alpha=ALPHA, transform="log1p-pct",
    use_spearman=USE_SPEARMAN
):
    items = pivot.index.to_list()
    months = pivot.columns.to_list()
    nT = len(months)

    # 사전 전처리 및 통계 준비
    series = {}
    ar1 = {}
    nonzero_cnt = {}
    raw = pivot.values  # (n_item, nT)

    for i, item in enumerate(items):
        v_raw = raw[i, :].astype(float)
        nonzero_cnt[item] = int(np.count_nonzero(v_raw))
        x = transform_series(v_raw, mode=transform,zscore=True)
        series[item] = x
        ar1[item] = ar1_lag1_autocorr(x)
        
    results = []
    for i, leader in enumerate(items):
        if nonzero_cnt[leader] < min_nonzero:
            continue
        x = series[leader]
        for j, follower in enumerate(items):
            if i == j:
                continue
            if nonzero_cnt[follower] < min_nonzero:
                continue
            y = series[follower]

            best = {"lag": None, "r": 0.0, "n": 0}
            # lag=1..max_lag 탐색 (leader 선행)
            for lag in range(1, max_lag+1):
                r, n = lag_overlap_corr(x, y, lag, spearman=use_spearman)
                if n < min_overlap:
                    continue
                if np.isnan(r):
                    continue
                if abs(r) > abs(best["r"]):
                    best = {"lag": lag, "r": r, "n": n}

            if best["lag"] is None:
                continue

            # 유효 표본 보정 후 신뢰구간
            n_eff = effective_sample_size(best["n"], ar1[leader], ar1[follower])
            lo, hi = fisher_ci(best["r"], n_eff, alpha=alpha)
            # 임계값/유의성 필터
            # if abs(lo-hi) > 0.5:
            if abs(best["r"]) >= corr_threshold:
            # if abs(best["r"]) >= corr_threshold and not (lo <= 0 <= hi):
            # if abs(best["r"]) >= corr_threshold and (lo <= 0 <= hi):
                results.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best["lag"],
                    "max_corr": best["r"],
                    "overlap_n": int(best["n"]),
                    "n_eff": float(n_eff),
                    "ci_low": float(lo),
                    "ci_high": float(hi),
                    "ci_diff": abs(float(lo) - float(hi)),
                })
    pairs = pd.DataFrame(results)
    # pairs = pd.DataFrame(results).sort_values(["max_corr"], ascending=False, ignore_index=True)
    return pairs

# -----------------------------
# 3) 실행 예시
# -----------------------------
# 위에서 이미 pivot을 생성하셨으므로 바로 호출:
pairs = find_leadlag_pairs(
    # pivot,
    pivot.iloc[:,:-1],
    max_lag=6,
    min_nonzero=12,
    min_overlap=24,
    corr_threshold=0.4,
    alpha=0.05,
    # transform="log1p",   # 추천: 규모/추세 영향 완화
    transform="level",
    use_spearman=False       # 필요시 True로 바꿔 비선형 순위관계 테스트
)

print("탐색된 공행성쌍 수:", len(pairs))
pairs.head()

탐색된 공행성쌍 수: 1441


Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr,overlap_n,n_eff,ci_low,ci_high,ci_diff
0,AANGBULD,APQGTRMF,5,-0.463538,37,7.700399,-0.886606,0.381844,1.268451
1,AANGBULD,DEWLVASR,6,0.666509,36,6.454782,-0.24496,0.952578,1.197538
2,AANGBULD,DNMPSKTB,4,-0.433244,38,2.438159,-1.0,1.0,2.0
3,AANGBULD,EVBVXETX,6,0.453901,36,5.98186,-0.568579,0.925292,1.493871
4,AANGBULD,FTSVTTSR,3,0.500582,39,22.216505,0.102614,0.760411,0.657797


In [420]:
# 아이템 별 pair 갯수 -> 페어 item id를 워한다면 len() 제거
pairs_count = {} 
for i in pairs.leading_item_id.unique():
    pairs_count[i] = len(pairs[pairs["leading_item_id"] == i]["following_item_id"].values)
pairs_count


{'AANGBULD': 19,
 'AHMDUILJ': 9,
 'APQGTRMF': 28,
 'ATLDMDBO': 29,
 'AXULOHBQ': 24,
 'BEZYMBBT': 14,
 'BJALXPFS': 14,
 'BLANHGYY': 22,
 'BSRMSVTC': 18,
 'BTMOEMEP': 28,
 'BUZIIBYG': 8,
 'CCLHWFWF': 16,
 'DBWLZWNK': 20,
 'DDEXPPXU': 6,
 'DEWLVASR': 21,
 'DJBLNPNC': 9,
 'DNMPSKTB': 39,
 'DUCMGGNW': 5,
 'ELQGMQWE': 17,
 'EVBVXETX': 22,
 'FCYBOAXC': 6,
 'FDXPMYGF': 16,
 'FITUEHWN': 6,
 'FQCLOEXA': 22,
 'FRHNWLNI': 13,
 'FTSVTTSR': 11,
 'FWUCPMMW': 11,
 'GKQIJYDH': 3,
 'GYHKIVQT': 36,
 'HCDTGMST': 10,
 'HXYSSRXE': 24,
 'IGDVVKUD': 20,
 'JBVHSUWY': 10,
 'JERHKLYW': 5,
 'JPBRUTWP': 32,
 'JSLXRQOK': 11,
 'KAGJCHMR': 10,
 'KEUWZRKO': 11,
 'KJNSOAHR': 12,
 'LLHREMKS': 10,
 'LPHPPJUG': 15,
 'LRVGFDFM': 27,
 'LSOIUSXD': 17,
 'LTOYKIML': 10,
 'LUENUFGA': 12,
 'MBSBZBXA': 4,
 'MIRCVAMV': 7,
 'NAQIHUKZ': 9,
 'NZKBIBNU': 17,
 'OGAFEHLU': 34,
 'OJIFIHMZ': 7,
 'OKMBFVKS': 22,
 'OXKURKXR': 27,
 'PYZMVUWD': 15,
 'QJQJSWFU': 5,
 'QKXNTIIB': 6,
 'QRKRBYJL': 35,
 'QVLMOEYE': 31,
 'RAWUKQMJ': 13,
 'RCBZUSIM':

In [421]:
# 아이템 별 pair 갯수 -> 페어 item id를 워한다면 len() 제거
pairs2_count = {}
for i in pairs2.leading_item_id.unique():
    pairs2_count[i] = len(pairs2[pairs2["leading_item_id"] == i]["following_item_id"].values)
pairs2_count

{'AANGBULD': 19,
 'AHMDUILJ': 9,
 'APQGTRMF': 28,
 'ATLDMDBO': 29,
 'AXULOHBQ': 24,
 'BEZYMBBT': 14,
 'BJALXPFS': 14,
 'BLANHGYY': 22,
 'BSRMSVTC': 18,
 'BTMOEMEP': 28,
 'BUZIIBYG': 8,
 'CCLHWFWF': 16,
 'DBWLZWNK': 20,
 'DDEXPPXU': 6,
 'DEWLVASR': 21,
 'DJBLNPNC': 9,
 'DNMPSKTB': 39,
 'DUCMGGNW': 5,
 'ELQGMQWE': 17,
 'EVBVXETX': 22,
 'FCYBOAXC': 6,
 'FDXPMYGF': 16,
 'FITUEHWN': 6,
 'FQCLOEXA': 22,
 'FRHNWLNI': 13,
 'FTSVTTSR': 11,
 'FWUCPMMW': 11,
 'GKQIJYDH': 3,
 'GYHKIVQT': 36,
 'HCDTGMST': 10,
 'HXYSSRXE': 24,
 'IGDVVKUD': 20,
 'JBVHSUWY': 10,
 'JERHKLYW': 5,
 'JPBRUTWP': 32,
 'JSLXRQOK': 11,
 'KAGJCHMR': 10,
 'KEUWZRKO': 11,
 'KJNSOAHR': 12,
 'LLHREMKS': 10,
 'LPHPPJUG': 15,
 'LRVGFDFM': 27,
 'LSOIUSXD': 17,
 'LTOYKIML': 10,
 'LUENUFGA': 12,
 'MBSBZBXA': 4,
 'MIRCVAMV': 7,
 'NAQIHUKZ': 9,
 'NZKBIBNU': 17,
 'OGAFEHLU': 34,
 'OJIFIHMZ': 7,
 'OKMBFVKS': 22,
 'OXKURKXR': 27,
 'PYZMVUWD': 15,
 'QJQJSWFU': 5,
 'QKXNTIIB': 6,
 'QRKRBYJL': 35,
 'QVLMOEYE': 31,
 'RAWUKQMJ': 13,
 'RCBZUSIM':

In [422]:
# 아이템 별 pair 갯수 -> 페어 item id를 워한다면 len() 제거
sub1_count = {}
for i in sub1.leading_item_id.unique():
    sub1_count[i] = len(sub1[sub1["leading_item_id"] == i]["following_item_id"].values)
sub1_count

{'AANGBULD': 16,
 'AHMDUILJ': 10,
 'APQGTRMF': 28,
 'ATLDMDBO': 28,
 'AXULOHBQ': 24,
 'BEZYMBBT': 17,
 'BJALXPFS': 13,
 'BLANHGYY': 25,
 'BSRMSVTC': 20,
 'BTMOEMEP': 29,
 'BUZIIBYG': 10,
 'CCLHWFWF': 13,
 'DBWLZWNK': 19,
 'DDEXPPXU': 2,
 'DEWLVASR': 20,
 'DJBLNPNC': 12,
 'DNMPSKTB': 36,
 'DUCMGGNW': 4,
 'ELQGMQWE': 19,
 'EVBVXETX': 20,
 'FCYBOAXC': 4,
 'FDXPMYGF': 18,
 'FITUEHWN': 4,
 'FQCLOEXA': 20,
 'FRHNWLNI': 11,
 'FTSVTTSR': 11,
 'FWUCPMMW': 11,
 'GKQIJYDH': 4,
 'GYHKIVQT': 32,
 'HCDTGMST': 11,
 'HXYSSRXE': 25,
 'IGDVVKUD': 21,
 'JBVHSUWY': 13,
 'JERHKLYW': 6,
 'JPBRUTWP': 29,
 'JSLXRQOK': 10,
 'KAGJCHMR': 10,
 'KEUWZRKO': 11,
 'KJNSOAHR': 12,
 'LLHREMKS': 17,
 'LPHPPJUG': 14,
 'LRVGFDFM': 26,
 'LSOIUSXD': 17,
 'LTOYKIML': 11,
 'LUENUFGA': 12,
 'MBSBZBXA': 4,
 'MIRCVAMV': 8,
 'NAQIHUKZ': 10,
 'NZKBIBNU': 20,
 'OGAFEHLU': 32,
 'OJIFIHMZ': 6,
 'OKMBFVKS': 24,
 'OXKURKXR': 24,
 'PYZMVUWD': 15,
 'QJQJSWFU': 4,
 'QKXNTIIB': 7,
 'QRKRBYJL': 34,
 'QVLMOEYE': 30,
 'RAWUKQMJ': 13,
 'RCBZUS

In [423]:
# pairs = find_comovement_multi_pairs(pivot, pivot_w, max_lag=1, min_nonzero=0, corr_threshold=0.3)
# print("탐색된 공행성쌍 수:", len(pairs))
# pairs.head()

In [424]:
pairs["leading_item_id"].unique() # 무역량이 최소 6개 초과 존재하는 item 리스트

array(['AANGBULD', 'AHMDUILJ', 'APQGTRMF', 'ATLDMDBO', 'AXULOHBQ',
       'BEZYMBBT', 'BJALXPFS', 'BLANHGYY', 'BSRMSVTC', 'BTMOEMEP',
       'BUZIIBYG', 'CCLHWFWF', 'DBWLZWNK', 'DDEXPPXU', 'DEWLVASR',
       'DJBLNPNC', 'DNMPSKTB', 'DUCMGGNW', 'ELQGMQWE', 'EVBVXETX',
       'FCYBOAXC', 'FDXPMYGF', 'FITUEHWN', 'FQCLOEXA', 'FRHNWLNI',
       'FTSVTTSR', 'FWUCPMMW', 'GKQIJYDH', 'GYHKIVQT', 'HCDTGMST',
       'HXYSSRXE', 'IGDVVKUD', 'JBVHSUWY', 'JERHKLYW', 'JPBRUTWP',
       'JSLXRQOK', 'KAGJCHMR', 'KEUWZRKO', 'KJNSOAHR', 'LLHREMKS',
       'LPHPPJUG', 'LRVGFDFM', 'LSOIUSXD', 'LTOYKIML', 'LUENUFGA',
       'MBSBZBXA', 'MIRCVAMV', 'NAQIHUKZ', 'NZKBIBNU', 'OGAFEHLU',
       'OJIFIHMZ', 'OKMBFVKS', 'OXKURKXR', 'PYZMVUWD', 'QJQJSWFU',
       'QKXNTIIB', 'QRKRBYJL', 'QVLMOEYE', 'RAWUKQMJ', 'RCBZUSIM',
       'RJGPVEXX', 'ROACSLMG', 'SAAYMURU', 'SAHWCZNH', 'SDWAYPIK',
       'SNHYOVBM', 'STZDBITS', 'SUOYXCHP', 'TGOELCAG', 'UGEQLMXM',
       'UIFPPCLR', 'UQYUIVVR', 'UXSPKBJR', 'VBYCLTYZ', 'VMAQST

In [470]:
c, p, n, d = count_p(pairs, sub1)
p, n, d

(26, 2300, -2274)

In [471]:
c

{'AANGBULD': [19, 41, 19, 0, 22],
 'AHMDUILJ': [9, 42, 9, 0, 33],
 'APQGTRMF': [28, 53, 27, 1, 26],
 'ATLDMDBO': [29, 56, 28, 1, 28],
 'AXULOHBQ': [24, 39, 23, 1, 16],
 'BEZYMBBT': [14, 43, 14, 0, 29],
 'BJALXPFS': [14, 44, 14, 0, 30],
 'BLANHGYY': [22, 48, 22, 0, 26],
 'BSRMSVTC': [18, 38, 18, 0, 20],
 'BTMOEMEP': [28, 55, 28, 0, 27],
 'BUZIIBYG': [8, 38, 7, 1, 31],
 'CCLHWFWF': [16, 36, 15, 1, 21],
 'DBWLZWNK': [20, 57, 20, 0, 37],
 'DDEXPPXU': [6, 21, 5, 1, 16],
 'DEWLVASR': [21, 47, 19, 2, 28],
 'DJBLNPNC': [9, 32, 9, 0, 23],
 'DNMPSKTB': [39, 59, 37, 2, 22],
 'DUCMGGNW': [5, 26, 5, 0, 21],
 'ELQGMQWE': [17, 47, 17, 0, 30],
 'EVBVXETX': [22, 44, 22, 0, 22],
 'FCYBOAXC': [6, 30, 6, 0, 24],
 'FDXPMYGF': [16, 33, 16, 0, 17],
 'FITUEHWN': [6, 27, 5, 1, 22],
 'FQCLOEXA': [22, 43, 22, 0, 21],
 'FRHNWLNI': [13, 52, 13, 0, 39],
 'FTSVTTSR': [11, 34, 11, 0, 23],
 'FWUCPMMW': [11, 37, 11, 0, 26],
 'GKQIJYDH': [3, 24, 2, 1, 22],
 'GYHKIVQT': [36, 57, 35, 1, 22],
 'HCDTGMST': [10, 37, 10, 0, 2

In [506]:
REF_ITEM = "DEWLVASR" 
target_col = np.append(["ym",REF_ITEM],pairs[pairs["leading_item_id"] == REF_ITEM]["following_item_id"].values)
target_col_sub = np.append(["ym",REF_ITEM],sub1[sub1["leading_item_id"] == REF_ITEM]["following_item_id"].values)
len(target_col)-2

21

In [507]:
temp_inter = np.intersect1d(target_col,target_col_sub,)
# temp_inter = np.append(temp_inter)
len(temp_inter)-2

19

In [508]:
temp_tar = np.setdiff1d(target_col,target_col_sub,)
temp_tar = np.append(["ym",REF_ITEM],temp_tar)
len(temp_tar)-2

2

In [509]:
temp_tar_sub = np.setdiff1d(target_col_sub,target_col,)
temp_tar_sub = np.append(["ym",REF_ITEM],temp_tar_sub)
len(temp_tar_sub)-2

28

In [510]:
visualize_(df_minmax, pairs, REF_ITEM, temp_inter)

In [511]:
visualize_(df_minmax, pairs, REF_ITEM, temp_tar)

In [512]:
visualize_(df_minmax, pairs, REF_ITEM, temp_tar_sub)

In [513]:
np.array(list(sorted_corr.keys()))

array(['AANGBULD', 'AHMDUILJ', 'ANWUJOKX', 'APQGTRMF', 'ATLDMDBO',
       'AXULOHBQ', 'BEZYMBBT', 'BJALXPFS', 'BLANHGYY', 'BSRMSVTC',
       'BTMOEMEP', 'BUZIIBYG', 'CCLHWFWF', 'DBWLZWNK', 'DDEXPPXU',
       'DEWLVASR', 'DJBLNPNC', 'DNMPSKTB', 'DUCMGGNW', 'ELQGMQWE',
       'EVBVXETX', 'FCYBOAXC', 'FDXPMYGF', 'FITUEHWN', 'FQCLOEXA',
       'FRHNWLNI', 'FTSVTTSR', 'FWUCPMMW', 'GIKPEWTY', 'GKQIJYDH',
       'GMBFCMIU', 'GYHKIVQT', 'HCDTGMST', 'HXYSSRXE', 'IGDVVKUD',
       'JBVHSUWY', 'JERHKLYW', 'JPBRUTWP', 'JSLXRQOK', 'KAGJCHMR',
       'KEUWZRKO', 'KFQSHBNH', 'KJNSOAHR', 'LLHREMKS', 'LPHPPJUG',
       'LRVGFDFM', 'LSOIUSXD', 'LTOYKIML', 'LUENUFGA', 'MBSBZBXA',
       'MIRCVAMV', 'NAQIHUKZ', 'NZKBIBNU', 'OGAFEHLU', 'OJIFIHMZ',
       'OKMBFVKS', 'OXKURKXR', 'PLMZALFA', 'PYZMVUWD', 'QJQJSWFU',
       'QKXNTIIB', 'QRKRBYJL', 'QSDCUCLB', 'QVLMOEYE', 'RAWUKQMJ',
       'RCBZUSIM', 'RJCAXSGH', 'RJGPVEXX', 'ROACSLMG', 'RUVXNNVA',
       'SAAYMURU', 'SAHWCZNH', 'SDWAYPIK', 'SNHYOVBM', 'STZDBI

In [195]:
# 1) 기준 아이템 설정
REF_ITEM = "AANGBULD"   # 예: "DXY" 또는 uss_tickers[0]
# target_col = np.append(["ym",REF_ITEM],pairs[pairs["leading_item_id"] == REF_ITEM]["following_item_id"].values)
# target_col = ["ym"]+[i for i in c if c[i][0] == 0]
target_col = np.append(["ym"],sorted_corr[REF_ITEM].index.to_numpy())

uss_df = df_minmax[target_col].copy()

# # R의 structure(names = c("date_id", uss_tickers))와 동일
# uss_df.columns = ["date_id"] + list(uss_tickers)

# pivot_longer → melt
uss_long = uss_df.melt(id_vars="ym", var_name="item", value_name="value")

# 2) 기준 시계열 추출
bench = (
    uss_long.loc[uss_long["item"] == REF_ITEM, ["ym", "value"]]
            .rename(columns={"value": "bench_value"})
)

# 3) 모든 아이템에 기준값 머지
dfc = uss_long.merge(bench, on="ym", how="left")

# 4) '자기 자신' 라인과 '기준' 라인 두 벌로 쌓기
plot_df = pd.concat(
    [
        dfc.assign(series="self",  val=dfc["value"]),
        dfc.assign(series=f"benchmark: {REF_ITEM}", val=dfc["bench_value"])
    ],
    ignore_index=True
)

# 5) 그리기: 패싯은 item으로, 색은 series로
fig = px.line(
    plot_df,
    x="ym",
    y="val",
    color="series",
    facet_col="item",
    facet_col_wrap=6,
    facet_row_spacing=0.02,
    # 원하시면 색을 고정해 가독성을 높일 수 있습니다.
    color_discrete_map={
        "self": "#2962FF",                 # 각 아이템(자기 자신)
        f"benchmark: {REF_ITEM}": "#9E9E9E"  # 기준 라인(회색)
    }
)

# 패싯 라벨 정리 & 범례/레이아웃
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]),)
fig.update_traces(opacity=0.95)
# fig.update_layout(width=1800, height=1400, showlegend=True)
fig.update_layout(showlegend=True,)

# 필요하면 Y축 범위 고정(주의: 기준과 스케일이 다르면 왜곡될 수 있음)
# fig.update_yaxes(range=[0, 150])

fig.show()

In [471]:
# 1) 기준 아이템 설정
# REF_ITEM = "AHMDUILJ"   # 예: "DXY" 또는 uss_tickers[0]
target_col_sub = np.append(["ym",REF_ITEM],sub2[sub2["leading_item_id"] == REF_ITEM]["following_item_id"].values)
# target_col = df_minmax.columns
uss_df = df_minmax[target_col_sub].copy()

# # R의 structure(names = c("date_id", uss_tickers))와 동일
# uss_df.columns = ["date_id"] + list(uss_tickers)

# pivot_longer → melt
uss_long = uss_df.melt(id_vars="ym", var_name="item", value_name="value")

# 2) 기준 시계열 추출
bench = (
    uss_long.loc[uss_long["item"] == REF_ITEM, ["ym", "value"]]
            .rename(columns={"value": "bench_value"})
)

# 3) 모든 아이템에 기준값 머지
dfc = uss_long.merge(bench, on="ym", how="left")

# 4) '자기 자신' 라인과 '기준' 라인 두 벌로 쌓기
plot_df = pd.concat(
    [
        dfc.assign(series="self",  val=dfc["value"]),
        dfc.assign(series=f"benchmark: {REF_ITEM}", val=dfc["bench_value"])
    ],
    ignore_index=True
)

# 5) 그리기: 패싯은 item으로, 색은 series로
fig = px.line(
    plot_df,
    x="ym",
    y="val",
    color="series",
    facet_col="item",
    facet_col_wrap=6,
    facet_row_spacing=0.02,
    # 원하시면 색을 고정해 가독성을 높일 수 있습니다.
    color_discrete_map={
        "self": "#2962FF",                 # 각 아이템(자기 자신)
        f"benchmark: {REF_ITEM}": "#727272"  # 기준 라인(회색)
    }
)

# 패싯 라벨 정리 & 범례/레이아웃
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]),)
fig.update_traces(opacity=0.95)
# fig.update_layout(width=1800, height=1400, showlegend=True)
fig.update_layout(showlegend=True,)

# 필요하면 Y축 범위 고정(주의: 기준과 스케일이 다르면 왜곡될 수 있음)
# fig.update_yaxes(range=[0, 150])

fig.show()

## 4. 회귀 모델 학습
- 시계열 데이터 안에서 '한 달 뒤 총 무역량(value)을 맞추는 문제'로 self-supervised 학습
- 탐색된 모든 공행성쌍 (A,B)에 대해 월 t마다 학습 샘플 생성
- input X:
1) B_t (현재 총 무역량(value))
2) B_{t-1} (직전 달 총 무역량(value))
3) A_{t-lag} (lag 반영된 총 무역량(value))
4) max_corr, best_lag (관계 특성)
- target y:
1) B_{t+1} (다음 달 총 무역량(value))
- 이러한 모든 샘플을 합쳐 LinearRegression 회귀 모델을 학습

In [406]:
def build_training_data(pivot, pairs):
    """
    공행성쌍 + 시계열을 이용해 (X, y) 학습 데이터를 만드는 함수
    input X:
      - b_t, b_t_1, a_t_lag, max_corr, best_lag
    target y:
      - b_t_plus_1
    """
    months = pivot.columns.to_list()
    n_months = len(months)

    rows = []
    vaild_rows = []

    for row in pairs.itertuples(index=False):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)
        n_eff = float(row.n_eff)
        ci_low = float(row.ci_low)
        ci_high = float(row.ci_high)
        ci_diff = float(row.ci_diff)


        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        # t+1이 존재하고, t-lag >= 0인 구간만 학습에 사용
        for t in range(max(lag, 1), n_months - 1):
            b_t = b_series[t]
            b_t_1 = b_series[t - 1]
            a_t_lag = a_series[t - lag]
            b_t_plus_1 = b_series[t + 1]
            month = (t % 12)+1
            if t != (n_months-2):
              rows.append({
                  "b_t": b_t,
                  "b_t_1": b_t_1,
                  "a_t_lag": a_t_lag,
                  "max_corr": corr,
                  "best_lag": float(lag),
                  "n_eff":n_eff,
                  "ci_low":ci_low,
                  "ci_high":ci_high,
                  "ci_diff":ci_diff,
                  "month":month,
                  "target": b_t_plus_1,
                  
              })
            else:
              vaild_rows.append({
                  "b_t": b_t,
                  "b_t_1": b_t_1,
                  "a_t_lag": a_t_lag,
                  "max_corr": corr,
                  "best_lag": float(lag),
                  "n_eff":n_eff,
                  "ci_low":ci_low,
                  "ci_high":ci_high,
                  "ci_diff":ci_diff,
                  "month":month,
                  "target": b_t_plus_1,
                  
              })

    df_train = pd.DataFrame(rows)
    df_valid = pd.DataFrame(vaild_rows)
    return df_train, df_valid

# df_train_model = build_training_data(df_minmax.set_index(df_minmax["ym"]).drop("ym",axis=1).T, pairs)
# df_train_model = build_training_data(pivot, pairs)
df_train_model, df_valid_model = build_training_data(pivot, pairs)
print('생성된 학습 데이터의 shape :', df_train_model.shape)
df_train_model.head()

생성된 학습 데이터의 shape : (48850, 11)


Unnamed: 0,b_t,b_t_1,a_t_lag,max_corr,best_lag,n_eff,ci_low,ci_high,ci_diff,month,target
0,582317.0,539873.0,14276.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,6,759980.0
1,759980.0,582317.0,52347.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,7,216019.0
2,216019.0,759980.0,53549.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,8,537693.0
3,537693.0,216019.0,0.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,9,205326.0
4,205326.0,537693.0,26997.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,10,169440.0


In [407]:
df_train_model.corr()

Unnamed: 0,b_t,b_t_1,a_t_lag,max_corr,best_lag,n_eff,ci_low,ci_high,ci_diff,month,target
b_t,1.0,0.934638,0.005552,0.03268,-0.026797,-0.144977,-0.163183,0.135421,0.209858,-0.006488,0.935973
b_t_1,0.934638,1.0,0.005313,0.034563,-0.028466,-0.145893,-0.164325,0.137368,0.211986,-0.009415,0.930871
a_t_lag,0.005552,0.005313,1.0,0.062621,0.018803,-0.151545,-0.145267,0.161086,0.213532,0.001957,0.003532
max_corr,0.03268,0.034563,0.062621,1.0,-0.056819,-0.055819,0.474163,0.612618,0.053307,-0.001276,0.030476
best_lag,-0.026797,-0.028466,0.018803,-0.056819,1.0,0.035447,0.019435,-0.057995,-0.052586,0.032558,-0.026675
n_eff,-0.144977,-0.145893,-0.151545,-0.055819,0.035447,1.0,0.529292,-0.554774,-0.756826,0.001363,-0.144228
ci_low,-0.163183,-0.164325,-0.145267,0.474163,0.019435,0.529292,1.0,-0.019893,-0.7522,0.001568,-0.1624
ci_high,0.135421,0.137368,0.161086,0.612618,-0.057995,-0.554774,-0.019893,1.0,0.673768,-0.001837,0.133163
ci_diff,0.209858,0.211986,0.213532,0.053307,-0.052586,-0.756826,-0.7522,0.673768,1.0,-0.00237,0.207791
month,-0.006488,-0.009415,0.001957,-0.001276,0.032558,0.001363,0.001568,-0.001837,-0.00237,1.0,0.009048


In [408]:
df_valid_model

Unnamed: 0,b_t,b_t_1,a_t_lag,max_corr,best_lag,n_eff,ci_low,ci_high,ci_diff,month,target
0,32430.0,3543.0,25691.0,-0.459629,5.0,12.937186,-0.807078,0.124264,0.931343,6,40608.0
1,65475.0,112061.0,0.0,-0.478544,1.0,7.361921,-0.897563,0.394697,1.292260,6,41989.0
2,127236.0,190396.0,25805.0,-0.406658,4.0,42.345991,-0.631594,-0.118577,0.513016,6,539041.0
3,408887.0,361589.0,26507.0,0.404549,6.0,28.058554,0.037525,0.675402,0.637878,6,482787.0
4,5676571.0,5318257.0,25805.0,-0.420569,4.0,6.568085,-0.902583,0.529333,1.431916,6,4507669.0
...,...,...,...,...,...,...,...,...,...,...,...
1302,403841.0,521764.0,19599.0,0.436981,5.0,38.549889,0.138868,0.662477,0.523609,6,314413.0
1303,109377.0,2553.0,12817.0,-0.432459,3.0,70.463865,-0.605345,-0.220607,0.384737,6,22722.0
1304,87450.0,34669.0,18224.0,-0.489604,1.0,54.905407,-0.668256,-0.257561,0.410695,6,123581.0
1305,421272.0,489725.0,12817.0,-0.414658,3.0,28.768742,-0.679037,-0.055067,0.623971,6,561693.0


In [64]:
def build_training_data_safe(pivot, pairs, window=24):
    months = pivot.columns.to_list()
    n_months = len(months)
    rows = []
    for row in pairs.itertuples(index=False):
        leader, follower, lag = row.leading_item_id, row.following_item_id, int(row.best_lag)
        if leader not in pivot.index or follower not in pivot.index: 
            continue
        a = pivot.loc[leader].values.astype(float)
        b = pivot.loc[follower].values.astype(float)

        # t 시점 예측에서: [t-window+1, t] 만 사용해 lag/corr 등의 요약을 계산
        for t in range(max(lag, 1, window-1), n_months-1):
            past_a = a[t-window+1:t+1]
            past_b = b[t-window+1:t+1]

            # 과거 구간에서의 상관(안정화/클리핑 권장)
            # lag 적용 후 과거-내 상관
            if lag < len(past_a):
                r = np.corrcoef(past_a[:-lag], past_b[lag:])[0,1]
            else:
                r = np.nan

            rows.append({
                "b_t": b[t],
                "b_t_1": b[t-1],
                "a_t_lag": a[t-lag],
                "corr_past": np.nan_to_num(r, nan=0.0, posinf=0.0, neginf=0.0),
                "best_lag": float(lag),       # pairs에서 가져오되, 전기간 산출치면 사용 자제/고정
                "target": b[t+1],
            })
    return pd.DataFrame(rows)
# df_train_model = build_training_data_safe(pivot, pairs)
# print('생성된 학습 데이터의 shape :', df_train_model.shape)
# df_train_model.head()

In [200]:
df_train_model.columns

Index(['b_t', 'b_t_1', 'a_t_lag', 'max_corr', 'best_lag', 'n_eff', 'ci_low',
       'ci_high', 'ci_diff', 'month', 'target'],
      dtype='object')

In [409]:
# 회귀모델 학습
# feature_cols = ['b_t', 'b_t_1', 'a_t_lag', 'max_corr', 'best_lag', 'n_eff', 'ci_low', 'ci_high','ci_diff','month']

train_X = df_train_model.drop("target",axis=1)
train_y = df_train_model["target"].values
valid_X = df_valid_model.drop("target",axis=1)
valid_y = df_valid_model["target"].values

In [410]:
reg = LinearRegression()
reg.fit(train_X, train_y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [412]:
model_cat = CatBoostRegressor(n_estimators=10000, random_seed=42,learning_rate=0.005)
model_cat.fit(train_X, train_y, eval_set=(valid_X,valid_y),early_stopping_rounds=50)

0:	learn: 13288943.7897034	test: 12414657.2783873	best: 12414657.2783873 (0)	total: 14.1ms	remaining: 2m 20s
1:	learn: 13232570.1984580	test: 12367446.0520201	best: 12367446.0520201 (1)	total: 22.7ms	remaining: 1m 53s
2:	learn: 13174715.7659614	test: 12319025.7305005	best: 12319025.7305005 (2)	total: 33ms	remaining: 1m 49s
3:	learn: 13117423.2012240	test: 12276110.5356674	best: 12276110.5356674 (3)	total: 40.3ms	remaining: 1m 40s
4:	learn: 13060013.8639742	test: 12230748.6091491	best: 12230748.6091491 (4)	total: 46.1ms	remaining: 1m 32s
5:	learn: 13003121.9865948	test: 12188985.0559913	best: 12188985.0559913 (5)	total: 54.5ms	remaining: 1m 30s
6:	learn: 12947741.8889423	test: 12148478.9042905	best: 12148478.9042905 (6)	total: 62ms	remaining: 1m 28s
7:	learn: 12890870.7585558	test: 12105727.5392973	best: 12105727.5392973 (7)	total: 67.3ms	remaining: 1m 24s
8:	learn: 12835472.2603001	test: 12061277.7939029	best: 12061277.7939029 (8)	total: 72.5ms	remaining: 1m 20s
9:	learn: 12781638.0798

<catboost.core.CatBoostRegressor at 0x25e8ed84a10>

In [None]:
model = lgb.LGBMRegressor()
model.fit(train_X, train_y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1792
[LightGBM] [Info] Number of data points in the train set: 88977, number of used features: 8
[LightGBM] [Info] Start training from score 4687838.283815


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


## 5. 회귀 모델 추론 및 제출(submission) 파일 생성
- 탐색된 공행성 쌍에 대해 후행 품목(following_item_id)에 대한 2025년 8월 총 무역량(value) 예측

In [414]:
def predict(pivot, pairs, reg):
    months = pivot.columns.to_list()
    n_months = len(months)

    # 가장 마지막 두 달 index (2025-7, 2025-6)
    t_last = n_months - 1
    t_prev = n_months - 2

    preds = []

    for row in tqdm(pairs.itertuples(index=False)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)
        n_eff = float(row.n_eff)
        ci_low = float(row.ci_low)
        ci_high = float(row.ci_high)
        ci_diff = float(row.ci_diff)
        month = int(7)

        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        # t_last - lag 가 0 이상인 경우만 예측
        if t_last - lag < 0:
            continue

        b_t = b_series[t_last]
        b_t_1 = b_series[t_prev]
        a_t_lag = a_series[t_last - lag]

        X_test = np.array([[b_t, b_t_1, a_t_lag, corr, float(lag),n_eff,ci_low,ci_high,ci_diff,month]])
        # X_test = np.array([[b_t, b_t_1, a_t_lag, corr, float(lag),month]])
        y_pred = reg.predict(X_test)[0]
        # (후처리 1) 음수 예측 → 0으로 변환
        # (후처리 2) 소수점 → 정수 변환 (무역량은 정수 단위)
        y_pred = max(0.0, float(y_pred))
        y_pred = int(round(y_pred))

        preds.append({
            "leading_item_id": leader,
            "following_item_id": follower,
            "value": y_pred,
        })

    df_pred = pd.DataFrame(preds)
    return df_pred

In [47]:
def _corr_in_past_window(a_series, b_series, t_last, window, lag):
    """
    t_last 시점까지의 과거 구간[t_last-window+1, t_last]만 사용하여
    lag을 적용한 (a→b) 상관을 계산합니다.
    """
    # 윈도우 시작/끝 인덱스
    start = max(0, t_last - window + 1)
    end = t_last  # inclusive

    past_a = a_series[start:end+1]
    past_b = b_series[start:end+1]
    # lag을 적용하려면 최소 길이가 lag+2 이상이어야 유효(상관 최소 2포인트)
    if len(past_a) <= lag + 1:
        return 0.0

    x = past_a[:-lag]   # a_{t-lag}
    y = past_b[lag:]    # b_t

    # 길이 확인
    n = min(len(x), len(y))
    if n < 2:
        return 0.0

    # 상관 계산(안정화)
    try:
        r = float(np.corrcoef(x[:n], y[:n])[0, 1])
        if not np.isfinite(r):
            r = 0.0
    except Exception:
        r = 0.0
    return r


def predict_safe(pivot, pairs, reg, window=24, clip_negative_to_zero=True, round_to_int=True):
    """
    안전한(누수 방지) 피처셋으로 예측.
    - 입력 피처: b_t, b_t_1, a_t_lag, corr_past, best_lag
    - corr_past: t_last 까지의 과거 window만으로 계산
    - best_lag: pairs에서 고정 사용(단, 전기간 산출치라면 학습과 동일 규칙 유지 필수)

    반환: follower(타겟)별 예측값 집계 DataFrame
    """
    months = pivot.columns.to_list()
    n_months = len(months)

    # 예측 기준 시점(마지막 관측월의 다음 달을 예측한다고 가정)
    t_last = n_months - 1      # 마지막 관측월 인덱스
    t_prev = n_months - 2      # 그 이전 월

    preds = []

    for row in tqdm(pairs.itertuples(index=False)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)

        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        # 필요한 값이 존재하는지(인덱스 범위) 체크
        if t_last - lag < 0 or t_prev < 0:
            continue

        b_t     = b_series[t_last]
        b_t_1   = b_series[t_prev]
        a_t_lag = a_series[t_last - lag]

        # 과거 윈도우에서의 상관
        corr_past = _corr_in_past_window(a_series, b_series, t_last, window, lag)

        X_test = np.array([[b_t, b_t_1, a_t_lag, corr_past, float(lag)]])
        y_hat = float(reg.predict(X_test)[0])

        # 후처리(학습 시와 동일 규칙 유지)
        if clip_negative_to_zero:
            y_hat = max(0.0, y_hat)
        if round_to_int:
            y_hat = int(round(y_hat))

        preds.append({
            "leading_item_id": leader,
            "following_item_id": follower,
            "value": y_hat,
            "b_t": b_t,
            "b_t_1": b_t_1,
            "a_t_lag": a_t_lag,
            "corr_past": corr_past,
            "best_lag": lag,
        })

    df_pred = pd.DataFrame(preds)

    # 같은 follower(타겟)에 여러 페어가 있을 수 있으니, 집계 방식 선택:
    # 1) 최대값(보수적), 2) 평균값(부드럽게), 3) 가중평균(|corr_past| 가중치) 등
    if not df_pred.empty:
        # 예: |corr_past| 가중 평균
        df_pred["abs_w"] = df_pred["corr_past"].abs().replace(0, 1e-6)
        agg = (df_pred
               .groupby("following_item_id", as_index=False)
               .apply(lambda g: pd.Series({
                   "value_pred_weighted": int(round(np.average(g["value"].astype(float), weights=g["abs_w"]))),
                   "value_pred_mean":     int(round(g["value"].mean())),
                   "value_pred_max":      int(round(g["value"].max())),
                   "num_pairs":           int(len(g))
               }))
              )
        # 원시 행도 함께 보려면 df_pred를 그대로 반환하거나, (agg, df_pred) 튜플 반환 가능
        return agg

    return df_pred  # 빈 경우 원시 반환

In [415]:
submission = predict(pivot, pairs, model_cat)
# submission = predict(pivot, pairs, reg)
submission.head()

1307it [00:01, 700.20it/s]


Unnamed: 0,leading_item_id,following_item_id,value
0,AANGBULD,APQGTRMF,144972
1,AANGBULD,AXULOHBQ,170930
2,AANGBULD,BSRMSVTC,334741
3,AANGBULD,DEWLVASR,404046
4,AANGBULD,DNMPSKTB,5276421


In [7]:
9900*(0.2/(2-0.2))

1100.0

In [2]:
0.2432792121666667
0.3632637166666667


0.3632637166666667

In [418]:
submission.to_csv('./baseline_cat_valid_1307_f10_1120.csv', index=False)

In [8]:
sub1 = pd.read_csv("baseline_cat1120.csv")
sub2 = pd.read_csv("baseline_submit4.csv")
sub3 = pd.read_csv("baseline_submit.csv")

NameError: name 'pd' is not defined

In [266]:
c, p, n ,d = count_p(sub1,sub2)
p, n ,d

(0, 2327, -2327)

In [267]:
p+n

2327

In [268]:
c, p, n ,d = count_p(sub2,sub3)
p, n ,d

(1445, 644, 801)

In [269]:
p+n

2089

In [270]:
c, p, n ,d = count_p(sub1,sub3)
p, n ,d

(126, 1652, -1526)

In [271]:
p+n

1778

In [272]:
c, p, n ,d = count_p(sub2,submission)
p, n ,d

(2708, 263, 2445)

In [273]:
p+n

2971

In [274]:
c, p, n ,d = count_p(sub1,submission)
p, n ,d

(696, 578, 118)

In [275]:
p+n

1274

In [276]:
c, p, n ,d = count_p(sub3,submission)
p, n ,d

(1913, 269, 1644)

In [277]:
p+n

2182

In [278]:
c

{'AANGBULD': [35, 21, 14, 21, 7],
 'AHMDUILJ': [33, 4, 3, 30, 1],
 'APQGTRMF': [46, 24, 22, 24, 2],
 'ATLDMDBO': [51, 34, 32, 19, 2],
 'AXULOHBQ': [40, 11, 10, 30, 1],
 'BEZYMBBT': [43, 18, 16, 27, 2],
 'BJALXPFS': [40, 7, 7, 33, 0],
 'BLANHGYY': [38, 3, 2, 36, 1],
 'BSRMSVTC': [35, 16, 12, 23, 4],
 'BTMOEMEP': [44, 26, 22, 22, 4],
 'BUZIIBYG': [23, 4, 4, 19, 0],
 'CCLHWFWF': [33, 9, 9, 24, 0],
 'DBWLZWNK': [47, 16, 15, 32, 1],
 'DDEXPPXU': [18, 3, 2, 16, 1],
 'DEWLVASR': [46, 15, 12, 34, 3],
 'DJBLNPNC': [0, 8, 0, 0, 8],
 'DNMPSKTB': [49, 35, 33, 16, 2],
 'DUCMGGNW': [23, 2, 2, 21, 0],
 'ELQGMQWE': [45, 10, 10, 35, 0],
 'EVBVXETX': [36, 13, 12, 24, 1],
 'FCYBOAXC': [25, 7, 7, 18, 0],
 'FDXPMYGF': [27, 15, 4, 23, 11],
 'FITUEHWN': [21, 7, 6, 15, 1],
 'FQCLOEXA': [38, 18, 17, 21, 1],
 'FRHNWLNI': [35, 8, 6, 29, 2],
 'FTSVTTSR': [20, 12, 1, 19, 11],
 'FWUCPMMW': [23, 6, 4, 19, 2],
 'GKQIJYDH': [19, 4, 0, 19, 4],
 'GYHKIVQT': [53, 31, 31, 22, 0],
 'HCDTGMST': [30, 23, 13, 17, 10],
 'HXYSS