# Future Information Leakage 검사

이 노트북은 train_ratings.csv에서 user가 클릭한 시각이 item의 개봉년도보다 이전인 경우를 확인합니다.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Style settings.
# Apply the seaborn style BEFORE any font override: seaborn's style parameters
# include font.family / font.sans-serif, so calling set_style() afterwards
# would silently replace the font chosen below.
sns.set_style('whitegrid')

# Font settings (optional).
# NOTE(review): DejaVu Sans does not appear to ship Hangul glyphs; if Korean
# labels are plotted, a Korean-capable font is probably needed here - confirm.
plt.rcParams['font.family'] = 'DejaVu Sans'
# Render the minus sign with an ASCII hyphen instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False

# Default figure size (width, height) in inches.
plt.rcParams['figure.figsize'] = (12, 6)

## train data 파일 검사
### 1. 데이터 로드

In [2]:
# Input file locations
ratings_path = '~/data/train/train_ratings.csv'
years_path = '~/data/train/years.tsv'

# Load the data: the ratings file is comma-separated, the years file is
# tab-separated.
print("Loading data...")
years_df = pd.read_csv(years_path, delimiter='\t')
ratings_df = pd.read_csv(ratings_path)

# Report the size of both tables
ratings_shape = ratings_df.shape
years_shape = years_df.shape
print(f"\nRatings data shape: {ratings_shape}")
print(f"Years data shape: {years_shape}")

# Preview the first rows of the ratings table (rendered as the cell output)
print("\nRatings data:")
ratings_df.head(5)

Loading data...

Ratings data shape: (5154471, 3)
Years data shape: (6799, 2)

Ratings data:


Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [3]:
# Preview the first rows of the item -> release-year table
# (the trailing expression is rendered as the cell output).
print("Years data:")
years_df.head(5)

Years data:


Unnamed: 0,item,year
0,1348,1922
1,44587,1922
2,4768,1922
3,8235,1923
4,8609,1923


### 2. 데이터 전처리

In [4]:
# Convert the epoch-seconds `time` column to datetimes and derive the
# calendar year of each click.
click_datetimes = pd.to_datetime(ratings_df['time'], unit='s')
ratings_df['datetime'] = click_datetimes
ratings_df['click_year'] = click_datetimes.dt.year

# Compare the span of click years with the span of release years
first_click_year = ratings_df['click_year'].min()
last_click_year = ratings_df['click_year'].max()
first_release_year = years_df['year'].min()
last_release_year = years_df['year'].max()
print(f"Click year range: {first_click_year} ~ {last_click_year}")
print(f"Item release year range: {first_release_year} ~ {last_release_year}")

# Example rows (rendered as the cell output)
preview_columns = ['user', 'item', 'time', 'datetime', 'click_year']
ratings_df.loc[:, preview_columns].head(10)

Click year range: 2005 ~ 2015
Item release year range: 1922 ~ 2014


Unnamed: 0,user,item,time,datetime,click_year
0,11,4643,1230782529,2009-01-01 04:02:09,2009
1,11,170,1230782534,2009-01-01 04:02:14,2009
2,11,531,1230782539,2009-01-01 04:02:19,2009
3,11,616,1230782542,2009-01-01 04:02:22,2009
4,11,2140,1230782563,2009-01-01 04:02:43,2009
5,11,2722,1230782583,2009-01-01 04:03:03,2009
6,11,2313,1230782646,2009-01-01 04:04:06,2009
7,11,2688,1230782656,2009-01-01 04:04:16,2009
8,11,2428,1230782694,2009-01-01 04:04:54,2009
9,11,3113,1230782719,2009-01-01 04:05:19,2009


In [5]:
# Attach each item's release year to its interactions. A left join keeps
# every rating; items absent from years_df end up with a missing `year`.
merged_df = pd.merge(ratings_df, years_df, how='left', on='item')

# Count how many interactions have no release year
missing_year_count = merged_df['year'].isna().sum()
missing_year_pct = missing_year_count / len(merged_df) * 100

print(f"\nMerged data shape: {merged_df.shape}")
print(f"Missing year values: {missing_year_count} ({missing_year_pct:.2f}%)")

# First rows of the merged table (rendered as the cell output)
merged_df.head(10)


Merged data shape: (5154471, 6)
Missing year values: 1832 (0.04%)


Unnamed: 0,user,item,time,datetime,click_year,year
0,11,4643,1230782529,2009-01-01 04:02:09,2009,2001.0
1,11,170,1230782534,2009-01-01 04:02:14,2009,1995.0
2,11,531,1230782539,2009-01-01 04:02:19,2009,1993.0
3,11,616,1230782542,2009-01-01 04:02:22,2009,1970.0
4,11,2140,1230782563,2009-01-01 04:02:43,2009,1982.0
5,11,2722,1230782583,2009-01-01 04:03:03,2009,1999.0
6,11,2313,1230782646,2009-01-01 04:04:06,2009,1980.0
7,11,2688,1230782656,2009-01-01 04:04:16,2009,1999.0
8,11,2428,1230782694,2009-01-01 04:04:54,2009,1998.0
9,11,3113,1230782719,2009-01-01 04:05:19,2009,1999.0


### 3. Future Information Leakage 검사

In [6]:
# Keep only the interactions whose item has a release year
has_release_year = merged_df['year'].notna()
valid_df = merged_df.loc[has_release_year].copy()
print(f"Valid data (with year info): {len(valid_df):,} rows")

# Future information leakage check: the click year is earlier than the
# item's release year.
valid_df['is_leakage'] = valid_df['click_year'].lt(valid_df['year'])

# Summary statistics
total_valid = len(valid_df)
total_leakage = valid_df['is_leakage'].sum()
leakage_ratio = total_leakage / total_valid * 100

separator = '=' * 60
print(f"\n{separator}")
print("FUTURE INFORMATION LEAKAGE SUMMARY")
print(f"{separator}")
print(f"Total interactions (with year info): {total_valid:,}")
print(f"Leakage cases: {total_leakage:,}")
print(f"Leakage ratio: {leakage_ratio:.4f}%")
print(f"{separator}")

Valid data (with year info): 5,152,639 rows

FUTURE INFORMATION LEAKAGE SUMMARY
Total interactions (with year info): 5,152,639
Leakage cases: 3
Leakage ratio: 0.0001%


In [7]:
# Inspect a sample of the leakage rows
print("\nSample leakage cases:")
sample_columns = ['user', 'item', 'click_year', 'year', 'datetime']
leakage_samples = (
    valid_df.loc[valid_df['is_leakage'], sample_columns]
    .head(20)
    # How many years before the release the click happened
    .assign(year_diff=lambda rows: rows['year'] - rows['click_year'])
)
# Largest gaps first (rendered as the cell output)
leakage_samples.sort_values(by='year_diff', ascending=False)


Sample leakage cases:


Unnamed: 0,user,item,click_year,year,datetime,year_diff
3064009,81663,91535,2011,2012.0,2011-12-07 02:00:07,1.0
4617327,123609,89745,2011,2012.0,2011-09-21 22:47:54,1.0
4646140,124413,99007,2012,2013.0,2012-12-18 00:50:16,1.0


### 4. 최종 요약

In [16]:
# Final text report for the train-data leakage analysis.
# NOTE(review): user_leakage, user_leakage_filtered, year_leakage,
# release_year_leakage_filtered and leakage_df are not created by this cell;
# they must already exist in the session before it runs.
banner = "=" * 80

print("\n" + banner)
print("FINAL SUMMARY: FUTURE INFORMATION LEAKAGE ANALYSIS")
print(banner)

# 1. Overall counts
n_interactions = len(ratings_df)
n_with_year = len(valid_df)
print("\n1. Overall Statistics:")
print(f"   - Total interactions: {n_interactions:,}")
print(f"   - Interactions with year info: {n_with_year:,} ({n_with_year/n_interactions*100:.2f}%)")
print(f"   - Leakage cases: {total_leakage:,}")
print(f"   - Leakage ratio: {leakage_ratio:.4f}%")

# 2. Per-user view
n_users = len(user_leakage)
n_users_with_leakage = len(user_leakage_filtered)
per_user_leakage = user_leakage_filtered['leakage_count']
print("\n2. User-wise Analysis:")
print(f"   - Total users: {n_users:,}")
print(f"   - Users with leakage: {n_users_with_leakage:,} ({n_users_with_leakage/n_users*100:.2f}%)")
print(f"   - Avg leakage per user (with leakage): {per_user_leakage.mean():.2f}")
print(f"   - Max leakage per user: {per_user_leakage.max()}")

# 3. Per-click-year view
click_years = valid_df['click_year']
click_years_with_leakage = year_leakage.loc[
    year_leakage['leakage_count'] > 0, 'click_year'
].tolist()
print("\n3. Click Year Analysis:")
print(f"   - Click year range: {click_years.min()} ~ {click_years.max()}")
print(f"   - Years with leakage: {click_years_with_leakage}")

# 4. Per-release-year view: the five release years with the most leakage rows
top_release_years = release_year_leakage_filtered.sort_values(
    'leakage_count', ascending=False
).head(5)
print("\n4. Release Year Analysis:")
print(f"   - Release years with leakage: {len(release_year_leakage_filtered)}")
print("   - Top 5 release years:")
for _, row in top_release_years.iterrows():
    print(f"     {int(row['release_year'])}: {int(row['leakage_count']):,} cases ({row['leakage_ratio']:.2f}%)")

# 5. Distribution of the year_diff column of the leakage rows
year_gaps = leakage_df['year_diff']
print("\n5. Year Difference Analysis:")
print(f"   - Mean difference: {year_gaps.mean():.2f} years")
print(f"   - Median difference: {year_gaps.median():.2f} years")
print(f"   - Max difference: {year_gaps.max()} years")

print("\n" + banner)


FINAL SUMMARY: FUTURE INFORMATION LEAKAGE ANALYSIS

1. Overall Statistics:
   - Total interactions: 5,154,471
   - Interactions with year info: 5,152,639 (99.96%)
   - Leakage cases: 3
   - Leakage ratio: 0.0001%

2. User-wise Analysis:
   - Total users: 31,360
   - Users with leakage: 3 (0.01%)
   - Avg leakage per user (with leakage): 1.00
   - Max leakage per user: 1

3. Click Year Analysis:
   - Click year range: 2005 ~ 2015
   - Years with leakage: [2011, 2012]

4. Release Year Analysis:
   - Release years with leakage: 2
   - Top 5 release years:
     2012: 2 cases (0.00%)
     2013: 1 cases (0.00%)

5. Year Difference Analysis:
   - Mean difference: 1.00 years
   - Median difference: 1.00 years
   - Max difference: 1.0 years



## 기존에 생성한 submission 파일 검사

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Apply the seaborn style first: seaborn's style parameters include
# font.family / font.sans-serif, so calling set_style() after the font
# override would reset the font and break Korean glyph rendering.
sns.set_style('darkgrid')

# Korean-capable font, and an ASCII hyphen for the minus sign.
plt.rc('font', family='NanumGothic')
plt.rc('axes', unicode_minus=False)


### 1. 데이터 로드

In [12]:
# File paths.
# Submission files to check. The commented-out entries look like alternative
# runs kept for reference; uncomment one to include it in the check.
submission_paths = [
    #"~/juik/lightning/saved/hydra_logs/bert4rec/2025-12-24/11-47-42/submissions/bert4rec_predictions_10.csv",
    # "~/juik/lightning/saved/hydra_logs/multi-vae/2025-12-24/11-12-23/submissions/multi_vae_predictions_10.csv",
    # "~/juik/lightning/saved/hydra_logs/multi-vae/2025-12-23/10-32-13/submissions/multi_vae_predictions_10.csv",
    "~/juik/lightning/saved/zz/output_1671.csv",
    "~/juik/lightning/saved/zz/output_ease_1600.csv",
]
ratings_path = "~/data/train/train_ratings.csv"
years_path = "~/data/train/years.tsv"

# Load the data: one DataFrame per submission file, plus the ratings log
# and the tab-separated item -> release-year table.
print("Loading data...")
submission_dfs = [pd.read_csv(path) for path in submission_paths]
ratings_df = pd.read_csv(ratings_path)
years_df = pd.read_csv(years_path, sep="\t")

print(f"\nSubmisstion data shape: {[df.shape for df in submission_dfs]}")
print(f"Rating data shape: {ratings_df.shape}")
print(f"Years data shape: {years_df.shape}")

# Preview every submission. A plain loop is used because printing is a side
# effect; a list comprehension would only build a throwaway list of None.
print("\nSubmission data:")
for df in submission_dfs:
    print(df.head())

print("\nRating data:")
print(ratings_df.head())

print("\nYear data:")
print(years_df.head())

Loading data...

Submisstion data shape: [(313600, 2), (313600, 2)]
Rating data shape: (5154471, 3)
Years data shape: (6799, 2)

Submission data:
   user   item
0    11  50068
1    11   8950
2    11   4370
3    11   4886
4    11  40815
   user   item
0    11   4370
1    11   4886
2    11  40815
3    11     47
4    11  32587

Rating data:
   user  item        time
0    11  4643  1230782529
1    11   170  1230782534
2    11   531  1230782539
3    11   616  1230782542
4    11  2140  1230782563

Year data:
    item  year
0   1348  1922
1  44587  1922
2   4768  1922
3   8235  1923
4   8609  1923


### 2. 데이터 전처리
* submission에 release_year 추가
* rating에 click_year 추가
* user별 last_click_year dictionary 생성
* submission에 last_click_year 추가

In [13]:
# Attach each recommended item's release year to every submission.
# A left join keeps all recommendations; unknown items get a missing `year`.
merged_sub_dfs = [
    pd.merge(sub, years_df, how="left", on="item") for sub in submission_dfs
]

for df in merged_sub_dfs:
    missing_year_count = df["year"].isna().sum()
    print(f"\nMerged submission data shape: {df.shape}")
    print(
        f"Missing year values: {missing_year_count} ({missing_year_count / len(df) * 100:.2f}%)"
    )
    print(df.head())

# Convert the epoch-seconds rating timestamps to datetimes and derive the
# calendar year of each click.
click_datetimes = pd.to_datetime(ratings_df["time"], unit="s")
ratings_df["datetime"] = click_datetimes
ratings_df["click_year"] = click_datetimes.dt.year

first_click_year = ratings_df["click_year"].min()
final_click_year = ratings_df["click_year"].max()
print(f"Click year range: {first_click_year} ~ {final_click_year}")
print(f"Item release year range: {years_df['year'].min()} ~ {years_df['year'].max()}")

# Example rows
print("Ratins data:")
print(ratings_df.head())

# Latest click year per user, as a one-column frame indexed by user
last_click_year_df = (
    ratings_df.groupby("user")["click_year"]
    .max()
    .rename("last_click_year")
    .to_frame()
)
print("last_click_year data:")
print(last_click_year_df.head())

# Add last_click_year to each submission by matching its `user` column
# against the index of last_click_year_df.
merged_sub_dfs = [
    sub.merge(last_click_year_df, how="left", left_on="user", right_index=True)
    for sub in merged_sub_dfs
]
for df in merged_sub_dfs:
    missing_last_click_count = df["last_click_year"].isna().sum()
    print(f"\nMerged submission data shape: {df.shape}")
    print(
        f"Missing last_click_year values: {missing_last_click_count} ({missing_last_click_count / len(df) * 100:.2f}%)"
    )
    print(df.head())


Merged submission data shape: (313600, 3)
Missing year values: 98 (0.03%)
   user   item    year
0    11  50068  2006.0
1    11   8950  2004.0
2    11   4370  2001.0
3    11   4886  2001.0
4    11  40815  2005.0

Merged submission data shape: (313600, 3)
Missing year values: 14 (0.00%)
   user   item    year
0    11   4370  2001.0
1    11   4886  2001.0
2    11  40815  2005.0
3    11     47  1995.0
4    11  32587  2005.0
Click year range: 2005 ~ 2015
Item release year range: 1922 ~ 2014
Ratins data:
   user  item        time            datetime  click_year
0    11  4643  1230782529 2009-01-01 04:02:09        2009
1    11   170  1230782534 2009-01-01 04:02:14        2009
2    11   531  1230782539 2009-01-01 04:02:19        2009
3    11   616  1230782542 2009-01-01 04:02:22        2009
4    11  2140  1230782563 2009-01-01 04:02:43        2009
last_click_year data:
      last_click_year
user                 
11               2011
14               2008
18               2010
25            

### 3. Future Information Leakage 검사
* leakage 여부 추가하기

In [14]:
# Flag leakage: a recommended item counts as leakage when its release year is
# later than the year of the user's last recorded click.
# Rows with a missing `year` compare as False, so they are never flagged.
for df in merged_sub_dfs:
    df['leakage'] = df['year'] > df['last_click_year']
    # Count once and reuse for both the absolute number and the percentage.
    leakage_count = df['leakage'].sum()
    # The original message ended in "%))" - an unbalanced closing parenthesis.
    print(f"Leakage 발생 건수: {leakage_count}({leakage_count/len(df) * 100:.2f}%)")
    # Show at most the first 100 flagged recommendations.
    print(df[df['leakage']].head(100))

Leakage 발생 건수: 360(0.11%))
        user   item    year  last_click_year  leakage
2146     915  54286  2007.0             2006     True
3149    1370  58559  2008.0             2007     True
3505    1539  79132  2010.0             2009     True
4630    2008  86882  2011.0             2010     True
4941    2128  88125  2011.0             2010     True
...      ...    ...     ...              ...      ...
82488  36098  69844  2009.0             2008     True
83311  36442  59315  2008.0             2007     True
83391  36482  89745  2012.0             2011     True
83560  36554  88125  2011.0             2010     True
84265  36835  79132  2010.0             2009     True

[100 rows x 5 columns]
Leakage 발생 건수: 224(0.07%))
         user   item    year  last_click_year  leakage
109        61  58559  2008.0             2007     True
389       182  89745  2012.0             2011     True
2144      915  54286  2007.0             2006     True
3148     1370  58559  2008.0             2007     True