In [1]:
import pandas as pd
import os

def calculate_timeliness_from_files(base_dir, num_phases=4):
    """
    Compute a per-(user, course) timeliness flag for every phase from separate phase files.

    For each phase ``p`` in ``1..num_phases`` the file
    ``<base_dir>/phase<p>/user_train_phase_<p>.csv`` is read. A row is flagged
    timely (1) when none of that phase's behavior columns is null, otherwise 0.
    A behavior column is any column whose name ends with ``_<p>`` or contains
    ``phase<p>``.

    Parameters
    ----------
    base_dir : str or path-like
        Root directory that contains the ``phase<p>`` sub-directories.
    num_phases : int, default 4
        Number of phases to look for.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``course_id``, ``timeliness_phase1`` ..
        ``timeliness_phase<num_phases>`` and ``timeliness_avg`` (row-wise mean
        of the phase flags, ignoring NaN). A phase whose file is missing, or
        whose file has no behavior columns, yields an all-NaN column so the
        output schema does not depend on which files happen to exist.

    Raises
    ------
    ValueError
        If a phase file lacks the ``user_id`` or ``course_id`` column.
    FileNotFoundError
        If none of the expected phase files exists.
    """
    key_cols = ['user_id', 'course_id']
    timeliness_dfs = []

    for phase in range(1, num_phases + 1):
        file_path = os.path.join(base_dir, f'phase{phase}', f'user_train_phase_{phase}.csv')

        if not os.path.exists(file_path):
            print(f"[!] Không tìm thấy file {file_path}. Bỏ qua Phase {phase}.")
            continue

        df = pd.read_csv(file_path)

        # Both keys are needed below (column selection and merge), so validate
        # both up front instead of failing later with a bare KeyError.
        for key in key_cols:
            if key not in df.columns:
                raise ValueError(f"[!] File {file_path} không chứa cột '{key}'.")

        # Behavior columns of this phase: suffix _<phase> or containing 'phase<phase>'.
        # NOTE(review): the substring test also matches e.g. 'phase10' when
        # phase == 1; this cannot happen with the default num_phases=4.
        behavior_cols = [col for col in df.columns if col.endswith(f"_{phase}") or f"phase{phase}" in col]

        flag_col = f'timeliness_phase{phase}'
        # Work on an explicit copy so adding the flag never writes into a slice of df.
        timeliness_df = df[key_cols].copy()

        if not behavior_cols:
            print(f"[!] Không tìm thấy cột hành vi trong {file_path}.")
            # Float NaN (not pd.NA) keeps the column numeric for the mean below.
            timeliness_df[flag_col] = float('nan')
        else:
            timeliness_df[flag_col] = df[behavior_cols].notna().all(axis=1).astype(int)

        timeliness_dfs.append(timeliness_df)

    if not timeliness_dfs:
        raise FileNotFoundError(f"[!] Không tìm thấy file phase nào trong {base_dir}.")

    # Outer-merge all loaded phases on (user_id, course_id); pairs absent from a
    # phase get NaN for that phase.
    df_final = timeliness_dfs[0]
    for phase_df in timeliness_dfs[1:]:
        df_final = df_final.merge(phase_df, on=key_cols, how='outer')

    # Phases that were skipped have no column yet: add them as all-NaN so the
    # row mean below (and callers indexing by phase column) do not hit a KeyError.
    phase_cols = [f'timeliness_phase{p}' for p in range(1, num_phases + 1)]
    for col in phase_cols:
        if col not in df_final.columns:
            df_final[col] = float('nan')
    df_final = df_final[key_cols + phase_cols].copy()

    # Row-wise average over the phases that actually have a value.
    df_final['timeliness_avg'] = df_final[phase_cols].mean(axis=1, skipna=True)

    return df_final


In [2]:
# Build the per-phase timeliness table from the Kaggle input directory
final_timeliness_df = calculate_timeliness_from_files(
    base_dir='/kaggle/input/final-data',
    num_phases=4,
)

# Preview the first five rows
print(final_timeliness_df.head(5))

# Dataset-wide mean of the per-row average timeliness
print("Timeliness trung bình toàn dataset:", final_timeliness_df['timeliness_avg'].mean())


     user_id  course_id  timeliness_phase1  timeliness_phase2  \
0    U_10000  C_2033958                  1                1.0   
1  U_1000979   C_947149                  1                1.0   
2  U_1000982   C_947149                  1                1.0   
3  U_1001176   C_947149                  1                1.0   
4  U_1001413   C_735164                  1                1.0   

   timeliness_phase3  timeliness_phase4  timeliness_avg  
0                1.0                1.0             1.0  
1                1.0                1.0             1.0  
2                1.0                NaN             1.0  
3                1.0                1.0             1.0  
4                NaN                NaN             1.0  
Timeliness trung bình toàn dataset: 1.0


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [3]:
# Show the merged timeliness table as this cell's output (one row per user_id/course_id pair).
final_timeliness_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,user_id,course_id,timeliness_phase1,timeliness_phase2,timeliness_phase3,timeliness_phase4,timeliness_avg
0,U_10000,C_2033958,1,1.0,1.0,1.0,1.0
1,U_1000979,C_947149,1,1.0,1.0,1.0,1.0
2,U_1000982,C_947149,1,1.0,1.0,,1.0
3,U_1001176,C_947149,1,1.0,1.0,1.0,1.0
4,U_1001413,C_735164,1,1.0,,,1.0
...,...,...,...,...,...,...,...
108117,U_99746,C_674971,1,1.0,1.0,1.0,1.0
108118,U_997506,C_2095102,1,1.0,1.0,1.0,1.0
108119,U_99753,C_1428968,1,1.0,1.0,1.0,1.0
108120,U_997542,C_2066096,1,1.0,1.0,1.0,1.0


In [4]:
# Names of the per-phase timeliness columns (phases 1 through 4)
phase_cols = [f'timeliness_phase{p}' for p in range(1, 5)]

# Column-wise mean for each phase (NaN entries are skipped)
timeliness_means = final_timeliness_df.loc[:, phase_cols].mean()

print("Trung bình timeliness theo từng phase:")
timeliness_means

Trung bình timeliness theo từng phase:


timeliness_phase1    1.0
timeliness_phase2    1.0
timeliness_phase3    1.0
timeliness_phase4    1.0
dtype: float64