In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Chia d·ªØ li·ªáu th√†nh c√°c file ri√™ng theo cluster v√† userid
========================================================
L·ªçc 2 file CSV (grades v√† logs) theo userid t·ª´ cluster_userids.json
v√† xu·∫•t ra c√°c file ri√™ng cho m·ªói cluster v√† m·ªói user.
"""

import json
import pandas as pd
from pathlib import Path
from typing import Dict, List

class DataSplitter:
    """Split grades/logs CSV files into per-cluster and per-user CSV files.

    The cluster assignment is read from a JSON file that maps each cluster id
    to a list of userids. Rows of the input CSV files whose ``userid`` is not
    listed in that JSON file are dropped.
    """

    def __init__(self, cluster_file: str, output_dir: str = None):
        """
        Parameters:
        -----------
        cluster_file : str
            Path to the cluster JSON file (cluster id -> list of userids)
        output_dir : str
            Output directory, created if it does not exist (default: a
            ``split_by_cluster`` folder next to cluster_file)
        """
        self.cluster_file = Path(cluster_file)
        self.output_dir = Path(output_dir) if output_dir else self.cluster_file.parent / "split_by_cluster"
        self.output_dir.mkdir(exist_ok=True, parents=True)

        # Load cluster userids. The encoding is explicit so that reading does
        # not depend on the platform default (the report is written as UTF-8).
        with open(self.cluster_file, 'r', encoding='utf-8') as f:
            self.cluster_data = json.load(f)

        # Reverse mapping: userid -> cluster_id. A userid listed under several
        # clusters ends up in the cluster that is iterated last.
        self.userid_to_cluster = {}
        for cluster_id, userids in self.cluster_data.items():
            for uid in userids:
                self.userid_to_cluster[uid] = cluster_id

        # All valid userids
        self.valid_userids = list(self.userid_to_cluster)

        print(f"‚úì Loaded {len(self.valid_userids)} userids from {len(self.cluster_data)} clusters")
        for cluster_id, userids in self.cluster_data.items():
            print(f"  - {cluster_id}: {len(userids)} users")

    def _load_and_filter(self, csv_file: str, sort_by: List[str] = None):
        """Read a CSV, keep rows of known users and tag them with their cluster.

        Parameters:
        -----------
        csv_file : str
            Path to the CSV file to read
        sort_by : list of str
            Columns to sort the filtered rows by; the sort is skipped when any
            of these columns is missing from the file

        Returns the filtered DataFrame (with an added ``cluster`` column), or
        None when the file has no ``userid`` column.
        """
        df = pd.read_csv(csv_file)
        print(f"   Total rows: {len(df)}")

        # Column check
        required_cols = ['userid']
        if not all(col in df.columns for col in required_cols):
            print(f"   ‚ùå Missing required columns. Found: {df.columns.tolist()}")
            return None

        # Keep only rows that belong to a clustered user
        df_filtered = df[df['userid'].isin(self.valid_userids)].copy()
        # A header-only CSV has zero rows; report 0% instead of dividing by zero.
        kept_pct = len(df_filtered) / len(df) * 100 if len(df) else 0.0
        print(f"   Filtered rows: {len(df_filtered)} ({kept_pct:.1f}%)")

        # Add the cluster column
        df_filtered['cluster'] = df_filtered['userid'].map(self.userid_to_cluster)

        if sort_by and all(col in df_filtered.columns for col in sort_by):
            df_filtered = df_filtered.sort_values(sort_by)

        return df_filtered

    def _export_splits(self, df_filtered, prefix: str):
        """Write one CSV per cluster and one CSV per user that has rows.

        Cluster files go to ``<output_dir>/<prefix>_<cluster_id>.csv`` and user
        files to ``<output_dir>/by_user/<prefix>/``. The helper ``cluster``
        column is not written to any of them.
        """
        # 1. One combined file per cluster (written even when it has no rows)
        for cluster_id in self.cluster_data:
            cluster_df = df_filtered[df_filtered['cluster'] == cluster_id]
            output_file = self.output_dir / f"{prefix}_{cluster_id}.csv"
            cluster_df.drop('cluster', axis=1).to_csv(output_file, index=False)
            print(f"   ‚úì Saved {output_file.name} ({len(cluster_df)} rows)")

        # 2. One file per user
        user_dir = self.output_dir / "by_user" / prefix
        user_dir.mkdir(exist_ok=True, parents=True)

        # Group once instead of rescanning the whole frame for every user.
        # sort=False: group order is irrelevant here and row order inside each
        # group is kept as in df_filtered.
        rows_by_user = {uid: rows for uid, rows in df_filtered.groupby('userid', sort=False)}
        for userid in self.valid_userids:
            user_df = rows_by_user.get(userid)
            if user_df is None:
                continue  # this user has no rows in the file
            cluster_id = self.userid_to_cluster[userid]
            output_file = user_dir / f"{prefix}_user_{userid}_{cluster_id}.csv"
            user_df.drop('cluster', axis=1).to_csv(output_file, index=False)

        print(f"   ‚úì Saved individual user files to {user_dir}/")

    def _build_summary(self, df_filtered, count_label: str):
        """Return per-cluster stats: distinct users (``n_users``) and a row count.

        The row count column is named ``count_label``. It counts non-null
        ``id`` values when the file has an ``id`` column and falls back to the
        number of rows per cluster otherwise, since ``id`` is not a required
        column.
        """
        if df_filtered.empty:
            # Nothing to aggregate: return an empty, correctly labelled table.
            return pd.DataFrame(columns=['n_users', count_label],
                                index=pd.Index([], name='cluster'))

        grouped = df_filtered.groupby('cluster')
        if 'id' in df_filtered.columns:
            return grouped.agg({
                'userid': 'nunique',
                'id': 'count'
            }).rename(columns={'userid': 'n_users', 'id': count_label})

        summary = grouped.agg({'userid': 'nunique'}).rename(columns={'userid': 'n_users'})
        summary[count_label] = grouped.size()
        return summary

    def split_grades_file(self, grades_file: str):
        """
        Split the grades file (structure: id, timemodified, userid, courseid, finalgrade, itemtype)

        Writes ``grades_<cluster_id>.csv`` per cluster, per-user files under
        ``by_user/grades/`` and ``grades_summary.csv`` into the output directory.

        Parameters:
        -----------
        grades_file : str
            Path to the grades CSV file

        Returns the filtered DataFrame with an added ``cluster`` column, or
        None when the file has no ``userid`` column.
        """
        print(f"\nüìä Processing grades file: {grades_file}")

        df_filtered = self._load_and_filter(grades_file)
        if df_filtered is None:
            return None

        self._export_splits(df_filtered, "grades")

        # Summary stats
        summary = self._build_summary(df_filtered, 'n_records')
        summary_file = self.output_dir / "grades_summary.csv"
        summary.to_csv(summary_file)
        print(f"   ‚úì Saved summary to {summary_file.name}")
        print(f"\n   Summary:")
        print(summary.to_string())

        return df_filtered

    def split_logs_file(self, logs_file: str):
        """
        Split the logs file (structure: id, timecreated, eventname, action, target, userid, courseid, other)

        Writes ``logs_<cluster_id>.csv`` per cluster, per-user files under
        ``by_user/logs/`` and ``logs_summary.csv`` into the output directory.
        Rows are sorted by userid and timecreated when ``timecreated`` exists.

        Parameters:
        -----------
        logs_file : str
            Path to the logs CSV file

        Returns the filtered DataFrame with an added ``cluster`` column, or
        None when the file has no ``userid`` column.
        """
        print(f"\nüìã Processing logs file: {logs_file}")

        df_filtered = self._load_and_filter(logs_file, sort_by=['userid', 'timecreated'])
        if df_filtered is None:
            return None

        self._export_splits(df_filtered, "logs")

        # Summary stats
        summary = self._build_summary(df_filtered, 'n_events')

        # Top events (skipped when there are no rows to count)
        if 'eventname' in df_filtered.columns and not df_filtered.empty:
            top_events = df_filtered.groupby('cluster')['eventname'].value_counts().groupby(level=0).head(5)
            print(f"\n   Top 5 events per cluster:")
            print(top_events.to_string())

        summary_file = self.output_dir / "logs_summary.csv"
        summary.to_csv(summary_file)
        print(f"\n   ‚úì Saved summary to {summary_file.name}")
        print(f"\n   Summary:")
        print(summary.to_string())

        return df_filtered

    def create_report(self, grades_df=None, logs_df=None) -> Dict:
        """
        Build the overall report and save it as ``split_report.json``.

        Parameters:
        -----------
        grades_df : DataFrame, optional
            Frame with a ``cluster`` column, as returned by split_grades_file
        logs_df : DataFrame, optional
            Frame with a ``cluster`` column, as returned by split_logs_file

        Returns the report dict: ``total_users`` plus, per cluster, the user
        count and userids, and the row counts of whichever frames were given.
        """
        report = {
            'total_users': len(self.valid_userids),
            'clusters': {}
        }

        for cluster_id, userids in self.cluster_data.items():
            cluster_info = {
                'n_users': len(userids),
                'userids': userids
            }

            if grades_df is not None:
                cluster_grades = grades_df[grades_df['cluster'] == cluster_id]
                cluster_info['grades_records'] = len(cluster_grades)

            if logs_df is not None:
                cluster_logs = logs_df[logs_df['cluster'] == cluster_id]
                cluster_info['log_events'] = len(cluster_logs)

                if 'eventname' in cluster_logs.columns:
                    # int() keeps the value a plain Python int for json.dump
                    cluster_info['unique_event_types'] = int(cluster_logs['eventname'].nunique())

            report['clusters'][cluster_id] = cluster_info

        # Save report
        report_file = self.output_dir / "split_report.json"
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"\n‚úì Saved split report to {report_file}")

        return report


def main():
    """Split the grades and logs exports by cluster and print a recap.

    Input and output locations are hard-coded relative paths. An input file
    that does not exist is skipped with a warning; the report and the recap of
    the output layout are produced only if at least one file was processed.
    """
    banner = "=" * 70
    print(banner)
    print("DATA SPLITTER BY CLUSTER")
    print(banner)

    # Configuration
    cluster_path = "../data/processed/cluster_userids.json"
    grades_path = "../data/udk_moodle_grades_course_670.filtered.csv"  # change the path if needed
    logs_path = "../data/udk_moodle_log_course_670.filtered.csv"  # change the path if needed
    target_dir = "../data/processed/split_by_cluster"

    # Initialize splitter
    splitter = DataSplitter(cluster_path, target_dir)

    # Split grades file
    if not Path(grades_path).exists():
        print(f"\n‚ö†Ô∏è  Grades file not found: {grades_path}")
        grades_df = None
    else:
        grades_df = splitter.split_grades_file(grades_path)

    # Split logs file
    if not Path(logs_path).exists():
        print(f"\n‚ö†Ô∏è  Logs file not found: {logs_path}")
        logs_df = None
    else:
        logs_df = splitter.split_logs_file(logs_path)

    # Nothing could be processed: report it and stop
    if grades_df is None and logs_df is None:
        print("\n‚ùå No files processed!")
        return

    # Create report
    splitter.create_report(grades_df, logs_df)

    print("\n" + banner)
    print("‚úÖ SPLIT COMPLETE!")
    print(banner)
    print(f"\nFiles created in: {splitter.output_dir}/")
    print("\nStructure:")

    # Illustrative layout of the output directory (example names only)
    layout = (
        "  ‚îú‚îÄ‚îÄ grades_cluster_0.csv          # Grades for cluster 0",
        "  ‚îú‚îÄ‚îÄ grades_cluster_1.csv          # Grades for cluster 1",
        "  ‚îú‚îÄ‚îÄ logs_cluster_0.csv            # Logs for cluster 0",
        "  ‚îú‚îÄ‚îÄ logs_cluster_1.csv            # Logs for cluster 1",
        "  ‚îú‚îÄ‚îÄ grades_summary.csv            # Summary stats",
        "  ‚îú‚îÄ‚îÄ logs_summary.csv              # Summary stats",
        "  ‚îú‚îÄ‚îÄ split_report.json             # Complete report",
        "  ‚îî‚îÄ‚îÄ by_user/",
        "      ‚îú‚îÄ‚îÄ grades/",
        "      ‚îÇ   ‚îú‚îÄ‚îÄ grades_user_8609_cluster_0.csv",
        "      ‚îÇ   ‚îî‚îÄ‚îÄ ...",
        "      ‚îî‚îÄ‚îÄ logs/",
        "          ‚îú‚îÄ‚îÄ logs_user_8609_cluster_0.csv",
        "          ‚îî‚îÄ‚îÄ ...",
    )
    for line in layout:
        print(line)


# Run the split only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


DATA SPLITTER BY CLUSTER
‚úì Loaded 15 userids from 2 clusters
  - cluster_0: 12 users
  - cluster_1: 3 users

üìä Processing grades file: ../data/udk_moodle_grades_course_670.filtered.csv
   Total rows: 211
   Filtered rows: 211 (100.0%)
   ‚úì Saved grades_cluster_0.csv (183 rows)
   ‚úì Saved grades_cluster_1.csv (28 rows)
   ‚úì Saved individual user files to ../data/processed/split_by_cluster/by_user/grades/
   ‚úì Saved summary to grades_summary.csv

   Summary:
           n_users  n_records
cluster                      
cluster_0       12        183
cluster_1        3         28

üìã Processing logs file: ../data/udk_moodle_log_course_670.filtered.csv
   Total rows: 10833
   Filtered rows: 10833 (100.0%)
   ‚úì Saved logs_cluster_0.csv (10093 rows)
   ‚úì Saved logs_cluster_1.csv (740 rows)
   ‚úì Saved individual user files to ../data/processed/split_by_cluster/by_user/logs/

   Top 5 events per cluster:
cluster    eventname                                   
cluster_0  \core