In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast
import numpy as np
from scipy.stats import entropy
from datetime import datetime

## Consitency

In [2]:
def validate_range(row, phases):
    try:
        # 1. Kiểm tra định dạng user_id, course_id
        if not isinstance(row['user_id'], str) or not row['user_id'].startswith("U_"):
            return 0, 'user_id không hợp lệ'
        if not isinstance(row['course_id'], str) or not row['course_id'].startswith("C_"):
            return 0, 'course_id không hợp lệ'

        # 2. Trường học: string, 3-100 ký tự
        if not isinstance(row['school'], str) or not (3 <= len(row['school']) <= 100):
            return 0, 'school không hợp lệ'

        # 3. các năm: phải nằm trong khoảng năm 2019-2021
        if not (2019 <= row['end_year'] <= 2021) or not (2019 <= row['start_year'] <= 2021) or not (2019 <= row['user_year'] <= 2021):
            return 0, 'năm nằm ngoài khoảng 2019-2021'

        # 4. user_past_course_count: 0-1000
        if not (0 <= row['user_past_course_count'] <= 1000):
            return 0, 'user_past_course_count ngoài khoảng 0-1000'

        # 5. user_time_since_last_course: 0-1095
        if not (0 <= row['user_time_since_last_course'] <= 1095):
            return 0, 'user_time_since_last_course ngoài khoảng 0-1095'

        # 6. video_count: >=1
        if not (1 <= row['video_count']):
            return 0, 'video_count ít hơn 1'

        # 7. exercise_count: >= 1
        if not (1 <= row['exercise_count']):
            return 0, 'exercise_count ít hơn 1'

        # 8. chapter_count: >= 1
        if not (1 <= row['chapter_count']):
            return 0, 'chapter_count ít hơn 1'

        # 9. encoded_field_sum không âm
        if row['encoded_field_sum'] < 0:
            return 0, 'encoded_field_sum có giá trị âm'

        # 10. certificate: giá trị nhị phân 0 hoặc 1
        if row['certificate'] not in [0, 1]:
            return 0, 'certificate không hợp lệ (phải là 0 hoặc 1)'

        # 11. num_prerequisites: 0-50
        if not (0 <= row['num_prerequisites'] <= 50):
            return 0, 'num_prerequisites ngoài khoảng 0-50'

        # 12. assignment, video, exam: float 0-100
        for score_col in ['assignment', 'video', 'exam']:
            if not (0.0 <= row[score_col] <= 100.0):
                return 0, f'{score_col} ngoài khoảng 0-100'

        # 13. start_month, end_month, user_month: tháng hợp lệ 1-12
        for date_col in ['start_month', 'end_month', 'user_month']:
            if not (1 <= row[date_col] <= 12):
                return 0, f'{date_col} có tháng nằm ngoài 1-12'

        # 14. duration_days: 7 - 365
        if not (7 <= row['duration_days'] <= 365):
            return 0, 'duration_days ngoài khoảng 7-365'

        # 15. remaining_time không âm
        if row['remaining_time'] < 0:
            return 0, 'remaining_time có giá trị âm'

        # 16. Các chỉ số theo từng phase
        for phase in range(1, phases + 1):
            # Comment entropy (nếu có dùng)
            if not (0.0 <= row[f'entropy_time_comment_phase{phase}'] <= 1.0):
                return 0, f'entropy_time_comment_phase{phase} ngoài khoảng'

            # Exercise
            exercise_cols = [
                f'exercise_id_count_{phase}',
                f'exercise_correct_sum_{phase}',
                f'exercise_correct_mean_{phase}',
                f'exercise_num_problem_sum_{phase}',
                f'exercise_num_problem_mean_{phase}',
                f'exercise_attempts_sum_mean_{phase}',
                f'exercise_attempts_mean_mean_{phase}',
                f'exercise_date_from_enroll_min_{phase}',
                f'exercise_date_from_enroll_mean_{phase}',
                f'exercise_date_from_enroll_max_{phase}',
                f'exercise_context_sum_{phase}',
                f'exercise_context_mean_{phase}',
                f'exercise_diff_sum_{phase}',
                f'exercise_diff_mean_{phase}',
                f'exercise_diff_min_{phase}',
                f'exercise_diff_max_{phase}',
            ]
            for col in exercise_cols:
                if row[col] < 0:
                    return 0, f'{col} có giá trị âm'

            if not (0.0 <= row[f'exercise_langugage_binary_mean_{phase}'] <= 1.0):
                return 0, f'exercise_langugage_binary_mean_{phase} ngoài khoảng'

            perc_cols = [
                f'exercise_perc_goal_score_mean_{phase}',
                f'exercise_perc_real_completed_mean_{phase}',
                f'exercise_perc_real_completed_std_{phase}',
                f'exercise_perc_real_correct_mean_{phase}',
                f'exercise_perc_real_correct_std_{phase}',
                f'exercise_perc_real_score_sum_{phase}',
                f'exercise_perc_real_score_mean_{phase}',
                f'exercise_perc_real_score_std_{phase}',
            ]
            for col in perc_cols:
                if row[col] < 0 or row[col] > 100:
                    return 0, f'{col} ngoài khoảng 0-100'

            # Entropy
            if not (0.0 <= row[f'exercise_hour_entropy_{phase}'] <= 1.0):
                return 0, f'exercise_hour_entropy_{phase} ngoài khoảng 0-1'

            # Video features
            video_features = [
                f'video_watch_count_{phase}',
                f'video_watched_percentage_{phase}',
                f'video_percentage_watch_time_{phase}',
                f'video_pause_count_{phase}',
                f'video_pause_avg_{phase}',
                f'video_pause_std_{phase}',
                f'video_rewatch_avg_{phase}',
                f'video_rewatch_std_{phase}',
                f'video_time_between_views_avg_{phase}',
                f'video_time_between_views_std_{phase}',
                f'video_speed_avg_{phase}',
                f'entropy_time_{phase}',
            ]
            for col in video_features:
                if row[col] < 0:
                    return 0, f'{col} có giá trị âm'
                if "percentage" in col or "watched" in col or "entropy" in col:
                    if row[col] > 100:
                        return 0, f'{col} vượt quá 100%'

            # Sentiment features
            for col in [f'total_words_phase{phase}', f'total_positive{phase}', f'total_negative{phase}', f'total_neutral{phase}']:
                if row[col] < 0:
                    return 0, f'{col} có giá trị âm'

        return 1, 'Hợp lệ'

    except Exception as e:
        return 0, f'Lỗi xử lý: {str(e)}'


In [3]:
def is_not_null_pandas(row):
    return not row.isnull().any()

In [4]:
import numpy as np

def validate_data_type(row, phases=1):
    try:
        if not isinstance(row['user_id'], str): return 0, 'user_id không phải str'
        if not isinstance(row['course_id'], str): return 0, 'course_id không phải str'
        if not isinstance(row['school'], str): return 0, 'school không phải str'

        int_fields = [
            'user_past_course_count', 'user_time_since_last_course',
            'video_count', 'exercise_count', 'chapter_count',
            'num_prerequisites', 'duration_days',
            'start_year', 'end_year', 'user_year',
            'start_month', 'end_month', 'user_month',
            'certificate'
        ]
        for field in int_fields:
            if not isinstance(row[field], (int, float, np.integer, np.floating)):
                return 0, f'{field} không phải int/float'

        for field in ['assignment', 'video', 'exam', 'remaining_time']:
            if not isinstance(row[field], (int, float, np.integer, np.floating)):
                return 0, f'{field} không phải float/int'

        for phase in range(1, phases + 1):
            int_phase_fields = [
                f'exercise_id_count_{phase}',
                f'exercise_correct_sum_{phase}'
            ]
            for field in int_phase_fields:
                if not isinstance(row[field], (int, float, np.integer, np.floating)):
                    return 0, f'{field} không phải int/float'

            float_phase_fields = [
                f'exercise_correct_mean_{phase}',
                f'exercise_num_problem_sum_{phase}',
                f'exercise_num_problem_mean_{phase}',
                f'exercise_attempts_sum_mean_{phase}',
                f'exercise_attempts_mean_mean_{phase}',
                f'exercise_date_from_enroll_min_{phase}',
                f'exercise_date_from_enroll_mean_{phase}',
                f'exercise_date_from_enroll_max_{phase}',
                f'exercise_context_sum_{phase}',
                f'exercise_context_mean_{phase}',
                f'exercise_diff_sum_{phase}',
                f'exercise_diff_mean_{phase}',
                f'exercise_diff_min_{phase}',
                f'exercise_diff_max_{phase}',
                f'exercise_hour_entropy_{phase}',
                f'exercise_langugage_binary_mean_{phase}',
                f'exercise_perc_goal_score_mean_{phase}',
                f'exercise_perc_real_completed_mean_{phase}',
                f'exercise_perc_real_completed_std_{phase}',
                f'exercise_perc_real_correct_mean_{phase}',
                f'exercise_perc_real_correct_std_{phase}',
                f'exercise_perc_real_score_sum_{phase}',
                f'exercise_perc_real_score_mean_{phase}',
                f'exercise_perc_real_score_std_{phase}',
                f'entropy_time_comment_phase{phase}',
                f'entropy_time_{phase}',
                f'video_watch_count_{phase}',
                f'video_watched_percentage_{phase}',
                f'video_percentage_watch_time_{phase}',
                f'video_pause_count_{phase}',
                f'video_pause_avg_{phase}',
                f'video_pause_std_{phase}',
                f'video_rewatch_avg_{phase}',
                f'video_rewatch_std_{phase}',
                f'video_time_between_views_avg_{phase}',
                f'video_time_between_views_std_{phase}',
                f'video_speed_avg_{phase}',
                f'total_words_phase{phase}',
                f'total_positive{phase}',
                f'total_negative{phase}',
                f'total_neutral{phase}'
            ]
            for field in float_phase_fields:
                if not isinstance(row[field], (int, float, np.integer, np.floating)):
                    return 0, f'{field} không phải float/int'

        return 1, 'Hợp lệ'
    except Exception as e:
        return 0, f'Lỗi exception: {e}'


In [5]:
def check_valid_logic(row, max_phase=2):
    # 1. Tổng điểm thành phần phải bằng 100
    total_score = row['assignment'] + row['video'] + row['exam'] 
    if total_score != 100.0:
        return False, f"Tổng điểm không bằng 100: {total_score}"

    # 2. Thời gian còn lại phải nhỏ hơn hoặc bằng thời lượng khóa học
    if row['remaining_time'] > row['duration_days']:
        return False, f"Thời gian còn lại ({row['remaining_time']}) > thời lượng khóa học ({row['duration_days']})"

    # 3. Logic theo phase
    for phase in range(1, max_phase + 1):
        # 3.1 Exercise logic
        id_count = row[f'exercise_id_count_{phase}']
        correct_sum = row[f'exercise_correct_sum_{phase}']
        if id_count > 0:
            if row[f'exercise_perc_real_correct_mean_{phase}'] <= 0:
                return False, f"Phase {phase}: Có làm bài tập nhưng % đúng trung bình = 0"
        else:
            if correct_sum != 0:
                return False, f"Không làm bài tập phase {phase} nhưng có số câu đúng"

        # 3.2 Video logic
        watch_count = row.get(f'video_watch_count_{phase}', 0)
        if watch_count > 0:
            if row.get(f'video_watched_percentage_{phase}', 0) <= 0:
                return False, f"Phase {phase}: Có video xem nhưng phần trăm xem = 0"
        if row.get(f'video_watched_percentage_{phase}', 0) > 100:
            return False, f"Phase {phase}: Phần trăm video xem > 100%"

        # 3.3 Video pause logic
        pause_count = row.get(f'video_pause_count_{phase}', 0)
        if pause_count > 0:
            if row.get(f'video_pause_avg_{phase}', 0) <= 0:
                return False, f"Phase {phase}: Có pause nhưng avg pause = 0"

        # 3.5 Emotion / comment logic
        total_words = row.get(f'total_words_phase{phase}', 0)
        if total_words > 0:
            pos = row.get(f'total_positive{phase}', 0)
            neg = row.get(f'total_negative{phase}', 0)
            neu = row.get(f'total_neutral{phase}', 0)
            if (pos + neg + neu) == 0:
                return False, f"Phase {phase}: Có bình luận nhưng không có cảm xúc nào được ghi nhận"

    return True, "Hợp lệ"


In [6]:
def is_row_duplicated(row, df):
    # So sánh từng dòng trong df với dòng row (theo toàn bộ cột)
    mask = (df == row).all(axis=1)
    duplicate_count = mask.sum()
    return duplicate_count > 1  # True nếu bị lặp lại ít nhất 1 lần khác

In [7]:
user_info = pd.read_json("/kaggle/input/lightmooccubex/entities/user.json", lines = True)
user_info 

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,name,gender,school,year_of_birth,course_order,enroll_time
0,U_22,我,0.0,,2015.0,"[682129, 2294668]","[2019-10-12 10:28:02, 2020-11-21 14:03:28]"
1,U_24,王帅国,1.0,清华大学,6558.0,"[597214, 605512, 597211, 597314, 597208, 62950...","[2019-05-20 16:06:48, 2019-05-24 19:34:43, 201..."
2,U_25,王帅国,0.0,清华大学,,[1903985],[2020-08-07 18:59:13]
3,U_53,于歆杰,1.0,清华大学,1973.0,"[696679, 1704639, 943255, 1729417, 682164, 177...","[2020-03-01 21:24:30, 2020-03-12 16:17:02, 202..."
4,U_54,马昱春,2.0,清华大学,,"[682442, 682164, 1748240, 1778890, 1829031, 17...","[2019-10-09 02:17:49, 2019-11-08 00:49:03, 202..."
...,...,...,...,...,...,...,...
3330289,U_34712108,,0.0,,,"[697791, 782490, 799796]","[2020-10-12 03:39:14, 2020-10-12 03:41:00, 202..."
3330290,U_34712111,,2.0,,,[2199449],[2020-10-12 03:46:08]
3330291,U_34712112,,1.0,,,[916828],[2020-10-12 03:51:36]
3330292,U_34712114,,1.0,,,[735405],[2020-10-12 03:57:05]


In [8]:
course_info = pd.read_json("/kaggle/input/lightmooccubex/entities/course.json", lines = True)
course_info

Unnamed: 0,id,name,field,prerequisites,about,resource
0,C_584313,《资治通鉴》导读,"[历史学, 中国语言文学]",,通过老师导读，同学们可深入这一经典文本内部，得以纵览千年历史，提升国学素养，体味人生智慧。,"[{'titles': ['第一课 导论与三家分晋', '导论', '导论'], 'reso..."
1,C_584329,微积分——极限理论与一元函数,"[应用经济学, 数学, 物理学, 理论经济学]",,本课程是理工科的一门数学基础课，系统、全面地介绍了一元函数微积分学。课程既保持了数学的严谨和...,"[{'titles': ['序言', '序言', '序言'], 'resource_id':..."
2,C_584381,新闻摄影,"[艺术学, 新闻传播学]",,掌握基本的摄影技能，了解图片新闻的工作方式，训练对生活的观察和热爱，发展对图像的审美和批评能...,"[{'titles': ['第一章 绪论', '第一讲 引言1', '引言1'], 'res..."
3,C_597208,数据挖掘：理论与算法,[计算机科学与技术],,最有趣的理论+最有用的算法=不得不学的数据科学。,"[{'titles': ['走进数据科学：博大精深，美不胜收', '整装待发', 'Vide..."
4,C_597225,大学计算机,[],,大学计算机课程将以计算思维为导向，以计算机原理、概念为基础，以新技术新方法为牵引，以创新思维...,"[{'titles': ['第1周： 基于计算机的问题求解', '课程介绍', '开篇'],..."
...,...,...,...,...,...,...
3776,C_2338076,（疾风计划）软件工程,[],,疾风计划2021【首期限额招募】进行中\n名校名师丨专属助教丨专业认证丨系统全面丨修炼内功丨...,"[{'titles': ['第1章 初识软件工程 ', '1.1 软件无处不在 ', '讲课..."
3777,C_2341259,（疾风计划）面向对象程序设计（C++）,[],,疾风计划2021【首期限额招募】进行中\n名校名师丨专属助教丨专业认证丨系统全面丨修炼内功丨...,"[{'titles': ['第一讲 课程简介与编程环境', '1.0 课程定位、教学内容',..."
3778,C_2337996,（疾风计划）数据结构(下),[],,疾风计划2021【首期限额招募】进行中\n名校名师丨专属助教丨专业认证丨系统全面丨修炼内功丨...,"[{'titles': ['第零章 ', '选课之前', '宣传片'], 'resourc..."
3779,C_1945689,机器学习训练营,[],,清华张敏老师带你12周掌握机器学习！8大经典算法 +7大实训 + 直播点评 + 博士助教+云...,"[{'titles': ['序-开营仪式及学习课件', None, '1.1_初识机器学习'..."


In [9]:
def valid_foreign_key(row, valid_user_ids, valid_course_ids):
    user_id = str(row['user_id']).strip()
    course_id = str(row['course_id']).strip()
    return user_id in valid_user_ids and course_id in valid_course_ids

# Chuyển thành list để dùng `in`
valid_user_ids = user_info["id"].astype(str).str.strip().tolist()
valid_course_ids = course_info["id"].astype(str).str.strip().tolist()

In [10]:
data = pd.read_csv("/kaggle/input/final-data/phase4/user_train_phase_4.csv")
data

Unnamed: 0,user_id,school,course_id,encoded_field_sum,start_year,start_month,end_year,end_month,user_year,user_month,...,video_time_between_views_std_4,video_speed_avg_4,entropy_time_4,total_words_phase4,total_positive4,total_negative4,total_neutral4,total_score,label,label_encoded
0,U_10000,,C_2033958,100,2020.0,9.0,2020.0,12.0,2020.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219780,E,0
1,U_1000979,云南大学,C_947149,66,2019.0,12.0,2020.0,4.0,2020.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.000000,D,1
2,U_1001176,云南大学,C_947149,66,2019.0,12.0,2020.0,4.0,2020.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.772894,D,1
3,U_1001563,昆明理工大学,C_735164,31,2020.0,9.0,2020.0,12.0,2020.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.795455,D,1
4,U_1001625,昆明理工大学,C_735164,31,2020.0,9.0,2020.0,12.0,2020.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.795455,D,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83778,U_99746,河南工学院,C_674971,49,2020.0,2.0,2020.0,7.0,2020.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.858063,E,0
83779,U_997506,哈尔滨工业大学,C_2095102,105,2020.0,9.0,2020.0,12.0,2020.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.888889,E,0
83780,U_99753,安徽财经大学,C_1428968,80,2020.0,9.0,2020.0,12.0,2020.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73.800000,B,3
83781,U_997542,成都信息工程大学,C_2066096,104,2020.0,9.0,2020.0,12.0,2020.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.666667,C,2


In [11]:
data = data[:40000]

In [12]:
row0 = data.iloc[3]
row0

user_id              U_1001563
school                  昆明理工大学
course_id             C_735164
encoded_field_sum           31
start_year              2020.0
                       ...    
total_negative4            0.0
total_neutral4             0.0
total_score          57.795455
label                        D
label_encoded                1
Name: 3, Length: 197, dtype: object

In [13]:
print("Kiểm tra 1 dòng (dòng 0):")
print("Không null:", is_not_null_pandas(row0))
print("Kiểu dữ liệu hợp lệ:", validate_data_type(row0, 4))
print("Range hợp lệ:", validate_range(row0, 4))
print("Logic hợp lệ:", check_valid_logic(row0, 4))
print("Khóa ngoại hợp lệ:", valid_foreign_key(row0, valid_user_ids, valid_course_ids))
print("Bị lặp không:", is_row_duplicated(row0, data))

Kiểm tra 1 dòng (dòng 0):
Không null: True
Kiểu dữ liệu hợp lệ: (1, 'Hợp lệ')
Range hợp lệ: (1, 'Hợp lệ')
Logic hợp lệ: (True, 'Hợp lệ')
Khóa ngoại hợp lệ: True
Bị lặp không: False


In [14]:
import pandas as pd
from tqdm import tqdm

def extract_result(val):
    """Hàm hỗ trợ để trích giá trị đầu tiên nếu là tuple, hoặc giữ nguyên nếu là bool/int"""
    if isinstance(val, tuple):
        return val[0] == 1
    return bool(val)

def check_all_criteria(row, data, valid_user_ids, valid_course_ids, phases=1):
    try:
        criteria_results = {
            "Không null": extract_result(is_not_null_pandas(row)),
            "Kiểu dữ liệu hợp lệ": extract_result(validate_data_type(row, phases)),
            "Range hợp lệ": extract_result(validate_range(row, phases)),
            "Logic hợp lệ": extract_result(check_valid_logic(row, phases)),
            "Khóa ngoại hợp lệ": extract_result(valid_foreign_key(row, valid_user_ids, valid_course_ids)),
            "Bị lặp không": not extract_result(is_row_duplicated(row, data))
        }
        percent_passed = round(sum(criteria_results.values()) / len(criteria_results) * 100, 2)
    except Exception as e:
        print(f"Lỗi tại dòng index {row.name}: {e}")
        criteria_results = {key: False for key in [
            "Không null", "Kiểu dữ liệu hợp lệ", "Range hợp lệ",
            "Logic hợp lệ", "Khóa ngoại hợp lệ", "Bị lặp không"
        ]}
        percent_passed = 0.0

    return percent_passed, criteria_results
# Hàm tổng hợp kết quả kiểm tra toàn bộ dataset
def evaluate_dataset(data, valid_user_ids, valid_course_ids, phases=1, limit=None):
    percent_list = []
    details_list = []

    if limit:
        data = data.head(limit).copy()

    for _, row in tqdm(data.iterrows(), total=len(data), desc="Đánh giá dòng"):
        percent, details = check_all_criteria(row, data, valid_user_ids, valid_course_ids, phases)
        percent_list.append(percent)
        details_list.append(details)

    criteria_df = pd.DataFrame(details_list)
    criteria_df['percent_passed'] = percent_list

    return criteria_df

# ---- Gọi hàm chính để đánh giá ----
criteria_df = evaluate_dataset(data, valid_user_ids, valid_course_ids, phases=2)

# Kết quả thống kê
avg_percent = round(criteria_df['percent_passed'].mean(), 4)
print(f"\n✅ Trung bình % tiêu chí thỏa mãn: {avg_percent}%")

print("\n📊 Tỷ lệ từng tiêu chí thỏa mãn (%):")
# Chỉ chọn các cột bool/int để tính mean
bool_columns = criteria_df.select_dtypes(include=['bool', 'int', 'float']).drop(columns='percent_passed')
print((bool_columns.mean() * 100).round(2))

print("\n📈 Thống kê:")
print(criteria_df.describe())


Đánh giá dòng: 100%|██████████| 40000/40000 [4:20:57<00:00,  2.55it/s]



✅ Trung bình % tiêu chí thỏa mãn: 75.7168%

📊 Tỷ lệ từng tiêu chí thỏa mãn (%):
Không null              58.73
Kiểu dữ liệu hợp lệ     58.73
Range hợp lệ            38.44
Logic hợp lệ            98.40
Khóa ngoại hợp lệ      100.00
Bị lặp không           100.00
dtype: float64

📈 Thống kê:
       percent_passed
count    40000.000000
mean        75.716794
std         22.742599
min         33.330000
25%         50.000000
50%         83.330000
75%        100.000000
max        100.000000
