In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
import datetime

# Load dữ liệu (như code trước)
tbtl_data = pd.read_excel("/kaggle/input/wecode-public-it001/data-tbtl/tbtl-public.ods", engine="odf")
th_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/th-public.csv")
qt_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/qt-public.csv")
anno_data = pd.read_csv("/kaggle/input/wecode-public-it001/data-tbtl/annonimized.csv")
ck_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/ck-public.csv")

anno_data.columns = ['assignment_id','problem_id','username','is_final','status','pre_score','coefficient','language_id','created_at','updated_at','judgement']

In [17]:
def time_diff(df):
    current_year = 2024
    
    # Xử lý 'created_at'
    try:
        df['created_at'] = pd.to_datetime(df['created_at'].astype(str) + '-' + str(current_year), format='%m-%d %H:%M:%S-%Y')
    except (ValueError, TypeError):
        df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

    # Xử lý 'updated_at' (đảm bảo không bị bỏ qua)
    try:
        df['updated_at'] = pd.to_datetime(df['updated_at'].astype(str) + '-' + str(current_year), format='%m-%d %H:%M:%S-%Y')
    except (ValueError, TypeError):
        df['updated_at'] = pd.to_datetime(df['updated_at'], errors='coerce')

    # Tính 'time_diff' giữa 'updated_at' và 'created_at'
    df['time_diff'] = (df['updated_at'] - df['created_at']).dt.total_seconds().abs()
    return df
    
def one_hot_encode(tasks, all_tasks):
    return [1 if task in tasks else 0 for task in all_tasks]
    
def tasks_to_embedding(tasks, embedding_matrix):
        vectors = [embedding_matrix[task - 1] for task in tasks]  # Trừ 1 để phù hợp index
        return np.mean(vectors, axis=0)  # Trung bình các vector

In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def clean_and_aggregate_data(anno_data):
    cols_to_drop = ['language_id', 'created_at', 'updated_at', 'judgement', "is_final", "status"]
    df = anno_data.drop(columns=cols_to_drop)

    le = LabelEncoder()
    df['assignment_id'] = le.fit_transform(df['assignment_id'])

    df["submission_count"] = df.groupby(["assignment_id", "username"])["assignment_id"].transform('size')

    df = df.groupby(["username", "assignment_id"]).agg(
        average_pre_score=("pre_score", "mean"),
        average_coefficient=("coefficient", "mean"),
        submission_count=("submission_count", "max")
    ).reset_index()
    df["average_pre_score"] = df["average_pre_score"]
    df["average_coefficient"] = df["average_coefficient"]
    df = df.groupby('username').agg({
        'assignment_id': lambda x: list(x),
        'average_pre_score': 'mean',
        'average_coefficient': 'mean',
        'submission_count': 'sum'
    }).reset_index()
    df2 = anno_data
    df2 = time_diff(df2)
    
    # Tính số lần submission
    df2["submission_count"] = df2.groupby(["assignment_id", "problem_id", "username"]).cumcount() + 1
    df2['total_time'] = df2.groupby(["assignment_id", "problem_id", "username"])['time_diff'].transform('sum')
    
    # Lọc ra 1 dòng cuối cho mỗi nhóm
    df2 = df2.groupby(["assignment_id", "problem_id", "username"]).tail(1).reset_index(drop=True)
    df2 = df2.groupby("username").agg(
        total_problems=("problem_id", "count"),
        total_time=("total_time", "sum"),
    ).reset_index()
    
    df_merged = df.merge(df2[['username', 'total_problems',"total_time"]], on='username', how='left')

    #thêm đặc trưng
    df_merged["solved_ratio"] = (df_merged["total_problems"] / df_merged["submission_count"])
    df_merged['submission_efficiency'] = df_merged['submission_count'] / df_merged['total_problems']
    df_merged['adjusted_pre_score'] = df_merged['average_pre_score'] * df_merged['average_coefficient']
    df_merged['log_submission_count'] = np.log1p(df_merged['submission_count'])
    df_merged['log_total_problems'] = np.log1p(df_merged['total_problems'])
    df_merged['poly_4_average_pre_score'] = df_merged['average_pre_score'] ** 4
    
    #onehot_encode
    assignment_id_uniques = set(task for tasks in df_merged['assignment_id'] for task in tasks)
    assignment_id_uniques = sorted(list(assignment_id_uniques))

        
    df_merged['assignment_id'] = df_merged['assignment_id'].apply(lambda x: one_hot_encode(x, assignment_id_uniques))

    # tạo embedding
    num_tasks = len(anno_data['assignment_id'].unique())
    embedding_dim = 16
    embedding_matrix = np.random.rand(num_tasks, embedding_dim)
    
    df_merged['embedding'] = df_merged['assignment_id'].apply(lambda x: tasks_to_embedding(x, embedding_matrix))

    #thêm đặc trưng
    df_merged['embedding_mean'] = df_merged['embedding'].apply(lambda x: np.mean(x))
    df_merged['embedding_std'] = df_merged['embedding'].apply(lambda x: np.std(x))
    df_merged['embedding_max'] = df_merged['embedding'].apply(lambda x: np.max(x))
    df_merged['embedding_min'] = df_merged['embedding'].apply(lambda x: np.min(x))
    return df_merged

df = clean_and_aggregate_data(anno_data)
df.head()

Unnamed: 0,username,assignment_id,average_pre_score,average_coefficient,submission_count,total_problems,total_time,solved_ratio,submission_efficiency,adjusted_pre_score,log_submission_count,log_total_problems,poly_4_average_pre_score,embedding,embedding_mean,embedding_std,embedding_max,embedding_min
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",5838.193956,100.0,147,47,1204728.0,0.319728,3.12766,583819.395604,4.997212,3.871201,1161754000000000.0,"[0.4477206981670808, 0.08610641438785292, 0.09...",0.45944,0.279265,0.891446,0.085695
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5473.090254,100.0,259,78,65265092.0,0.301158,3.320513,547309.025449,5.560682,4.369448,897285100000000.0,"[0.44460607762358695, 0.08848190472553041, 0.1...",0.460596,0.276794,0.891464,0.088482
2,01122b3ef7e59b84189e65985305f575d6bdf83c,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6953.009206,100.0,195,68,23636384.0,0.348718,2.867647,695300.920552,5.278115,4.234107,2337175000000000.0,"[0.4477206981670808, 0.08610641438785299, 0.09...",0.45944,0.279265,0.891446,0.085695
3,0134f9f410c65ad0e8c2254a7e9288670e02a183,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6952.644118,100.0,100,47,151409.0,0.47,2.12766,695264.411765,4.615121,3.871201,2336684000000000.0,"[0.4523926289823216, 0.08254317888133689, 0.08...",0.457707,0.283016,0.89142,0.075689
4,013de369c439ab0ead8aa7da64423aa395a8be39,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",6454.25954,87.5,107,66,19692.0,0.616822,1.621212,564747.709741,4.682131,4.204693,1735344000000000.0,"[0.4461633878953339, 0.08729415955669169, 0.09...",0.460018,0.278027,0.891455,0.087294


In [19]:
tbtl_data.head()

Unnamed: 0,username,TBTL
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,7.24
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,8.11
2,01122b3ef7e59b84189e65985305f575d6bdf83c,7.3
3,0134f9f410c65ad0e8c2254a7e9288670e02a183,8.63
4,013de369c439ab0ead8aa7da64423aa395a8be39,8.2


In [20]:
tbtl_data['TBTL'] = pd.to_numeric(tbtl_data['TBTL'], errors='coerce')
tbtl_data = tbtl_data.dropna(subset=['TBTL'])
print(tbtl_data['TBTL'].isna().sum()) 

0


In [21]:
print(f"Total_student = {df.shape[0]}")

anno_student = set(df["username"])
print (f"length anno_student = {len(anno_student)}")

tbtl_student = set(tbtl_data["username"])
print (f"length tbtl_student = {len(tbtl_student)}")

predict_student = anno_student - tbtl_student
print (f"length predict_student = {len(predict_student)}")

Total_student = 1489
length anno_student = 1489
length tbtl_student = 799
length predict_student = 690


In [22]:
predict_df = df[df["username"].isin(predict_student)]

print(f"predict_student = {predict_df.shape[0]}")

predict_df.head()

predict_student = 690


Unnamed: 0,username,assignment_id,average_pre_score,average_coefficient,submission_count,total_problems,total_time,solved_ratio,submission_efficiency,adjusted_pre_score,log_submission_count,log_total_problems,poly_4_average_pre_score,embedding,embedding_mean,embedding_std,embedding_max,embedding_min
394,410357eb9129023509cfaf8d38be61c050bb3b05,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5542.025641,100.0,43,16,12714.0,0.372093,2.6875,554202.564103,3.78419,2.833213,943352800000000.0,"[0.45550724952581545, 0.08016768854365947, 0.0...",0.456551,0.285544,0.891403,0.069019
625,67212308d026508fd5b6942ffbbdd7b0be2e89de,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,100.0,8,2,48.0,0.25,4.0,0.0,2.197225,1.098612,0.0,"[0.4570645597975624, 0.07897994337482074, 0.07...",0.455973,0.286817,0.891394,0.065683
801,84a17972cc6d29489bbe205a9e7feb8745726fbc,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7742.667691,100.0,182,110,1460062.0,0.604396,1.654545,774266.769073,5.209486,4.70953,3593871000000000.0,"[0.44927800843882776, 0.08491866921901427, 0.0...",0.458862,0.28051,0.891438,0.08236
802,84b6b2d70924066c8345f2bc2281791ae3188da2,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",6787.26861,100.0,160,82,62797979.0,0.5125,1.95122,678726.860957,5.081404,4.418841,2122170000000000.0,"[0.4477206981670807, 0.08610641438785302, 0.09...",0.45944,0.279265,0.891446,0.085695
803,851d9a4b9b8e236f2d62282ddf06fae57b7d9492,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5277.509215,100.0,389,97,11657734.0,0.249357,4.010309,527750.921516,5.966147,4.584967,775739700000000.0,"[0.4399341468083461, 0.09204514023204652, 0.11...",0.462329,0.273133,0.891489,0.092045


In [23]:
df = pd.merge(df, tbtl_data, left_on='username',right_on='username', how='inner')

In [24]:
df

Unnamed: 0,username,assignment_id,average_pre_score,average_coefficient,submission_count,total_problems,total_time,solved_ratio,submission_efficiency,adjusted_pre_score,log_submission_count,log_total_problems,poly_4_average_pre_score,embedding,embedding_mean,embedding_std,embedding_max,embedding_min,TBTL
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",5838.193956,100.0,147,47,1204728.0,0.319728,3.127660,583819.395604,4.997212,3.871201,1.161754e+15,"[0.4477206981670808, 0.08610641438785292, 0.09...",0.459440,0.279265,0.891446,0.085695,7.24
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5473.090254,100.0,259,78,65265092.0,0.301158,3.320513,547309.025449,5.560682,4.369448,8.972851e+14,"[0.44460607762358695, 0.08848190472553041, 0.1...",0.460596,0.276794,0.891464,0.088482,8.11
2,01122b3ef7e59b84189e65985305f575d6bdf83c,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6953.009206,100.0,195,68,23636384.0,0.348718,2.867647,695300.920552,5.278115,4.234107,2.337175e+15,"[0.4477206981670808, 0.08610641438785299, 0.09...",0.459440,0.279265,0.891446,0.085695,7.30
3,0134f9f410c65ad0e8c2254a7e9288670e02a183,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6952.644118,100.0,100,47,151409.0,0.470000,2.127660,695264.411765,4.615121,3.871201,2.336684e+15,"[0.4523926289823216, 0.08254317888133689, 0.08...",0.457707,0.283016,0.891420,0.075689,8.63
4,013de369c439ab0ead8aa7da64423aa395a8be39,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",6454.259540,87.5,107,66,19692.0,0.616822,1.621212,564747.709741,4.682131,4.204693,1.735344e+15,"[0.4461633878953339, 0.08729415955669169, 0.09...",0.460018,0.278027,0.891455,0.087294,8.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,840d07858c03f80f4695056e2cc7d0c474b83a25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5549.727439,100.0,111,45,2979960.0,0.405405,2.466667,554972.743863,4.718499,3.828641,9.486076e+14,"[0.44927800843882776, 0.08491866921901432, 0.0...",0.458862,0.280510,0.891438,0.082360,7.67
795,844f5db2e7e31ae51eba025480679ed7e4708ac6,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...",6062.129658,100.0,357,104,72252973.0,0.291317,3.432692,606212.965778,5.880533,4.653960,1.350520e+15,"[0.43214759544961145, 0.09798386607624, 0.1342...",0.465218,0.267157,0.891533,0.097984,7.54
796,845acd04a77b3d1b623f255d9f9f8eae90892dab,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5426.576744,100.0,155,53,7109127.0,0.341935,2.924528,542657.674394,5.049856,3.988984,8.671691e+14,"[0.4461633878953339, 0.08729415955669174, 0.09...",0.460018,0.278027,0.891455,0.087294,7.49
797,8460eaaf887a6289fb156f7562fb739ba8e9629e,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4589.870756,100.0,557,115,35837359.0,0.206463,4.843478,458987.075609,6.324359,4.753590,4.438148e+14,"[0.4399341468083462, 0.0920451402320465, 0.114...",0.462329,0.273133,0.891489,0.092045,8.94


In [25]:
columns_features =['average_pre_score', 'average_coefficient', 'solved_ratio','submission_efficiency',
                   'submission_count','total_problems',
                   'log_submission_count','log_total_problems',
                   'poly_4_average_pre_score']

features = df[columns_features]

X = features
y = df['TBTL']
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train[1]

array([-3.43301181,  0.2157189 ,  1.55956468, -1.09683204, -1.40286684,
       -1.77627595, -2.50332064, -2.33607087, -1.01601586])

# Train bình thường😉

In [27]:
# Xây dựng mô hình mạng học sâu
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu')) 
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
# Biên dịch mô hình
model.compile(optimizer=Adam(learning_rate=0.005), loss='mse')

# Huấn luyện mô hình
history = model.fit(X_train, y_train, epochs=80, batch_size=16, validation_data=(X_test, y_test), verbose=False)

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test)

# Tính R^2
r2 = r2_score(y_test, y_pred)
print(f'R^2: {r2}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
R^2: 0.06271900525324325


In [28]:
predict_features = predict_df[columns_features]
X_predict = predict_features

X_predict_scaled = scaler.fit_transform(X_predict)
y_real_pred = model.predict(X_predict_scaled)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [29]:
# Assign the rounded values to the predicted_scores column
predict_df["predicted_score"] = np.round(y_real_pred, 3)

predict_df.head()

Unnamed: 0,username,assignment_id,average_pre_score,average_coefficient,submission_count,total_problems,total_time,solved_ratio,submission_efficiency,adjusted_pre_score,log_submission_count,log_total_problems,poly_4_average_pre_score,embedding,embedding_mean,embedding_std,embedding_max,embedding_min,predicted_score
394,410357eb9129023509cfaf8d38be61c050bb3b05,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5542.025641,100.0,43,16,12714.0,0.372093,2.6875,554202.564103,3.78419,2.833213,943352800000000.0,"[0.45550724952581545, 0.08016768854365947, 0.0...",0.456551,0.285544,0.891403,0.069019,7.475
625,67212308d026508fd5b6942ffbbdd7b0be2e89de,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,100.0,8,2,48.0,0.25,4.0,0.0,2.197225,1.098612,0.0,"[0.4570645597975624, 0.07897994337482074, 0.07...",0.455973,0.286817,0.891394,0.065683,7.682
801,84a17972cc6d29489bbe205a9e7feb8745726fbc,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7742.667691,100.0,182,110,1460062.0,0.604396,1.654545,774266.769073,5.209486,4.70953,3593871000000000.0,"[0.44927800843882776, 0.08491866921901427, 0.0...",0.458862,0.28051,0.891438,0.08236,7.736
802,84b6b2d70924066c8345f2bc2281791ae3188da2,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",6787.26861,100.0,160,82,62797979.0,0.5125,1.95122,678726.860957,5.081404,4.418841,2122170000000000.0,"[0.4477206981670807, 0.08610641438785302, 0.09...",0.45944,0.279265,0.891446,0.085695,7.636
803,851d9a4b9b8e236f2d62282ddf06fae57b7d9492,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5277.509215,100.0,389,97,11657734.0,0.249357,4.010309,527750.921516,5.966147,4.584967,775739700000000.0,"[0.4399341468083461, 0.09204514023204652, 0.11...",0.462329,0.273133,0.891489,0.092045,8.016


In [30]:
output_df = predict_df[["username", "predicted_score"]]

output_df.to_csv("output_th_new.csv", index=False, header=False)