In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten
from tensorflow.keras.optimizers import Adam
import datetime

# Load dữ liệu (như code trước)
tbtl_data = pd.read_excel("/kaggle/input/wecode-public-it001/data-tbtl/tbtl-public.ods", engine="odf")
th_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/th-public.csv")
qt_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/qt-public.csv")
anno_data = pd.read_csv("/kaggle/input/wecode-public-it001/data-tbtl/annonimized.csv")
ck_data = pd.read_csv("/kaggle/input/wecode-public-it001/public_it001/ck-public.csv")

anno_data.columns = ['assignment_id','problem_id','username','is_final','status','pre_score','coefficient','language_id','created_at','updated_at','judgement']

In [2]:
print(anno_data.shape[0])
anno_data.head(3)

295198


Unnamed: 0,assignment_id,problem_id,username,is_final,status,pre_score,coefficient,language_id,created_at,updated_at,judgement
0,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,0,SCORE,0,100,it0012,10/9/2024 8:02,10/9/2024 8:06,"{""times"":[0,0,0,0,0,0,0,0,0,0],""mems"":[0,0,0,0..."
1,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,0,SCORE,0,100,it0012,10/9/2024 8:04,10/9/2024 8:04,"{""times"":[0,0,0,0,0,0,0,0,0,0],""mems"":[0,0,0,0..."
2,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,1,SCORE,10000,100,it0012,10/9/2024 8:06,10/9/2024 8:06,"{""times"":[0,0,0,0,0,0,0,0,0,0],""mems"":[0,0,0,0..."


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def clean_and_aggregate_data(anno_data):
    cols_to_drop = ['language_id', 'updated_at', 'judgement', "is_final", "status"]
    df = anno_data.drop(columns=cols_to_drop)
    
    df["submission_count"] = df.groupby(["assignment_id", "problem_id", "username"]).cumcount() + 1
    
    df = df.groupby(["assignment_id", "problem_id", "username"]).tail(1).reset_index(drop=True)
    
    df["created_at"] = pd.to_datetime(df["created_at"])
    df["submission_day"] = df["created_at"].dt.date

    total_days = df.groupby("username")["submission_day"].nunique().reset_index()
    total_days.rename(columns={"submission_day": "total_days"}, inplace=True)

    df = df.groupby("username").agg(
        total_problems=("problem_id", "count"),         
        total_submission_count=("submission_count", "sum"),
        average_pre_score=("pre_score", "mean"),
        average_coefficient=("coefficient", "mean")).reset_index()
    df["submission_per_problem"] = (df["total_problems"] / df["total_submission_count"])

    df = pd.merge(df, total_days, on="username", how="left")
    return df

df = clean_and_aggregate_data(anno_data)
df.head(5)


Unnamed: 0,username,total_problems,total_submission_count,average_pre_score,average_coefficient,submission_per_problem,total_days
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,47,147,10000.0,100.0,0.319728,12
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,78,259,9535.24359,100.0,0.301158,17
2,01122b3ef7e59b84189e65985305f575d6bdf83c,68,195,9201.529412,100.0,0.348718,21
3,0134f9f410c65ad0e8c2254a7e9288670e02a183,47,100,10000.0,100.0,0.47,11
4,013de369c439ab0ead8aa7da64423aa395a8be39,66,107,9017.651515,96.969697,0.616822,8


In [4]:
th_data['TH'] = pd.to_numeric(th_data['TH'], errors='coerce')
th_data = th_data.dropna(subset=['TH'])
print(th_data['TH'].isna().sum()) 

0


In [5]:
print(f"Total_student = {df.shape[0]}")

anno_student = set(df["username"])
print (f"length anno_student = {len(anno_student)}")

tbtl_student = set(tbtl_data["username"])
print (f"length tbtl_student = {len(tbtl_student)}")

predict_student = anno_student - tbtl_student
print (f"length predict_student = {len(predict_student)}")

Total_student = 1489
length anno_student = 1489
length tbtl_student = 799
length predict_student = 690


In [6]:
predict_df = df[df["username"].isin(predict_student)]

print(f"predict_student = {predict_df.shape[0]}")

predict_df.head()

predict_student = 690


Unnamed: 0,username,total_problems,total_submission_count,average_pre_score,average_coefficient,submission_per_problem,total_days
394,410357eb9129023509cfaf8d38be61c050bb3b05,16,43,9375.0,100.0,0.372093,5
625,67212308d026508fd5b6942ffbbdd7b0be2e89de,2,8,0.0,100.0,0.25,1
801,84a17972cc6d29489bbe205a9e7feb8745726fbc,110,182,10000.0,100.0,0.604396,31
802,84b6b2d70924066c8345f2bc2281791ae3188da2,82,160,9594.317073,100.0,0.5125,30
803,851d9a4b9b8e236f2d62282ddf06fae57b7d9492,97,389,9632.216495,100.0,0.249357,27


In [7]:
df = pd.merge(df, th_data, left_on='username',right_on='hash', how='inner')

Poly 7 average pre score 

poly 2 is the best

In [8]:
df['poly_2_average_pre_score'] = df['average_pre_score'] ** 2
df['poly_3_average_pre_score'] = df['average_pre_score'] ** 3

log total submission = > gud result

no poly submission count

In [9]:
import matplotlib.pyplot as plt

df['log_submission_count'] = np.log(df['total_submission_count'] + 1)

log total problem => gud

In [10]:
import numpy as np 

df['log_total_problems'] = np.log(df['total_problems'] + 1)

df['poly_2_total_problems'] = df['total_problems'] **2
df['poly_3_total_problems'] = df['total_problems'] **3

cosistency = total_days * total submission => it is non linearity should try log = > okay quite gud feature

variety = consistency * total problems => bad 

submission_per_problem vs th, normal one => too spread out

=> try log

In [11]:
df['log_submission_per_problem'] = np.log1p(df['submission_per_problem'])
df['square_submission_per_problem'] = np.sqrt(df['submission_per_problem'])

check average_coefficient => damn like shiet, try out log

In [12]:
df['log_average_coefficient'] = np.log1p(df['average_coefficient'])

df['sqrt_average_coefficient'] = np.sqrt(df['average_coefficient'])

lay ngay nhan voi diem

In [13]:
df['score_x_days'] = df['average_pre_score'] * df['total_days']

df['log_score_x_days'] = np.log1p(df['score_x_days'])

df['poly_2_score_x_days'] = df['score_x_days'] **2

df['square_score_x_days'] = np.sqrt(df['average_pre_score'] * df['total_days'])

sub * day * prob = > bullshiet

prob x scores => oke

In [14]:
df['scores_probs'] = df['average_pre_score'] * df['total_problems']

df['log_scores_probs'] = np.log1p(df['scores_probs'])

df['poly_2_scores_probs'] = df['scores_probs'] **2

df['poly_3_scores_probs'] = df['scores_probs'] **3

log yany

In [15]:
df['poly_2_average_coefficient'] = df['average_coefficient'] ** 2

In [16]:
df['poly_2_submission_per_problem'] = df['submission_per_problem'] ** 2

In [17]:
df['poly_2_average_coefficient'] = df['average_coefficient'] ** 2

In [18]:
df['submission_problems'] = df['total_problems'] * df['total_submission_count']
df['poly_2_submission_problems'] = df['submission_problems'] ** 2
df['log_submission_problems'] = np.log1p(df['submission_problems'])

In [19]:
df['log_average_coefficient'] = np.log1p(df['average_coefficient'])

In [20]:
# df.drop(columns='poly_2_scores_days', inplace=True)

df.head(4)

Unnamed: 0,username,total_problems,total_submission_count,average_pre_score,average_coefficient,submission_per_problem,total_days,hash,TH,poly_2_average_pre_score,...,square_score_x_days,scores_probs,log_scores_probs,poly_2_scores_probs,poly_3_scores_probs,poly_2_average_coefficient,poly_2_submission_per_problem,submission_problems,poly_2_submission_problems,log_submission_problems
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,47,147,10000.0,100.0,0.319728,12,00b6dd4fc7eb817e03708c532016ef30ce564a61,5.0,100000000.0,...,346.410162,470000.0,13.06049,220900000000.0,1.03823e+17,10000.0,0.102226,6909,47734281,8.840725
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,78,259,9535.24359,100.0,0.301158,17,00bef8afee8f3c595d535c9c03c490cac1a4f021,8.5,90920870.0,...,402.615376,743749.0,13.51946,553162600000.0,4.114141e+17,10000.0,0.090696,20202,408120804,9.913586
2,01122b3ef7e59b84189e65985305f575d6bdf83c,68,195,9201.529412,100.0,0.348718,21,01122b3ef7e59b84189e65985305f575d6bdf83c,7.0,84668140.0,...,439.581753,625704.0,13.346634,391505500000.0,2.449666e+17,10000.0,0.121604,13260,175827600,9.492583
3,013de369c439ab0ead8aa7da64423aa395a8be39,66,107,9017.651515,96.969697,0.616822,8,013de369c439ab0ead8aa7da64423aa395a8be39,10.0,81318040.0,...,268.591162,595165.0,13.296596,354221400000.0,2.108202e+17,9403.12213,0.38047,7062,49871844,8.862625


days ko the sai ko, ko the feature binh thuong /

ko the dung days_submission ko the /

ko the probs_submission 

ko the scores_days

In [21]:
columns_to_choose = ['poly_2_average_pre_score', 'average_pre_score', 'poly_3_average_pre_score', 'log_total_problems', 'poly_3_total_problems',
                     'poly_2_total_problems','log_submission_count', 'poly_2_average_coefficient', 'average_coefficient', 'submission_per_problem',
                    'poly_2_submission_per_problem', 'poly_3_scores_probs']
X = df[columns_to_choose]

y = df['TH']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [22]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

# Define the MLP Regressor model
mlp_model = MLPRegressor(
    hidden_layer_sizes=(20),  # Number of neurons in each layer
    activation='relu',                  # Activation function
    solver='adam',                      # Optimizer
    alpha=0.001,                        # L2 regularization
    batch_size=32,                      # Batch size
    max_iter=500,                        # Number of epochs
    random_state=1                      # Set random state for reproducibility
)

# Fit the model to the training data
mlp_model.fit(X_train, y_train)

# Predict using the trained model
y_pred = mlp_model.predict(X_test)

# Calculate R² score
r2 = r2_score(y_test, y_pred)

print(f'Columns: {", ".join(columns_to_choose)}')

# Example: After calculating R²
print(f'R^2: {r2}')


Columns: poly_2_average_pre_score, average_pre_score, poly_3_average_pre_score, log_total_problems, poly_3_total_problems, poly_2_total_problems, log_submission_count, poly_2_average_coefficient, average_coefficient, submission_per_problem, poly_2_submission_per_problem, poly_3_scores_probs
R^2: 0.4593777012243242


save model if it is okay

In [23]:
# model.save("model_th_2746.h5")

Load the model

In [24]:
# from tensorflow.keras.models import load_model
# from tensorflow.keras.metrics import MeanSquaredError

# # Load the model with custom_objects argument
# model = load_model('/kaggle/working/model_0.435.h5', custom_objects={'mse': MeanSquaredError()})

add new features in real test set

In [25]:
predict_df['poly_2_average_pre_score'] = predict_df['average_pre_score'] ** 2
predict_df['poly_3_average_pre_score'] = predict_df['average_pre_score'] ** 3

predict_df['log_total_problems'] = np.log(predict_df['total_problems'] + 1)

predict_df['poly_2_total_problems'] = predict_df['total_problems'] **2
predict_df['poly_3_total_problems'] = predict_df['total_problems'] **3

predict_df['poly_2_average_coefficient'] = predict_df['average_coefficient'] ** 2

predict_df['log_submission_count'] = np.log1p(predict_df['total_submission_count'])

predict_df['poly_2_submission_per_problem'] = predict_df['submission_per_problem'] ** 2

predict_df['scores_probs'] = predict_df['average_pre_score'] * predict_df['total_problems']

predict_df['poly_3_scores_probs'] = predict_df['scores_probs'] **3

predict_df.head(3)

Unnamed: 0,username,total_problems,total_submission_count,average_pre_score,average_coefficient,submission_per_problem,total_days,poly_2_average_pre_score,poly_3_average_pre_score,log_total_problems,poly_2_total_problems,poly_3_total_problems,poly_2_average_coefficient,log_submission_count,poly_2_submission_per_problem,scores_probs,poly_3_scores_probs
394,410357eb9129023509cfaf8d38be61c050bb3b05,16,43,9375.0,100.0,0.372093,5,87890625.0,823974600000.0,2.833213,256,4096,10000.0,3.78419,0.138453,150000.0,3375000000000000.0
625,67212308d026508fd5b6942ffbbdd7b0be2e89de,2,8,0.0,100.0,0.25,1,0.0,0.0,1.098612,4,8,10000.0,2.197225,0.0625,0.0,0.0
801,84a17972cc6d29489bbe205a9e7feb8745726fbc,110,182,10000.0,100.0,0.604396,31,100000000.0,1000000000000.0,4.70953,12100,1331000,10000.0,5.209486,0.365294,1100000.0,1.331e+18


In [26]:
# predict_df = predict_df.drop(columns=['predicted_scores'])  # Replace 'predicted_score' with your column name

In [27]:
X_predict = predict_df[columns_to_choose]
X_predict_scaled = scaler.fit_transform(X_predict)
y_real_pred = mlp_model.predict(X_predict_scaled)

# Assign the rounded values to the predicted_scores column
predict_df["predicted_scores"] = np.round(y_real_pred, 3)
predict_df.head(5)

Unnamed: 0,username,total_problems,total_submission_count,average_pre_score,average_coefficient,submission_per_problem,total_days,poly_2_average_pre_score,poly_3_average_pre_score,log_total_problems,poly_2_total_problems,poly_3_total_problems,poly_2_average_coefficient,log_submission_count,poly_2_submission_per_problem,scores_probs,poly_3_scores_probs,predicted_scores
394,410357eb9129023509cfaf8d38be61c050bb3b05,16,43,9375.0,100.0,0.372093,5,87890620.0,823974600000.0,2.833213,256,4096,10000.0,3.78419,0.138453,150000.0,3375000000000000.0,5.498
625,67212308d026508fd5b6942ffbbdd7b0be2e89de,2,8,0.0,100.0,0.25,1,0.0,0.0,1.098612,4,8,10000.0,2.197225,0.0625,0.0,0.0,6.761
801,84a17972cc6d29489bbe205a9e7feb8745726fbc,110,182,10000.0,100.0,0.604396,31,100000000.0,1000000000000.0,4.70953,12100,1331000,10000.0,5.209486,0.365294,1100000.0,1.331e+18,7.19
802,84b6b2d70924066c8345f2bc2281791ae3188da2,82,160,9594.317073,100.0,0.5125,30,92050920.0,883165700000.0,4.418841,6724,551368,10000.0,5.081404,0.262656,786734.0,4.869493e+17,7.507
803,851d9a4b9b8e236f2d62282ddf06fae57b7d9492,97,389,9632.216495,100.0,0.249357,27,92779590.0,893673100000.0,4.584967,9409,912673,10000.0,5.966147,0.062179,934325.0,8.156313e+17,8.971


In [28]:
output_df = predict_df[["username", "predicted_scores"]]

output_df.to_csv("output_th_again_2.csv", index=False, header=False)