In [54]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [55]:
# Lists to store the inputs and labels
all_inputs = []
subject_labels = []
xeploai_labels = []

In [56]:
data_folder = '/kaggle/input/student-performance/data20/data'
# Number of past semesters to consider (you can adjust this based on your dataset)
maxlength = 14

In [57]:
# Iterate over each file in the folder
for filename in os.listdir(data_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Drop 'IDLopHoc', 'IDSinhVien', 'HK' from input features
        input_data = df.drop(['IDSinhVien', 'TenDot', 'TenDotNumber'], axis=1)

        # Pad or truncate to the last `maxlength`
        if len(input_data) < maxlength:
            # Pad with zeros if less than `maxlength`
            padding = np.zeros((maxlength - len(input_data), input_data.shape[1]))
            input_data = np.vstack((input_data, padding))
        else:
            # Truncate to the last `maxlength`
            input_data = input_data[-maxlength:]

        # Extract labels: predict next semester's MaMonHoc and xeploai
        subject_label = df.iloc[-1]['MaMonHoc']  # Classification target (MaMonHoc)
        xeploai_label = df.iloc[-1]['xeploai']  # Classification target (xeploai)

        # Append the input and label data
        all_inputs.append(input_data)
        subject_labels.append(subject_label)
        xeploai_labels.append(xeploai_label)

In [58]:
# Combine all inputs into a final array
X = np.array(all_inputs)

# Flatten input for the model
X = X.reshape(X.shape[0], -1)
subject_labels = np.array(subject_labels)  # Classification labels (MaMonHoc)
xeploai_labels = np.array(xeploai_labels)  # Classification labels (xeploai)

# Normalize the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, subject_train, subject_test, xeploai_train, xeploai_test = train_test_split(
    X_scaled, subject_labels, xeploai_labels, test_size=0.2, random_state=42)

In [59]:
# Tìm số láng giềng tốt nhất cho mô hình MaMonHoc
param_grid = {'n_neighbors': np.arange(1, 10)}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, subject_train)

print(f'Best n_neighbors for MaMonHoc: {grid.best_params_}')

# Tương tự cho xeploai
grid.fit(X_train, xeploai_train)
print(f'Best n_neighbors for xeploai: {grid.best_params_}')



Best n_neighbors for MaMonHoc: {'n_neighbors': 1}
Best n_neighbors for xeploai: {'n_neighbors': 1}


In [60]:
# Initialize the KNN Classifier for predicting the subject (MaMonHoc)
knn_classifier_subject = KNeighborsClassifier(n_neighbors=1)

# Train the KNN classifier for MaMonHoc
knn_classifier_subject.fit(X_train, subject_train)

# Evaluate the classifier on the test set for MaMonHoc
subject_accuracy = knn_classifier_subject.score(X_test, subject_test)

# Initialize the KNN Classifier for predicting the xeploai
knn_classifier_xeploai = KNeighborsClassifier(n_neighbors=1)

# Train the KNN classifier for xeploai
knn_classifier_xeploai.fit(X_train, xeploai_train)

# Evaluate the classifier on the test set for xeploai
xeploai_accuracy = knn_classifier_xeploai.score(X_test, xeploai_test)

# Output the results
print(f'Accuracy of KNN Classifier (MaMonHoc): {subject_accuracy}')
print(f'Accuracy of KNN Classifier (xeploai): {xeploai_accuracy}')


Accuracy of KNN Classifier (MaMonHoc): 0.700542392235227
Accuracy of KNN Classifier (xeploai): 0.8843848130174137


In [61]:
def predict(file_path):
    df = pd.read_csv(file_path)
    input_data = df.drop(['IDSinhVien', 'TenDot', 'TenDotNumber'], axis=1)
    padding = np.zeros((maxlength - len(input_data), input_data.shape[1]))
    input_data = np.vstack((input_data, padding))
    input_data = input_data[-maxlength:]
    X = np.array([input_data])
    X = X.reshape(X.shape[0], -1)
    X_scaled = scaler.transform(X)
    subject_label = df.iloc[-1]['MaMonHoc']
    xeploai_label = df.iloc[-1]['xeploai']
   # Predict the next semester's MaMonHoc
    predicted_subject = knn_classifier_subject.predict(X_scaled)

    # Predict the next semester's xeploai
    predicted_xeploai = knn_classifier_xeploai.predict(X_scaled)
    # Output the predictions
    print(f'Predicted MaMonHoc: {predicted_subject[0]}')
    print(f'Predicted xeploai: {predicted_xeploai[0]}')

In [63]:
file_path = '/kaggle/input/student-performance/data20/test/dataframe_13507.csv'
predict(file_path)

Predicted MaMonHoc: 2031
Predicted xeploai: 1
