In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from xgboost import XGBClassifier

# Load your dataset
data = pd.read_csv('augmented_dataset.csv')

X = data.drop('Grade', axis=1)  # Features
y = data['Grade'] 
y = y - 1

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define hyperparameters for manual tuning
params = {
    'n_estimators': [100, 200],#100
    'max_depth': [3, 5], #3
    'learning_rate': [0.1, 0.3],#0.1 
    'subsample': [0.8, 1.0], #1.0
    'colsample_bytree': [0.8, 1.0], #1.0
    'gamma': [0, 0.1],#0
    
}

best_accuracy = 0
best_model = None
best_parameters = None

# Define KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True,random_state=42)

# Perform manual parameter tuning and cross-validation
for n_estimators in params['n_estimators']:
    for max_depth in params['max_depth']:
        for learning_rate in params['learning_rate']:
            for subsample in params['subsample']:
                for colsample_bytree in params['colsample_bytree']:
                    for gamma in params['gamma']:
                        
                                print(f"Evaluating model with n_estimators={n_estimators}, max_depth={max_depth}, learning_rate={learning_rate},subsample={subsample}, colsample_bytree={colsample_bytree},gamma={gamma}")
                                
                                accuracy_values = []
                                for train_index, val_index in kfold.split(X_train_scaled):
                                    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
                                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                                    
                                    # Create the XGBoost model
                                    model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, 
                                                          subsample=subsample, colsample_bytree=colsample_bytree)
                                    
                                    # Fit the model
                                    model.fit(X_train_fold, y_train_fold)
                                    
                                    # Evaluate the model on validation set
                                    y_pred = model.predict(X_val_fold)
                                    accuracy = accuracy_score(y_val_fold, y_pred)
                                    accuracy_values.append(accuracy)
                                
                                mean_accuracy = np.mean(accuracy_values)
                                print(f"Mean Accuracy: {mean_accuracy}")
                                
                                # Update best model if the current model is better
                                if mean_accuracy > best_accuracy:
                                    best_accuracy = mean_accuracy
                                    best_model = model
                                    best_parameters = {'n_estimators': n_estimators, 'max_depth': max_depth, 'learning_rate': learning_rate, 
                                                       'subsample': subsample, 'colsample_bytree': colsample_bytree}


# Print evaluation metrics and best parameters
print("Best Parameters:", best_parameters)
print("Best Accuracy:", best_accuracy)

best_model.save_model('xgboost_model.json')


Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=0.8, colsample_bytree=0.8,gamma=0
Mean Accuracy: 0.8608814589665654
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=0.8, colsample_bytree=0.8,gamma=0.1
Mean Accuracy: 0.8608814589665654
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=0.8, colsample_bytree=1.0,gamma=0
Mean Accuracy: 0.8651469098277609
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=0.8, colsample_bytree=1.0,gamma=0.1
Mean Accuracy: 0.8651469098277609
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=1.0, colsample_bytree=0.8,gamma=0
Mean Accuracy: 0.8437993920972644
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=1.0, colsample_bytree=0.8,gamma=0.1
Mean Accuracy: 0.8437993920972644
Evaluating model with n_estimators=100, max_depth=3, learning_rate=0.1,subsample=1.0, colsample_bytr

In [2]:

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Best Parameters:", best_parameters)
print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Best Parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}
Test Accuracy: 0.9090909090909091
Precision: 0.9106377483734834
Recall: 0.9090909090909091




- If the error impact value is positive, it means that perturbing that feature led to an increase in model performance (lower error). This suggests that the feature might not be very important, as changing it doesn't significantly affect the model's predictions.
  
- If the error impact value is negative, it means that perturbing that feature led to a decrease in model performance (higher error). This indicates that the feature is important, as changing it has a notable impact on the model's predictions.

