In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load the raw student-performance table; the relative path is resolved
# against the notebook's working directory.
csv_path = 'StudentsPerformance.csv'
data = pd.read_csv(csv_path)
loaded_shape = data.shape
print("✅ Loaded dataset with shape:", loaded_shape)

✅ Loaded dataset with shape: (1000, 8)


In [3]:
# Count the null cells in every column, then print the counts under a heading.
null_counts = data.isnull().sum()
print("\n🔍 Checking for missing values:")
print(null_counts)


🔍 Checking for missing values:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [4]:
# Drop rows containing any missing value by rebinding `data` to the filtered
# result rather than mutating the frame in place (no `inplace=True`).
data = data.dropna()


In [5]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences sit 1.5 interquartile ranges below the first quartile and
    1.5 interquartile ranges above the third quartile of ``df[column]``;
    values equal to a fence are kept. ``df`` itself is not modified and the
    surviving rows keep their original index labels.
    """
    values = df[column]
    first_q, third_q = values.quantile(0.25), values.quantile(0.75)
    spread = third_q - first_q
    # `between` is inclusive on both ends, matching a `>=` / `<=` pair.
    within_fences = values.between(first_q - 1.5 * spread, third_q + 1.5 * spread)
    return df[within_fences]

# Filter each score column in turn. Every pass works on the rows that survived
# the previous one, so the quartiles are recomputed on the shrinking table.
score_columns = ['math score', 'reading score', 'writing score']
for score_col in score_columns:
    data = remove_outliers(data, score_col)

remaining_shape = data.shape
print("✅ Shape after outlier removal:", remaining_shape)

✅ Shape after outlier removal: (986, 8)


In [6]:
# Replace the raw reading and writing scores with their standardised values.
# NOTE(review): the scaler is fitted on the full table, before the train/test
# split performed later in the notebook — confirm this leakage is acceptable.
scaled_columns = ['reading score', 'writing score']
scaler = StandardScaler()
standardised_scores = scaler.fit_transform(data[scaled_columns])
data[scaled_columns] = standardised_scores

In [7]:
# 'reading score' and 'writing score' have already been standardised by
# `scaler`, while 'math score' is still on its raw scale. Undo the scaling
# first so the mean is taken over three values expressed in the same units.
raw_reading_writing = np.asarray(
    scaler.inverse_transform(data[['reading score', 'writing score']])
)
data['average score'] = (
    data['math score'] + raw_reading_writing[:, 0] + raw_reading_writing[:, 1]
) / 3


In [8]:
def score_category(score):
    """Map a numeric score onto a coarse performance band.

    Returns 'Low' when ``score`` is below 40, 'Medium' when it is below 70,
    and 'High' for any other value.
    """
    # Thresholds are checked in ascending order; the first one the score
    # falls under decides the band.
    for upper_bound, band in ((40, 'Low'), (70, 'Medium')):
        if score < upper_bound:
            return band
    return 'High'

# Band every student's raw math score; this column is used as the prediction target.
data['math_score_class'] = [score_category(score) for score in data['math score']]

In [9]:
# Integer-encode each categorical feature with its OWN fitted encoder.
# Re-fitting a single shared LabelEncoder for every column overwrites the
# previous mapping, leaving no way to encode new rows or decode the codes
# afterwards; `feature_encoders` keeps each column's mapping available.
cat_cols = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
feature_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    feature_encoders[col] = le

In [10]:
# Encode the target bands as integers with a dedicated encoder; it is kept
# because its `classes_` are used for the report and it is pickled later on.
target_le = LabelEncoder()
target_le.fit(data['math_score_class'])
data['math_score_class'] = target_le.transform(data['math_score_class'])

In [11]:
# Predictors: the five encoded categorical columns plus the two scaled scores.
feature_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
    'reading score',
    'writing score',
]
X = data[feature_columns]
y = data['math_score_class']

# Hold out 20% of the rows for evaluation with a fixed seed.
# NOTE(review): the split is not stratified although the class supports in
# the report below are uneven — consider passing `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Baseline classifier: a random forest with only the seed set explicitly,
# fitted on the training partition.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [13]:
# Predict the held-out rows with the baseline forest and print per-class
# metrics, labelling rows with the band names stored on the target encoder.
y_pred = model.predict(X_test)
baseline_report = classification_report(
    y_test, y_pred, target_names=target_le.classes_
)
print("\n📊 Classification Report:")
print(baseline_report)



📊 Classification Report:
              precision    recall  f1-score   support

        High       0.89      0.79      0.84        75
         Low       1.00      0.22      0.36         9
      Medium       0.82      0.94      0.88       114

    accuracy                           0.85       198
   macro avg       0.91      0.65      0.69       198
weighted avg       0.86      0.85      0.84       198



In [14]:
# Persist the fitted model and the target encoder. Each file is opened in a
# `with` block so the handle is flushed and closed even if pickling raises;
# passing a bare `open(...)` to `pickle.dump` leaves the handle unclosed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(target_le, encoder_file)

print("\n✅ Model and label encoder saved as 'model.pkl' and 'label_encoder.pkl'")


✅ Model and label encoder saved as 'model.pkl' and 'label_encoder.pkl'


In [16]:
# Section heading written as a bare string expression; the notebook echoes it as this cell's output.
"Hyperparameter Tuning"

'Hyperparameter Tuning'

In [17]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 2 * 3 * 2 * 2 = 24 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Cross-validated (5 folds) search over the whole grid, ranked by accuracy.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator exposed by the search and report its settings.
best_model = grid_search.best_estimator_
print("\n✅ Best Hyperparameters:", grid_search.best_params_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits

✅ Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Re-predict the held-out rows with the tuned estimator.
y_pred = best_model.predict(X_test)

# Accuracy takes no averaging mode; the other three metrics are
# support-weighted across the classes.
print("\n📈 Evaluation of Tuned Model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
weighted_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for metric_label, metric_fn in weighted_metrics:
    print(f"{metric_label}:", metric_fn(y_test, y_pred, average='weighted'))



📈 Evaluation of Tuned Model:
Accuracy: 0.8585858585858586
Precision: 0.8675990675990677
Recall: 0.8585858585858586
F1 Score: 0.8516783762685403


In [19]:
# Overwrite 'model.pkl' with the tuned estimator and re-save the target
# encoder. The files are opened in `with` blocks so each handle is flushed
# and closed even if pickling raises, instead of being left unclosed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(target_le, encoder_file)
print("\n✅ Tuned model saved to 'model.pkl'")



✅ Tuned model saved to 'model.pkl'
