In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [11]:
data= pd.read_csv('student4 (1) .csv')

In [12]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,Langauge,Maths,Science,Percentage
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,30,25,30,28.333333
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,30,25,25,26.666667
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,50,35,40,41.666667
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,75,75,70,73.333333
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,50,30,50,43.333333


In [13]:
categorical_columns = data.select_dtypes(include=['object']).columns

# Apply frequency encoding to categorical columns
for col in categorical_columns:
    freq = data[col].value_counts()
    data[col] = data[col].map(freq)

# Identify numerical columns (excluding 'Percentage')
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('Percentage')

# Apply StandardScaler to numerical columns
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Split the data into training and test sets
X = data.drop('Percentage', axis=1)
y = data['Percentage']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
#1. Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 3.7707208161421697
R^2 Score: 0.9898115687840313


In [15]:
# 2. Gradient Boosting
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create the ensemble model
model1 = RandomForestRegressor(random_state=42)
model2 = GradientBoostingRegressor(random_state=42)

ensemble_model = VotingRegressor(estimators=[('rf', model1), ('gbr', model2)])
ensemble_model.fit(X_train, y_train)

# Evaluate the model
y_pred = ensemble_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 1.5673574022720695
R^2 Score: 0.995765023754735


In [16]:
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data again if needed
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create simpler models with more regularization
model1 = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
model2 = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05, max_depth=3, random_state=42)
model3 = Ridge(alpha=1.0)

# Create the ensemble model with a more diverse set of models
ensemble_model = VotingRegressor(estimators=[('rf', model1), ('gbr', model2), ('ridge', model3)])
ensemble_model.fit(X_train, y_train)

# Cross-validation to check performance consistency
cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=5, scoring='r2')
print(f'Cross-Validation R^2 Scores: {cv_scores}')
print(f'Average Cross-Validation R^2 Score: {cv_scores.mean()}')

# Evaluate the model on the test set
y_pred = ensemble_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Cross-Validation R^2 Scores: [0.99610809 0.99525913 0.99614152 0.99507461 0.99642628]
Average Cross-Validation R^2 Score: 0.9958019225560502
Mean Squared Error: 1.5838091582964025
R^2 Score: 0.9957205713561595


In [17]:
# Install SHAP

import pandas as pd
import joblib

# Initialize the SHAP explainer

# Load the original dataset
file_path = 'student4 (1) .csv'
data = pd.read_csv(file_path)

# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Create frequency encoding map
frequency_encoding_map = {col: data[col].value_counts().to_dict() for col in categorical_columns}

# Save the frequency encoding map
joblib.dump(frequency_encoding_map, 'frequency_encoding_map.pkl')

# Example new data
new_data_dict = {
    'school': ['GP'],
    'sex': ['F'],
    'age': [17],
    'address': ['U'],
    'famsize': ['GT3'],
    'Pstatus': ['A'],
    'Medu': [4],
    'Fedu': [4],
    'Mjob': ['health'],
    'Fjob': ['services'],
    'reason': ['course'],
    'guardian': ['mother'],
    'traveltime': [1],
    'studytime': [2],
    'failures': [10],
    'schoolsup': ['yes'],
    'famsup': ['no'],
    'paid': ['no'],
    'activities': ['yes'],
    'nursery': ['yes'],
    'higher': ['yes'],
    'internet': ['yes'],
    'romantic': ['no'],
    'famrel': [4],
    'freetime': [3],
    'goout': [4],
    'Dalc': [1],
    'Walc': [1],
    'health': [3],
    'absences': [44],
    'Langauge': [10],
    'Maths': [100],
    'Science': [100],
}

# Convert to DataFrame
new_data = pd.DataFrame(new_data_dict)

# Apply frequency encoding to new data
for col in categorical_columns:
    new_data[col] = new_data[col].map(frequency_encoding_map.get(col, {})).fillna(0)  # Fill NaN with 0 if there are unseen categories

# Identify numerical columns
numerical_columns = new_data.select_dtypes(include=['int64', 'float64']).columns

# Apply StandardScaler to normalize the numerical features
# Assuming scaler is already defined and fitted on training data
new_data[numerical_columns] = scaler.transform(new_data[numerical_columns])

# Ensure the columns are in the same order as the training data
expected_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
    'absences', 'Langauge', 'Maths', 'Science'
]
new_data = new_data[expected_columns]

# Predict with new data
predictions = ensemble_model.predict(new_data)
print(predictions)


[69.62064285]


In [18]:
# Assuming predictions is the calculated percentage from your model

# Ensure predictions is a scalar value (if it's a numpy array, take the first element for simplicity)
predicted_percentage = predictions if isinstance(predictions, (int, float)) else predictions[0]

output_message = ""

if predicted_percentage < 10:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests the student's performance needs significant improvement. Consider reviewing the input data or providing additional support and resources."
elif predicted_percentage >= 10 and predicted_percentage < 25:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This indicates below-average performance. Consider further optimization of study habits and additional support."
elif predicted_percentage >= 25 and predicted_percentage < 35:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests average performance. Continue to monitor progress and consider areas for improvement."
elif predicted_percentage >= 35 and predicted_percentage < 45:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This indicates slightly above-average performance. Encourage continued effort and focus."
elif predicted_percentage >= 45 and predicted_percentage < 50:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests good performance. Maintain focus and seek opportunities for further improvement."
elif predicted_percentage >= 50 and predicted_percentage < 65:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This indicates very good performance. Continue to excel and explore advanced topics."
elif predicted_percentage >= 65 and predicted_percentage < 75:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests excellent performance. Well done! Maintain consistent effort."
elif predicted_percentage >= 75 and predicted_percentage < 85:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This indicates outstanding performance. Keep up the exceptional work."
elif predicted_percentage >= 85 and predicted_percentage < 90:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests exceptional performance. Congratulations on your achievements!"
elif predicted_percentage >= 90 and predicted_percentage < 95:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This indicates outstanding performance. You are achieving at a very high level."
else:
    output_message = f"The predicted percentage is {predicted_percentage:.2f}%. This suggests exceptional performance. You are excelling in your studies!"

print(output_message)
print("Predicted Percentage:", predicted_percentage)


The predicted percentage is 69.62%. This suggests excellent performance. Well done! Maintain consistent effort.
Predicted Percentage: 69.62064285078041


In [19]:
%pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [23]:
import joblib
joblib.dump(ensemble_model, 'ensemble_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']