In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
df = pd.read_csv("Updated_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,No,Age,Male/Female,number sequences marks,number sequences time(s),number sequences duration,number sequences answered,perimeter marks,perimeter time(s),perimeter duration,...,Pre-Marks,Pure math marks (47),Applied math mark (53),No of questions attempted (21),Preferred Study Method,Preferred Lesson,Disliked lesson,fav subject,disliked subject,Peer
0,1,14,M,2,257,257,2,3.0,501,244,...,2,16.0,11.0,15,practicing,angles,set theory,sinhala,geography,24
1,2,14,M,2,212,212,2,3.0,410,198,...,2,19.0,11.0,17,Only school lessons,angles,fractions,sinhala,maths,24
2,3,14,M,2,169,169,2,3.0,357,188,...,9,22.5,17.0,17,Only school lessons,angles,area,english,science,20
3,4,14,M,3,188,188,2,3.0,623,435,...,5,21.0,20.0,18,Only school lessons,angles,none,none,none,24
4,5,14,F,4,253,253,2,2.0,560,307,...,10,18.0,5.0,14,Only school lessons,algebra,area,sinhala,health,10


In [4]:
# Check for NaN values
nan_columns = df.columns[df.isna().any()].tolist()
print("Columns with NaN values:", nan_columns)

# Fill NaN values in categorical columns with the most frequent value
for col in nan_columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

Columns with NaN values: ['indices time(s)', 'volume and capacity time(s)', 'area time(s)', 'probability time(s)']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [5]:
# Check for NaN values
nan_columns = df.columns[df.isna().any()].tolist()
print("Columns with NaN values:", nan_columns)

Columns with NaN values: []


In [6]:
# Define target variable
target_column = "Peer"

# Specify your selected feature columns
feature_columns = ["Age", "Male/Female", "number sequences marks", "number sequences time(s)", "perimeter marks", "perimeter time(s)", "ratio marks", "ratio time(s)", "fractions/decimals marks", "fractions/decimals time(s)", "indices marks", "indices time(s)", "algebra marks", "algebra time(s)", "angles marks", "angles time(s)", "volume and capacity marks", "volume and capacity time(s)", "area marks", "area time(s)", "probability marks", "probability time(s)", "Preferred Study Method"]

In [7]:
# Separate features and target
X = df[feature_columns]
y = df[target_column]

# Encode categorical variables
encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Encode target variable
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


In [8]:
# Scale numerical features
scaler = StandardScaler()
X[X.select_dtypes(include=['int64', 'float64']).columns] = scaler.fit_transform(
    X[X.select_dtypes(include=['int64', 'float64']).columns]
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[X.select_dtypes(include=['int64', 'float64']).columns] = scaler.fit_transform(


In [9]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [10]:
# Train and evaluate models
best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

Gradient Boosting Accuracy: 0.9933
Logistic Regression Accuracy: 0.7300
Decision Tree Accuracy: 0.9883


In [11]:
# Save the best model, encoder, and scaler
with open("best_model_peer.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

with open("label_encoders_peer.pkl", "wb") as encoder_file:
    pickle.dump(encoders, encoder_file)

with open("scaler_peer.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

with open("y_encoders_peer.pkl", "wb") as encoder_file:
    pickle.dump(y_encoder, encoder_file)

print(f"Best model ({best_model.__class__.__name__}) saved with accuracy: {best_accuracy:.4f}")

Best model (GradientBoostingClassifier) saved with accuracy: 0.9933


In [13]:
import pickle
import pandas as pd
import os

# Load saved model, encoders, and scaler safely
def load_pickle(file_name):
    if os.path.exists(file_name):
        with open(file_name, "rb") as f:
            return pickle.load(f)
    else:
        raise FileNotFoundError(f"{file_name} not found!")

try:
    model = load_pickle("best_model_peer.pkl")
    encoders = load_pickle("label_encoders_peer.pkl")
    scaler = load_pickle("scaler_peer.pkl")
    y_encoder = load_pickle("y_encoders_peer.pkl")  # Load target variable encoder
except Exception as e:
    print("Error loading files:", e)

# Feature columns
feature_columns = [
    "Age", "Male/Female", "number sequences marks", "number sequences time(s)", "perimeter marks", "perimeter time(s)",
    "ratio marks", "ratio time(s)", "fractions/decimals marks", "fractions/decimals time(s)", "indices marks",
    "indices time(s)", "algebra marks", "algebra time(s)", "angles marks", "angles time(s)",
    "volume and capacity marks", "volume and capacity time(s)", "area marks", "area time(s)",
    "probability marks", "probability time(s)", "Preferred Study Method"
]

# Function to predict using sample input
def predict_class(sample):
    if model is None:
        raise ValueError("Model not loaded properly.")

    # Convert sample to DataFrame
    sample_df = pd.DataFrame([sample], columns=feature_columns)

    # Encode categorical variables
    for col in sample_df.select_dtypes(include=['object']).columns:
        if col in encoders:
            sample_df[col] = encoders[col].transform(sample_df[col])
        else:
            raise ValueError(f"Unknown categorical value found in column: {col}")

    # Scale numerical features
    sample_df[sample_df.select_dtypes(include=['int64', 'float64']).columns] = scaler.transform(
        sample_df[sample_df.select_dtypes(include=['int64', 'float64']).columns]
    )

    # Predict
    prediction = model.predict(sample_df)
    
    # Convert prediction back to original class labels
    predicted_label = y_encoder.inverse_transform([prediction[0]])[0]
    
    return predicted_label

# Example sample input (Make sure it includes only trained features)
sample_input = {
    "Age": 15, "Male/Female": "M", "number sequences marks": 75, "number sequences time(s)": 3, "perimeter marks": 80, "perimeter time(s)": 3,
    "ratio marks": 85, "ratio time(s)": 3, "fractions/decimals marks": 70, "fractions/decimals time(s)": 3, "indices marks": 60,
    "indices time(s)": 3, "algebra marks": 78, "algebra time(s)": 3, "angles marks": 82, "angles time(s)": 2,
    "volume and capacity marks": 88, "volume and capacity time(s)": 4, "area marks": 90, "area time(s)": 4,
    "probability marks": 77, "probability time(s)": 1, "Preferred Study Method": "figures"
}

try:
    predicted_class = predict_class(sample_input)
    print("Predicted Class:", predicted_class)
except Exception as e:
    print("Error:", e)


Predicted Class: 9
