In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# --- Part 1: Explicitly Create the Student Dataset ---

# Set a seed for reproducibility
np.random.seed(42)

# Number of students
num_students = 200

# Create features with some logical correlations
# Previous GPA (scale 1.0 to 4.0)
previous_gpa = np.random.normal(2.8, 0.6, num_students).clip(1.0, 4.0)

# Study hours influenced by GPA (better students might study more systematically)
study_hours_per_week = (previous_gpa * 2) + np.random.normal(0, 1, num_students).clip(1, 15)

# Attendance influenced by study hours
attendance_percentage = (study_hours_per_week * 3) + 40 + np.random.normal(0, 5, num_students)
attendance_percentage = attendance_percentage.clip(50, 100)

# Midterm score influenced by all previous factors
midterm_score = (previous_gpa * 10) + (study_hours_per_week * 1.5) + (attendance_percentage * 0.3) + np.random.normal(0, 4, num_students)
midterm_score = midterm_score.clip(0, 100)

# Create a DataFrame
data = pd.DataFrame({
    'previous_gpa': previous_gpa,
    'study_hours_per_week': study_hours_per_week,
    'attendance_percentage': attendance_percentage,
    'midterm_score': midterm_score
})

# Create the target variable based on a final score
# The final score is mostly based on the midterm score but with some variance
final_score = (midterm_score * 0.8) + (study_hours_per_week * 1.2) + np.random.normal(0, 5, num_students)
final_score = final_score.clip(0, 100)

# Define grade categories
def get_grade_category(score):
    if score < 50:
        return 'Fail'
    elif score <= 75:
        return 'Pass'
    else:
        return 'Distinction'

data['final_grade_category'] = [get_grade_category(score) for score in final_score]

# Display the first few rows of the dataset
print("Original Student Data:\n", data.head())
print("\nGrade Distribution:\n", data['final_grade_category'].value_counts())


# --- Part 2: Generate New Features (Feature Engineering) ---
# This is the section that directly mirrors the logic from the Iris dataset example.

# 1. Mathematical Combinations
# A feature representing overall academic engagement.
data['engagement_score'] = data['study_hours_per_week'] + (data['attendance_percentage'] / 10)
# An interaction between past performance and current effort.
data['gpa_study_interaction'] = data['previous_gpa'] * data['study_hours_per_week']

# 2. Interaction Features (Ratios)
# A feature representing "study efficiency" - score per hour of study.
# We use np.where to avoid division by zero.
data['study_efficiency_ratio'] = np.where(data['study_hours_per_week'] != 0, data['midterm_score'] / data['study_hours_per_week'], 0)

# 3. Polynomial Features (degree 2 for interaction terms)
# This automatically creates interaction terms (e.g., gpa * study_hours) and squared terms (e.g., gpa^2).
poly = PolynomialFeatures(degree=2, include_bias=False)
original_features = ['previous_gpa', 'study_hours_per_week', 'attendance_percentage', 'midterm_score']
poly_features = poly.fit_transform(data[original_features])
poly_feature_names = poly.get_feature_names_out(original_features)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

# Combine polynomial features with the original dataframe
data = pd.concat([data.reset_index(drop=True), poly_df.reset_index(drop=True)], axis=1)

# Drop the original features as they are now included in the polynomial features
data.drop(original_features, axis=1, inplace=True)

print("\nData After Feature Engineering:\n", data.head())


# --- Part 3: Machine Learning Pipeline ---

# Convert categorical target labels to numbers (0, 1, 2)
le = LabelEncoder()
data['final_grade_category'] = le.fit_transform(data['final_grade_category'])

# Split Data into Features and Target
X = data.drop('final_grade_category', axis=1) # Features
y = data['final_grade_category'] # Target

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize the Data (very important for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM Model with RBF Kernel
model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)

# Evaluate the Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Use le.classes_ to get original string labels for the report
report = classification_report(y_test, y_pred, target_names=le.classes_,zero_division=0)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Classification Report:\n{report}")


Original Student Data:
    previous_gpa  study_hours_per_week  attendance_percentage  midterm_score  \
0      3.098028              7.196057              53.616033      60.887135   
1      2.717041              6.434083              56.305373      50.024489   
2      3.188613              7.460277              62.407051      65.277086   
3      3.713818              8.481438              65.679217      74.986652   
4      2.659508              6.319016              56.706720      54.739359   

  final_grade_category  
0                 Pass  
1                 Fail  
2                 Pass  
3                 Pass  
4                 Fail  

Grade Distribution:
 final_grade_category
Pass           116
Fail            80
Distinction      4
Name: count, dtype: int64

Data After Feature Engineering:
   final_grade_category  engagement_score  gpa_study_interaction  \
0                 Pass         12.557660              22.293590   
1                 Fail         12.064620              17.