### Loading Data 

In [39]:
import pandas as pd

# Read the student dataset CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='synthetic_student_dataset.csv')

In [33]:
# train_test_split must be imported here: without it this cell raises
# NameError on a fresh kernel (Restart & Run All).
from sklearn.model_selection import train_test_split

# Predictor columns and the column to be predicted.
features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
target = 'assessment_score'

X = df[features]  # Features
y = df[target]    # Target

# Split the data into training and testing sets: 20% is held out for testing,
# and random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data split completed.")
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


Data split completed.
Training samples: 1600, Testing samples: 400


### LinearRegression Model

In [34]:
from sklearn.linear_model import LinearRegression

# Build the linear regression estimator and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

print("Model training completed.")

Model training completed.


In [35]:
from sklearn.metrics import r2_score, mean_squared_error

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Evaluate the model: R² and mean squared error of predictions vs. true values.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# The header line has no placeholders, so it is a plain string (not an f-string).
print("Model Evaluation Results:")
print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")


Model Evaluation Results:
R² Score: 0.8862
Mean Squared Error: 25.4685


### RandomForestRegressor Model

In [36]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest of 100 trees; random_state is fixed for reproducibility.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the same held-out test split used for the linear model.
y_pred_rf = rf_model.predict(X_test)

# r2_score and mean_squared_error are imported in the linear-model evaluation cell.
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print("Random Forest Model Evaluation:")
print(f"R² Score: {r2_rf:.4f}")
print(f"Mean Squared Error: {mse_rf:.4f}")


Random Forest Model Evaluation:
R² Score: 0.8615
Mean Squared Error: 30.9881


### Clustering Students into Learning Personas

In [38]:
from sklearn.preprocessing import StandardScaler

# Columns used to describe each student for clustering.
cluster_features = [
    'comprehension',
    'attention',
    'focus',
    'retention',
    'engagement_time',
]

# Standardize the clustering columns: learn the scaling parameters from the
# data, then apply them to the same data.
scaler = StandardScaler()
X_cluster = scaler.fit(df[cluster_features]).transform(df[cluster_features])

print("Data prepared for clustering.")


Data prepared for clustering.


### K-Means Clustering

In [40]:
from sklearn.cluster import KMeans

# Fit K-Means with 3 clusters on the scaled features (fixed seed, n_init=10).
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_cluster)

# Store each row's cluster label on the DataFrame as its learning persona.
df['learning_persona'] = kmeans.labels_

print("K-Means clustering completed.")
df.loc[:, ['student_id', 'name', 'learning_persona']].head()


K-Means clustering completed.


Unnamed: 0,student_id,name,learning_persona
0,1,Casey Lloyd,0
1,2,Austin Montes,0
2,3,Olivia Martinez,1
3,4,Christopher Pratt,0
4,5,Tiffany Pace,2


In [41]:
# Average each skill column and the assessment score within every persona,
# rounded to two decimals.
persona_summary = (
    df.groupby('learning_persona')[
        [
            'comprehension',
            'attention',
            'focus',
            'retention',
            'engagement_time',
            'assessment_score',
        ]
    ]
    .mean()
    .round(2)
)

print("Learning Persona Summary (Cluster Averages):")
persona_summary


Learning Persona Summary (Cluster Averages):


Unnamed: 0_level_0,comprehension,attention,focus,retention,engagement_time,assessment_score
learning_persona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,54.14,40.4,39.99,43.95,235.95,48.73
1,47.18,51.02,26.94,51.74,85.19,42.8
2,48.22,56.72,79.49,50.64,141.66,56.58
