In [None]:
# Cognitive Skills & Student Performance Analysis

This notebook analyzes the relationship between cognitive skills and student performance using a synthetic dataset, builds ML models to predict assessment scores, and performs clustering analysis to identify learning personas.

## Table of Contents
1. [Data Generation and Loading](#data-generation)
2. [Exploratory Data Analysis](#eda)
3. [Correlation Analysis](#correlation)
4. [Machine Learning Model](#ml-model)
5. [Student Clustering](#clustering)
6. [Key Insights and Findings](#insights)


In [None]:
# Import required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Silence all library warnings so cell output stays readable.
# NOTE(review): this also hides deprecation warnings from the plotting and ML
# libraries; consider narrowing the filter when upgrading dependencies.
warnings.filterwarnings('ignore')

# Set style for better plots.
# The seaborn style sheets are named 'seaborn-v0_8' from Matplotlib 3.6 on;
# fall back to the legacy 'seaborn' name so older installs do not raise OSError.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
<a id="data-generation"></a>

## 1. Data Generation and Loading

First, let's generate synthetic student data with realistic cognitive skills and performance metrics.


In [None]:
# Generate synthetic student data
def generate_student_data(n_students=200, seed=42):
    """Generate a synthetic student dataset of cognitive skills and performance.

    Every skill column is derived from a shared, normally distributed
    ``base_skill`` plus independent noise, so the skills (and the assessment
    score built from them) are positively correlated by construction.

    Parameters
    ----------
    n_students : int, default 200
        Number of student rows to generate.
    seed : int or None, default 42
        Seed passed to ``np.random.seed`` before any sampling. The default
        reproduces the dataset this notebook was written against. Pass
        ``None`` to leave the global NumPy random state untouched.

    Returns
    -------
    pandas.DataFrame
        One row per student with the columns ``student_id``, ``name``,
        ``class``, ``comprehension``, ``attention``, ``focus``, ``retention``,
        ``assessment_score``, ``engagement_time`` and ``learning_persona``.

    Notes
    -----
    Sampling uses the legacy global NumPy RNG, so (unless ``seed`` is
    ``None``) calling this function resets the global random state.
    The order of the random draws below is significant: reordering them
    changes the dataset produced for a given seed.
    """

    # Set random seed for reproducibility
    if seed is not None:
        np.random.seed(seed)

    # Student IDs: STU001, STU002, ...
    student_ids = [f"STU{i:03d}" for i in range(1, n_students + 1)]

    # Name pools
    first_names = [
        "Alex", "Jordan", "Taylor", "Morgan", "Casey", "Riley", "Avery", "Quinn",
        "Blake", "Cameron", "Drew", "Emery", "Finley", "Hayden", "Jamie", "Kendall",
        "Lane", "Parker", "Reese", "Sage", "Skyler", "Tatum", "River", "Phoenix",
        "Sam", "Charlie", "Dakota", "Emerson", "Frankie", "Gray", "Harper", "Indigo",
        "Jules", "Kai", "Lennox", "Marlowe", "Noah", "Ocean", "Peyton", "Rowan"
    ]

    last_names = [
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
        "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas",
        "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson", "White",
        "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker", "Young",
        "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores"
    ]

    # First and last name are drawn alternately, one student at a time; this
    # interleaving is kept so the default seed yields the same names as before.
    names = [f"{np.random.choice(first_names)} {np.random.choice(last_names)}" for _ in range(n_students)]

    # Classes (grades 9-12), uniformly distributed
    classes = np.random.choice([9, 10, 11, 12], n_students, p=[0.25, 0.25, 0.25, 0.25])

    # Shared latent ability that induces correlation between the skills
    base_skill = np.random.normal(70, 15, n_students)
    base_skill = np.clip(base_skill, 20, 95)

    # Comprehension (correlated with base skill)
    comprehension = base_skill + np.random.normal(0, 8, n_students)
    comprehension = np.clip(comprehension, 20, 100)

    # Attention (base skill with slightly wider noise)
    attention = base_skill + np.random.normal(0, 10, n_students)
    attention = np.clip(attention, 15, 100)

    # Focus (correlated with attention)
    focus = attention + np.random.normal(0, 6, n_students)
    focus = np.clip(focus, 20, 100)

    # Retention (correlated with comprehension)
    retention = comprehension + np.random.normal(0, 7, n_students)
    retention = np.clip(retention, 25, 100)

    # Engagement time (minutes per week, correlated with attention).
    # Truncated to int first, then clipped to the 30-300 range.
    engagement_time = (attention * 0.8 + np.random.normal(0, 15, n_students)).astype(int)
    engagement_time = np.clip(engagement_time, 30, 300)

    # Assessment score: weighted blend of the four skills plus noise
    assessment_score = (
        comprehension * 0.3 +
        attention * 0.25 +
        focus * 0.2 +
        retention * 0.25 +
        np.random.normal(0, 5, n_students)
    )
    assessment_score = np.clip(assessment_score, 0, 100)

    df = pd.DataFrame({
        'student_id': student_ids,
        'name': names,
        'class': classes,
        'comprehension': np.round(comprehension, 1),
        'attention': np.round(attention, 1),
        'focus': np.round(focus, 1),
        'retention': np.round(retention, 1),
        'assessment_score': np.round(assessment_score, 1),
        'engagement_time': engagement_time
    })

    # Rule-based learning personas. Rules are evaluated in order and the first
    # match wins (np.select semantics), mirroring an if/elif chain; rows that
    # match no rule fall back to 'Balanced Learner'.
    # NOTE(review): 'Engaged Explorer' requires engagement_time > 200, but
    # engagement is built from attention * 0.8 (attention <= 100) plus N(0, 15)
    # noise, so this rule effectively never matches. Left unchanged so the
    # generated dataset stays identical; revisit the threshold or the formula.
    persona_rules = [
        ((df['comprehension'] > 80) & (df['attention'] > 80), 'High Achiever'),
        ((df['attention'] < 50) & (df['focus'] < 50), 'Distracted Learner'),
        ((df['retention'] > 85) & (df['comprehension'] > 75), 'Analytical Thinker'),
        ((df['engagement_time'] > 200) & (df['attention'] > 70), 'Engaged Explorer'),
        ((df['comprehension'] < 60) & (df['retention'] < 60), 'Struggling Student'),
    ]
    conditions = [condition for condition, _ in persona_rules]
    labels = [label for _, label in persona_rules]
    df['learning_persona'] = np.select(conditions, labels, default='Balanced Learner')

    return df

# Generate the dataset
print("Generating synthetic student dataset...")
student_df = generate_student_data(200)

# Save to CSV. Create the target directory first: to_csv does not create
# missing parent directories, so a fresh checkout without ../data would fail.
os.makedirs('../data', exist_ok=True)
student_df.to_csv('../data/student_data.csv', index=False)
print(f"Dataset saved with {len(student_df)} students")

# Display basic statistics
print("\nDataset Overview:")
print(f"Shape: {student_df.shape}")
print(f"\nColumns: {list(student_df.columns)}")
print("\nLearning Personas Distribution:")
print(student_df['learning_persona'].value_counts())
print("\nClass Distribution:")
print(student_df['class'].value_counts().sort_index())


In [None]:
# Preview the first ten generated rows
print("Sample Data:")
print(student_df.iloc[:10])

# Pairwise correlations between the four skills and the assessment score
print("\nCorrelation Matrix (Cognitive Skills vs Assessment Score):")
correlation_cols = [
    'comprehension',
    'attention',
    'focus',
    'retention',
    'assessment_score',
]
correlation_matrix = student_df.loc[:, correlation_cols].corr()
print(correlation_matrix.round(3))


In [None]:
<a id="eda"></a>

## 2. Exploratory Data Analysis

Let's explore the dataset to understand the distribution of cognitive skills and performance metrics.


In [None]:
# Create comprehensive EDA plots: a 2x3 grid with one view of the data per panel
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Cognitive Skills & Performance Distribution Analysis', fontsize=16, fontweight='bold')

# 1. Assessment Score Distribution (dashed red line marks the mean)
mean_score = student_df['assessment_score'].mean()
axes[0, 0].hist(student_df['assessment_score'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Assessment Score Distribution')
axes[0, 0].set_xlabel('Assessment Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.1f}')
axes[0, 0].legend()

# 2. Cognitive Skills Box Plot
skills_data = [student_df['comprehension'], student_df['attention'],
               student_df['focus'], student_df['retention']]
skills_labels = ['Comprehension', 'Attention', 'Focus', 'Retention']
box_plot = axes[0, 1].boxplot(skills_data, patch_artist=True)
# Label the boxes through the axis rather than boxplot(labels=...): that
# keyword is deprecated since Matplotlib 3.9 (renamed to tick_labels), while
# setting tick labels works on every version. Boxes sit at positions 1..N.
axes[0, 1].set_xticks(range(1, len(skills_labels) + 1))
axes[0, 1].set_xticklabels(skills_labels)
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow']
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)
axes[0, 1].set_title('Cognitive Skills Distribution')
axes[0, 1].set_ylabel('Score')

# 3. Learning Personas Distribution
persona_counts = student_df['learning_persona'].value_counts()
axes[0, 2].pie(persona_counts.values, labels=persona_counts.index, autopct='%1.1f%%', startangle=90)
axes[0, 2].set_title('Learning Personas Distribution')

# 4. Class Distribution
class_counts = student_df['class'].value_counts().sort_index()
axes[1, 0].bar(class_counts.index, class_counts.values, color='lightsteelblue', edgecolor='black')
# Grades are discrete: pin the ticks to the grade values so the axis does not
# show fractional grades.
axes[1, 0].set_xticks(list(class_counts.index))
axes[1, 0].set_title('Students by Class/Grade')
axes[1, 0].set_xlabel('Class')
axes[1, 0].set_ylabel('Number of Students')

# 5. Engagement Time vs Assessment Score (points coloured by score)
scatter = axes[1, 1].scatter(student_df['engagement_time'], student_df['assessment_score'],
                            c=student_df['assessment_score'], cmap='viridis', alpha=0.6)
axes[1, 1].set_title('Engagement Time vs Assessment Score')
axes[1, 1].set_xlabel('Engagement Time (minutes/week)')
axes[1, 1].set_ylabel('Assessment Score')
plt.colorbar(scatter, ax=axes[1, 1], label='Assessment Score')

# 6. Skills Correlation Heatmap
correlation_matrix = student_df[['comprehension', 'attention', 'focus', 'retention', 'assessment_score']].corr()
im = axes[1, 2].imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
n_vars = len(correlation_matrix.columns)
axes[1, 2].set_xticks(range(n_vars))
axes[1, 2].set_yticks(range(n_vars))
axes[1, 2].set_xticklabels(correlation_matrix.columns, rotation=45)
axes[1, 2].set_yticklabels(correlation_matrix.columns)
axes[1, 2].set_title('Skills Correlation Heatmap')
# The active style sheet may draw grid lines; keep them off the image.
axes[1, 2].grid(False)
# Colour scale for the heatmap, so cell colours can be read without the text
fig.colorbar(im, ax=axes[1, 2], label='Correlation')

# Add correlation values to heatmap (row i, column j)
for i in range(n_vars):
    for j in range(n_vars):
        axes[1, 2].text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}',
                        ha="center", va="center", color="black", fontweight='bold')

plt.tight_layout()
plt.show()

# Display summary statistics
print("Summary Statistics:")
print(student_df.describe().round(2))


In [None]:
<a id="correlation"></a>

## 3. Correlation Analysis

Let's analyze the correlations between cognitive skills and performance metrics.


In [None]:
# Correlation matrix over the four skills, the score and engagement time
correlation_cols = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
correlation_matrix = student_df[correlation_cols].corr()

# Annotated heatmap; the upper triangle (diagonal included) is masked so each
# pair of variables appears only once
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Detailed Correlation Matrix: Cognitive Skills & Performance', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Rank every other variable by its correlation with the assessment score
assessment_correlations = (
    correlation_matrix['assessment_score']
    .drop('assessment_score')
    .sort_values(ascending=False)
)
print("Correlation with Assessment Score (sorted by strength):")
print(assessment_correlations.round(3))

# One scatter panel per skill, each with a fitted straight line and its r value
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Individual Cognitive Skills vs Assessment Score', fontsize=16, fontweight='bold')

skills = ['comprehension', 'attention', 'focus', 'retention']
colors = ['skyblue', 'lightgreen', 'lightcoral', 'lightyellow']

# axes.flat walks the 2x2 grid row by row: (0,0), (0,1), (1,0), (1,1)
for ax, skill, color in zip(axes.flat, skills, colors):
    skill_values = student_df[skill]
    scores = student_df['assessment_score']

    ax.scatter(skill_values, scores, alpha=0.6, color=color, edgecolor='black', s=50)

    # Degree-1 least-squares fit drawn as the trend line
    z = np.polyfit(skill_values, scores, 1)
    p = np.poly1d(z)
    ax.plot(skill_values, p(skill_values), "r--", alpha=0.8, linewidth=2)

    # Correlation coefficient shown in the panel title
    corr = skill_values.corr(scores)
    skill_title = skill.title()
    ax.set_title(f'{skill_title} vs Assessment Score\n(r = {corr:.3f})')
    ax.set_xlabel(f'{skill_title} Score')
    ax.set_ylabel('Assessment Score')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
<a id="ml-model"></a>

## 4. Machine Learning Model

Let's build ML models to predict assessment scores based on cognitive skills.


In [None]:
# Feature matrix and regression target
X = student_df[['comprehension', 'attention', 'focus', 'retention', 'engagement_time', 'class']]
y = student_df['assessment_score']

# Hold out 20% of the rows for testing; fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


def _regression_metrics(y_true, y_pred):
    """Return ``(r2, rmse, mae)`` for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


# Candidate models, fitted and reported in insertion order
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

model_results = {}

for name, model in models.items():
    # Fit on the training split, then predict both splits
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2, train_rmse, train_mae = _regression_metrics(y_train, y_pred_train)
    test_r2, test_rmse, test_mae = _regression_metrics(y_test, y_pred_test)

    # Keep the fitted model and its test predictions for the plots below
    model_results[name] = dict(
        model=model,
        train_r2=train_r2,
        test_r2=test_r2,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        train_mae=train_mae,
        test_mae=test_mae,
        y_pred_test=y_pred_test,
    )

    print(f"\n{name} Results:")
    print(f"  Training R²: {train_r2:.3f}")
    print(f"  Test R²: {test_r2:.3f}")
    print(f"  Training RMSE: {train_rmse:.3f}")
    print(f"  Test RMSE: {test_rmse:.3f}")
    print(f"  Training MAE: {train_mae:.3f}")
    print(f"  Test MAE: {test_mae:.3f}")

# Predicted vs actual on the test split, one panel per model; the dashed red
# diagonal is the line of perfect prediction
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

score_span = [y_test.min(), y_test.max()]
for ax, (name, results) in zip(axes, model_results.items()):
    ax.scatter(y_test, results['y_pred_test'], alpha=0.6, s=50)
    ax.plot(score_span, score_span, 'r--', lw=2)
    ax.set_xlabel('Actual Assessment Score')
    ax.set_ylabel('Predicted Assessment Score')
    ax.set_title(f'{name}\nR² = {results["test_r2"]:.3f}, RMSE = {results["test_rmse"]:.3f}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Feature importance for Random Forest, most important feature first
if 'Random Forest' in model_results:
    rf_model = model_results['Random Forest']['model']
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': rf_model.feature_importances_}
    ).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
    plt.title('Feature Importance - Random Forest Model', fontsize=14, fontweight='bold')
    plt.xlabel('Feature Importance')
    plt.tight_layout()
    plt.show()

    print("\nFeature Importance (Random Forest):")
    print(feature_importance)


In [None]:
<a id="clustering"></a>

## 5. Student Clustering

Let's cluster students into learning personas using K-means clustering.


In [None]:
# Features used for clustering (skills, engagement and the score itself)
clustering_features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time', 'assessment_score']
X_cluster = student_df[clustering_features]

# Standardize so every feature contributes on the same scale
scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(X_cluster)

# Elbow method: record the K-means inertia for k = 2..10
K_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster_scaled).inertia_
    for k in K_range
]

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True, alpha=0.3)
plt.show()

# Final clustering uses k=6 (based on our predefined personas), not the elbow
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_cluster_scaled)

# Attach the cluster label of each student to the dataframe
student_df['cluster'] = cluster_labels

# Per-cluster feature means
cluster_analysis = student_df.groupby('cluster')[clustering_features].mean().round(2)
print("Cluster Analysis - Average Values:")
print(cluster_analysis)

# Cross-tabulate rule-based personas against the K-means clusters
persona_cluster_mapping = (
    student_df
    .groupby(['learning_persona', 'cluster'])
    .size()
    .unstack(fill_value=0)
)
print("\nPersona vs Cluster Mapping:")
print(persona_cluster_mapping)

# Visualize clusters
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Student Clusters Analysis', fontsize=16, fontweight='bold')

# Panels 1-3: scatter plots coloured by cluster.
# Each entry: (axis, x column, y column, x label, y label, title)
scatter_panels = [
    (axes[0, 0], 'comprehension', 'attention',
     'Comprehension', 'Attention', 'Comprehension vs Attention (Clusters)'),
    (axes[0, 1], 'focus', 'retention',
     'Focus', 'Retention', 'Focus vs Retention (Clusters)'),
    (axes[1, 0], 'engagement_time', 'assessment_score',
     'Engagement Time', 'Assessment Score', 'Engagement Time vs Assessment Score (Clusters)'),
]
for ax, x_col, y_col, x_label, y_label, panel_title in scatter_panels:
    scatter = ax.scatter(student_df[x_col], student_df[y_col],
                         c=student_df['cluster'], cmap='tab10', alpha=0.7, s=60)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    plt.colorbar(scatter, ax=ax, label='Cluster')

# Panel 4: number of students per cluster
cluster_counts = student_df['cluster'].value_counts().sort_index()
size_ax = axes[1, 1]
size_ax.bar(cluster_counts.index, cluster_counts.values, color='lightsteelblue', edgecolor='black')
size_ax.set_xlabel('Cluster')
size_ax.set_ylabel('Number of Students')
size_ax.set_title('Cluster Distribution')
size_ax.set_xticks(range(n_clusters))

plt.tight_layout()
plt.show()

# Text profile per cluster: size, feature means and the most frequent persona
print("\nCluster Profiles:")
for cluster_id in range(n_clusters):
    members = student_df.loc[student_df['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} ({len(members)} students):")
    print(f"  Average Comprehension: {members['comprehension'].mean():.1f}")
    print(f"  Average Attention: {members['attention'].mean():.1f}")
    print(f"  Average Focus: {members['focus'].mean():.1f}")
    print(f"  Average Retention: {members['retention'].mean():.1f}")
    print(f"  Average Assessment Score: {members['assessment_score'].mean():.1f}")
    print(f"  Average Engagement Time: {members['engagement_time'].mean():.1f}")
    print(f"  Most Common Persona: {members['learning_persona'].mode().iloc[0]}")


In [None]:
<a id="insights"></a>

## 6. Key Insights and Findings

Let's summarize the key findings from our analysis.


In [None]:
# Generate comprehensive insights
print("=" * 60)
print("COGNITIVE SKILLS & STUDENT PERFORMANCE ANALYSIS - KEY FINDINGS")
print("=" * 60)

# 1. Overall Performance Statistics
print("\n1. OVERALL PERFORMANCE STATISTICS:")
print(f"   • Total Students Analyzed: {len(student_df)}")
print(f"   • Average Assessment Score: {student_df['assessment_score'].mean():.1f} ± {student_df['assessment_score'].std():.1f}")
print(f"   • Score Range: {student_df['assessment_score'].min():.1f} - {student_df['assessment_score'].max():.1f}")

# 2. Cognitive Skills Analysis (mean ± standard deviation per skill)
print("\n2. COGNITIVE SKILLS ANALYSIS:")
skills_stats = student_df[['comprehension', 'attention', 'focus', 'retention']].describe()
for skill in ['comprehension', 'attention', 'focus', 'retention']:
    mean_val = skills_stats.loc['mean', skill]
    std_val = skills_stats.loc['std', skill]
    print(f"   • {skill.title()}: {mean_val:.1f} ± {std_val:.1f}")

# 3. Correlation Insights (sorted from highest to lowest correlation)
print("\n3. CORRELATION INSIGHTS:")
correlation_with_score = student_df[['comprehension', 'attention', 'focus', 'retention', 'engagement_time']].corrwith(student_df['assessment_score']).sort_values(ascending=False)
for skill, corr in correlation_with_score.items():
    strength = "Strong" if abs(corr) > 0.7 else "Moderate" if abs(corr) > 0.5 else "Weak"
    print(f"   • {skill.title()}: {corr:.3f} ({strength} correlation)")

# 4. Learning Personas Distribution
print("\n4. LEARNING PERSONAS DISTRIBUTION:")
persona_counts = student_df['learning_persona'].value_counts()
for persona, count in persona_counts.items():
    percentage = (count / len(student_df)) * 100
    print(f"   • {persona}: {count} students ({percentage:.1f}%)")

# 5. Model Performance Summary
print("\n5. MACHINE LEARNING MODEL PERFORMANCE:")
if model_results:
    # Report the model with the highest test-set R² instead of assuming a
    # winner: the label "Best Model" must follow from the measured results.
    best_model_name = max(model_results, key=lambda name: model_results[name]['test_r2'])
    best_results = model_results[best_model_name]
    print(f"   • Best Model: {best_model_name}")
    print(f"   • R² Score: {best_results['test_r2']:.3f}")
    print(f"   • RMSE: {best_results['test_rmse']:.3f}")
    print(f"   • MAE: {best_results['test_mae']:.3f}")

# 6. Key Recommendations
print("\n6. KEY RECOMMENDATIONS:")
# Name the two cognitive skills that actually correlate most strongly with the
# score in this dataset (engagement_time is not a skill, so it is excluded).
top_skills = list(correlation_with_score.drop('engagement_time').index[:2])
print(f"   • Focus on {top_skills[0]} and {top_skills[1]} skills as they show strongest correlation with performance")
print("   • Implement targeted interventions for 'Struggling Students' and 'Distracted Learners'")
print("   • Leverage 'High Achievers' and 'Analytical Thinkers' as peer mentors")
print("   • Monitor engagement time as it correlates with attention and performance")
print("   • Use ML models to predict at-risk students early for intervention")

# 7. Class-wise Analysis
print("\n7. CLASS-WISE PERFORMANCE:")
class_performance = student_df.groupby('class')['assessment_score'].agg(['mean', 'std', 'count']).round(2)
for class_id, stats in class_performance.iterrows():
    # iterrows yields each row as a float Series, so cast the count back to
    # int; otherwise it prints as e.g. "50.0 students".
    print(f"   • Class {class_id}: {stats['mean']:.1f} ± {stats['std']:.1f} ({int(stats['count'])} students)")

print("\n" + "=" * 60)
print("ANALYSIS COMPLETE - Ready for Dashboard Implementation")
print("=" * 60)

# Save the final dataset with all analysis. Create the output directory first:
# to_csv does not create missing parent directories.
output_path = '../data/student_data_with_analysis.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
student_df.to_csv(output_path, index=False)
print(f"\nFinal dataset with analysis saved to: {output_path}")
