In [None]:
# NeuroKnow: AI-Powered Learning Optimization for Neurodivergent Students
# Comprehensive Model Comparison Study

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation notices.
warnings.filterwarnings(action="ignore")

# Plot appearance: apply the Matplotlib style sheet first, then the seaborn
# colour cycle (the style sheet would otherwise reset the palette).
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

In [None]:
# Cell 1: Introduction
# Title, abstract and keywords of the study, emitted verbatim as plain text.
_intro_text = """
# NeuroKnow: AI-Powered Learning Optimization for Neurodivergent Students
## Predicting Optimal Learning Modalities Using Machine Learning

**Abstract** - This study compares multiple machine learning algorithms to identify the best model 
for predicting optimal learning pathways for neurodivergent students. By analyzing cognitive profiles, 
learning patterns, and performance data, we aim to create personalized educational recommendations 
that maximize learning efficiency and knowledge retention. The project evaluates eight different 
algorithms using cross-validation and ROC-AUC analysis to determine the most effective approach 
for educational personalization.

**Keywords** - neurodivergent learning, educational AI, personalized learning, cognitive profiling, 
machine learning, cross-validation, ROC analysis, modality optimization
"""
print(_intro_text)

In [None]:
# Cell 2: Generate Synthetic Cognitive Dataset
def generate_neurodivergent_dataset(n_samples=1000, random_state=42):
    """Generate a synthetic dataset of neurodivergent learning profiles.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of student profiles (rows) to generate.
    random_state : int or None, default 42
        Seed handed to ``np.random.seed`` so repeated calls return identical
        data. Note that this reseeds NumPy's *global* generator (the same
        side effect the previously hard-coded seed had). Pass ``None`` to
        leave the global generator untouched; output is then not reproducible.

    Returns
    -------
    pandas.DataFrame
        Eleven feature columns plus the integer target ``optimal_modality``
        (0: Visual, 1: Auditory, 2: Kinesthetic, 3: Logical). Roughly 5% of
        the values in ``working_memory``, ``processing_speed`` and
        ``error_recovery_rate`` are replaced by NaN.
    """
    if random_state is not None:
        np.random.seed(random_state)

    # The order of the draws below is significant: it fixes which random
    # numbers each column receives for a given seed.
    data = {
        'age': np.random.randint(8, 18, n_samples),              # 8-17 (upper bound exclusive)
        'attention_span': np.random.normal(25, 10, n_samples),   # minutes
        'working_memory': np.random.normal(6, 2, n_samples),     # digit span
        'processing_speed': np.random.normal(85, 15, n_samples), # standardized score
        'visual_learning_score': np.random.normal(70, 20, n_samples),
        'auditory_learning_score': np.random.normal(65, 18, n_samples),
        'kinesthetic_learning_score': np.random.normal(75, 22, n_samples),
        'logical_reasoning_score': np.random.normal(80, 15, n_samples),
        'error_recovery_rate': np.random.normal(0.6, 0.2, n_samples), # 0-1 scale
        'abstraction_ability': np.random.normal(70, 18, n_samples),
        'previous_success_rate': np.random.normal(0.7, 0.15, n_samples)
    }
    df = pd.DataFrame(data)

    # Normal draws are unbounded, so clamp the columns whose meaning implies
    # a range: durations/spans cannot be negative and rates live in [0, 1].
    # Values already inside the range are left exactly as drawn.
    for col in ('attention_span', 'working_memory'):
        df[col] = df[col].clip(lower=0)
    for col in ('error_recovery_rate', 'previous_success_rate'):
        df[col] = df[col].clip(lower=0, upper=1)

    # Create target variable: optimal learning modality (0: Visual, 1: Auditory, 2: Kinesthetic, 3: Logical)
    # Based on highest learning score with some noise
    scores = df[['visual_learning_score', 'auditory_learning_score',
                 'kinesthetic_learning_score', 'logical_reasoning_score']].values
    optimal_modality = np.argmax(scores, axis=1)

    # Label noise: 10% of rows are shifted to the previous class and 10% to
    # the next one; the modulo wraps 3 -> 0 and 0 -> 3 so labels stay in 0..3.
    noise = np.random.choice([-1, 0, 1], size=n_samples, p=[0.1, 0.8, 0.1])
    df['optimal_modality'] = (optimal_modality + noise) % 4

    # Add some missing values realistically (5% missing)
    for col in ['working_memory', 'processing_speed', 'error_recovery_rate']:
        mask = np.random.random(n_samples) < 0.05
        df.loc[mask, col] = np.nan

    return df

# Build the synthetic cohort once and take a first look at it
cognitive_df = generate_neurodivergent_dataset()

print("Dataset Overview:")
print("Shape: {}".format(cognitive_df.shape))

print("\nFirst 5 rows:")
display(cognitive_df.head(5))

# .info() writes its own report (dtypes, non-null counts) to stdout
print("\nDataset Info:")
cognitive_df.info()

In [None]:
# Cell 3: Data Cleaning and Exploration
def handle_missing_data(df, strategy='mean', columns=None):
    """Return a copy of ``df`` with missing values imputed or dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    strategy : {'mean', 'median', 'drop'}, default 'mean'
        ``'mean'`` / ``'median'`` fill NaNs in ``columns`` with that column's
        mean / median; ``'drop'`` removes every row containing any NaN.
    columns : sequence of str, optional
        Columns to impute. Defaults to ``working_memory``,
        ``processing_speed`` and ``error_recovery_rate``. Ignored when
        ``strategy='drop'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names (previously any
        unknown value silently dropped rows).
    KeyError
        If a column to impute is not present in ``df``.
    """
    if strategy not in ('mean', 'median', 'drop'):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'mean', 'median' or 'drop'"
        )

    df_clean = df.copy()

    if strategy == 'drop':
        return df_clean.dropna()

    if columns is None:
        columns = ['working_memory', 'processing_speed', 'error_recovery_rate']

    for col in columns:
        if strategy == 'mean':
            fill_value = df_clean[col].mean()
        else:
            fill_value = df_clean[col].median()
        # Assign the result back instead of `df_clean[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary under pandas
        # Copy-on-Write and would leave df_clean unchanged.
        df_clean[col] = df_clean[col].fillna(fill_value)

    return df_clean

# Two cleaned variants of the same raw frame, kept side by side so the
# effect of imputing versus discarding incomplete rows can be compared
df_mean_filled = handle_missing_data(cognitive_df, strategy='mean')
df_dropped = handle_missing_data(cognitive_df, strategy='drop')

print("Dataset sizes after preprocessing:")
print("Mean-filled: {}".format(df_mean_filled.shape))
print("Dropped missing: {}".format(df_dropped.shape))

# Visualize the distribution of optimal modalities
# Class labels in target-code order (0..3), used as x tick labels
modality_names = ['Visual', 'Auditory', 'Kinesthetic', 'Logical']

# One row with three panels; this loop fills the first two, one per
# cleaning strategy, and leaves panel 2 as the current axes.
plt.figure(figsize=(15, 5))

for _panel, (_frame, _label) in enumerate(
        [(df_mean_filled, 'Mean-filled'), (df_dropped, 'Dropped')], start=1):
    plt.subplot(1, 3, _panel)
    # sort_index() orders the bars by class code so they line up with the tick labels
    _frame['optimal_modality'].value_counts().sort_index().plot(kind='bar')
    plt.title(f'Optimal Modality Distribution ({_label})')
    plt.xticks(ticks=range(4), labels=modality_names, rotation=45)

