In [16]:
# Install required packages using magic commands
import sys
!{sys.executable} -m pip install pandas>=1.5.0 numpy>=1.21.0 matplotlib>=3.5.0 seaborn>=0.11.0 scikit-learn>=1.1.0 --quiet

print("✅ Packages installed successfully!")


✅ Packages installed successfully!



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import custom data loader
from data_loader import MathDatasetLoader

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
print("Libraries imported successfully!")


Libraries imported successfully!


In [18]:
# Initialize loader and load all datasets
loader = MathDatasetLoader()
datasets = loader.get_all_datasets()

# Show summary
for name, df in datasets.items():
    print(f'{name}: {df.shape[0]} samples, {df.shape[1]} features')
    print(f'Columns: {list(df.columns)}')
    print()


Loaded GSM8K train dataset from local file
Loaded GSM8K test dataset from local file
Loaded MathQA train dataset from local file
Loaded MathQA test dataset from local file
Loaded MAWPS train dataset from local file
Loaded MAWPS test dataset from local file
Custom dataset not found. Creating sample data...
gsm8k_train: 5 samples, 7 features
Columns: ['question', 'answer', 'solution', 'difficulty', 'category', 'dataset', 'split']

gsm8k_test: 5 samples, 7 features
Columns: ['question', 'answer', 'solution', 'difficulty', 'category', 'dataset', 'split']

mathqa_train: 5 samples, 7 features
Columns: ['Problem', 'Rationale', 'correct', 'options', 'category', 'dataset', 'split']

mathqa_test: 5 samples, 7 features
Columns: ['Problem', 'Rationale', 'correct', 'options', 'category', 'dataset', 'split']

mawps_train: 5 samples, 7 features
Columns: ['sQuestion', 'lSolutions', 'lEquations', 'iIndex', 'category', 'dataset', 'split']

mawps_test: 5 samples, 7 features
Columns: ['sQuestion', 'lSolut

In [19]:
def validate_dataset(df: pd.DataFrame) -> Dict[str, Any]:
    """Validate dataset with safe handling of list columns."""
    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()
    
    # Handle duplicates more safely - exclude list columns
    try:
        # Find columns that are safe to check for duplicates
        safe_columns = []
        for col in df.columns:
            if df[col].dtype in ['object']:
                # Check if this column contains lists by sampling
                sample_val = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
                if not isinstance(sample_val, (list, dict)):
                    safe_columns.append(col)
            else:
                safe_columns.append(col)
        
        if safe_columns:
            result['duplicates'] = df[safe_columns].duplicated().sum()
            result['duplicate_check_columns'] = safe_columns
        else:
            result['duplicates'] = "Cannot check - all columns contain complex objects"
            result['duplicate_check_columns'] = []
    except Exception as e:
        result['duplicates'] = f"Error checking duplicates: {str(e)}"
        result['duplicate_check_columns'] = []
    
    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Validate all datasets
print("Validating datasets...")
validation_results = {name: validate_dataset(df) for name, df in datasets.items()}

import pprint
pprint.pprint(validation_results)


Validating datasets...
{'custom': {'data_types': {'dataset': 'object',
                           'difficulty_level': 'object',
                           'final_answer': 'object',
                           'problem_id': 'object',
                           'problem_text': 'object',
                           'solution_steps': 'object',
                           'step_count': 'int64',
                           'subject': 'object',
                           'symbolic_complexity': 'int64'},
            'duplicate_check_columns': ['problem_id',
                                        'problem_text',
                                        'difficulty_level',
                                        'subject',
                                        'final_answer',
                                        'symbolic_complexity',
                                        'step_count',
                                        'dataset'],
            'duplicates': 0,
            'missing_values

In [None]:
# FIXED VERSION - Copy this into your notebook cell
def validate_dataset_fixed(df: pd.DataFrame) -> Dict[str, Any]:
    """Validate dataset with safe handling of list columns."""
    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()
    
    # Handle duplicates safely by converting lists to strings
    try:
        # Create a copy for duplicate checking
        df_check = df.copy()
        
        # Convert list columns to strings for duplicate checking
        for col in df_check.columns:
            if df_check[col].dtype == 'object':
                # Check if column contains lists
                sample_val = df_check[col].dropna().iloc[0] if not df_check[col].dropna().empty else None
                if isinstance(sample_val, list):
                    df_check[col] = df_check[col].astype(str)
        
        result['duplicates'] = df_check.duplicated().sum()
    except Exception as e:
        result['duplicates'] = f"Error: {str(e)}"
    
    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Use the fixed function
print("Validating datasets with fixed function...")
validation_results = {name: validate_dataset_fixed(df) for name, df in datasets.items()}

import pprint
pprint.pprint(validation_results)


In [None]:
# FIXED VERSION - Safe data cleaning for datasets with list columns
df = datasets['custom'].copy()

# Clean text fields first
def clean_text(text):
    if pd.isna(text): return ''
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return text

df['problem_text'] = df['problem_text'].apply(clean_text)

# Safe duplicate removal - only check specific columns that don't contain lists
def safe_drop_duplicates(df):
    """Safely remove duplicates by only checking non-list columns."""
    try:
        # Identify columns that are safe to check for duplicates
        safe_columns = []
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if column contains lists
                sample_val = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
                if not isinstance(sample_val, list):
                    safe_columns.append(col)
            else:
                safe_columns.append(col)
        
        print(f"Checking duplicates on columns: {safe_columns}")
        
        if safe_columns:
            # Only check duplicates on safe columns
            df_clean = df.drop_duplicates(subset=safe_columns)
        else:
            print("No safe columns for duplicate checking - skipping duplicate removal")
            df_clean = df.copy()
        
        return df_clean
    except Exception as e:
        print(f"Error in duplicate removal: {e}")
        return df

# Apply safe duplicate removal
df = safe_drop_duplicates(df)

# Remove completely empty rows
df = df.dropna(how='all')

print(f"Dataset shape after cleaning: {df.shape}")
print(df.head())


In [20]:
# Install required packages using magic commands
import sys
!{sys.executable} -m pip install pandas>=1.5.0 numpy>=1.21.0 matplotlib>=3.5.0 seaborn>=0.11.0 scikit-learn>=1.1.0 --quiet

# Alternative magic command method
# %pip install pandas numpy matplotlib seaborn scikit-learn --quiet

print("✅ Packages installed successfully!")


✅ Packages installed successfully!



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import custom data loader
from data_loader import MathDatasetLoader

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
print("Libraries imported successfully!")


Libraries imported successfully!


In [22]:
# Initialize loader and load all datasets
loader = MathDatasetLoader()
datasets = loader.get_all_datasets()

# Show summary
for name, df in datasets.items():
    print(f'{name}: {df.shape[0]} samples, {df.shape[1]} features')
    print(f'Columns: {list(df.columns)}')
    print()


Loaded GSM8K train dataset from local file
Loaded GSM8K test dataset from local file
Loaded MathQA train dataset from local file
Loaded MathQA test dataset from local file
Loaded MAWPS train dataset from local file
Loaded MAWPS test dataset from local file
Custom dataset not found. Creating sample data...
gsm8k_train: 5 samples, 7 features
Columns: ['question', 'answer', 'solution', 'difficulty', 'category', 'dataset', 'split']

gsm8k_test: 5 samples, 7 features
Columns: ['question', 'answer', 'solution', 'difficulty', 'category', 'dataset', 'split']

mathqa_train: 5 samples, 7 features
Columns: ['Problem', 'Rationale', 'correct', 'options', 'category', 'dataset', 'split']

mathqa_test: 5 samples, 7 features
Columns: ['Problem', 'Rationale', 'correct', 'options', 'category', 'dataset', 'split']

mawps_train: 5 samples, 7 features
Columns: ['sQuestion', 'lSolutions', 'lEquations', 'iIndex', 'category', 'dataset', 'split']

mawps_test: 5 samples, 7 features
Columns: ['sQuestion', 'lSolut

In [24]:
def validate_dataset_fixed(df):
    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()
    
    # Handle duplicates safely by converting lists to strings
    try:
        df_check = df.copy()
        for col in df_check.columns:
            if df_check[col].dtype == 'object':
                sample_val = df_check[col].dropna().iloc[0] if not df_check[col].dropna().empty else None
                if isinstance(sample_val, list):
                    df_check[col] = df_check[col].astype(str)
        result['duplicates'] = df_check.duplicated().sum()
    except Exception as e:
        result['duplicates'] = f"Error: {str(e)}"
    
    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Use the fixed function
validation_results = {name: validate_dataset_fixed(df) for name, df in datasets.items()}
import pprint
pprint.pprint(validation_results)

{'custom': {'data_types': {'dataset': 'object',
                           'difficulty_level': 'object',
                           'final_answer': 'object',
                           'problem_id': 'object',
                           'problem_text': 'object',
                           'solution_steps': 'object',
                           'step_count': 'int64',
                           'subject': 'object',
                           'symbolic_complexity': 'int64'},
            'duplicates': 0,
            'missing_values': {'dataset': 0,
                               'difficulty_level': 0,
                               'final_answer': 0,
                               'problem_id': 0,
                               'problem_text': 0,
                               'solution_steps': 0,
                               'step_count': 0,
                               'subject': 0,
                               'symbolic_complexity': 0},
            'shape': (5, 9)},
 'gsm8k_test': {

In [None]:
# FIXED VERSION - Safe data cleaning for datasets with list columns
df = datasets['custom'].copy()

# Clean text fields first
def clean_text(text):
    if pd.isna(text): return ''
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return text

df['problem_text'] = df['problem_text'].apply(clean_text)

# Safe duplicate removal - only check specific columns that don't contain lists
def safe_drop_duplicates(df):
    """Safely remove duplicates by only checking non-list columns."""
    try:
        # Identify columns that are safe to check for duplicates
        safe_columns = []
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if column contains lists
                sample_val = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
                if not isinstance(sample_val, list):
                    safe_columns.append(col)
            else:
                safe_columns.append(col)
        
        print(f"Checking duplicates on columns: {safe_columns}")
        
        if safe_columns:
            # Only check duplicates on safe columns
            df_clean = df.drop_duplicates(subset=safe_columns)
        else:
            print("No safe columns for duplicate checking - skipping duplicate removal")
            df_clean = df.copy()
        
        return df_clean
    except Exception as e:
        print(f"Error in duplicate removal: {e}")
        return df

# Apply safe duplicate removal
df = safe_drop_duplicates(df)

# Remove completely empty rows
df = df.dropna(how='all')

print(f"Dataset shape after cleaning: {df.shape}")
print(df.head())

TypeError: unhashable type: 'list'

In [None]:
# Example: Extract text length and operator count
def extract_features(text):
    features = {}
    features['text_length'] = len(text)
    features['operator_count'] = sum(text.count(op) for op in ['+', '-', '*', '/', '=', '^'])
    return features

features_df = df['problem_text'].apply(extract_features).apply(pd.Series)
df = pd.concat([df, features_df], axis=1)

print(df.head())


In [None]:
# Save cleaned and feature-engineered data
df.to_csv('preprocessed_custom_math_data.csv', index=False)
print("Saved preprocessed data to 'preprocessed_custom_math_data.csv'")
