In [1]:
# Install required packages using magic commands
import sys
!{sys.executable} -m pip install pandas>=1.5.0 numpy>=1.21.0 matplotlib>=3.5.0 seaborn>=0.11.0 scikit-learn>=1.1.0 datasets requests --quiet

print("✅ Packages installed successfully!")


✅ Packages installed successfully!



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Download and save real datasets for GSM8K, MathQA, and MAWPS
import pandas as pd
import requests
from pathlib import Path

def download_gsm8k():
    """Download the GSM8K "main" config and write its train/test splits to CSV.

    Both splits are loaded through `datasets.load_dataset`, reshaped into
    DataFrames that share one column layout, and saved as
    data/gsm8k_train.csv and data/gsm8k_test.csv. The `solution` column is a
    copy of the raw `answer` field; `difficulty`, `category` and `dataset`
    are constant labels.

    Returns:
        bool: True once both files are written; False if any step raised
        (the exception is printed, not re-raised).
    """
    try:
        from datasets import load_dataset
        print("Downloading GSM8K dataset...")

        # Fetch both splits up front, train first and then test.
        raw_splits = {
            split_name: load_dataset("gsm8k", "main", split=split_name)
            for split_name in ('train', 'test')
        }

        # Reshape each split into the common column layout.
        frames = {}
        for split_name, records in raw_splits.items():
            row_count = len(records)
            frames[split_name] = pd.DataFrame({
                'question': records['question'],
                'answer': records['answer'],
                'solution': records['answer'],
                'difficulty': ['intermediate'] * row_count,
                'category': ['word_problem'] * row_count,
                'dataset': ['gsm8k'] * row_count,
                'split': [split_name] * row_count
            })

        train_df, test_df = frames['train'], frames['test']
        train_df.to_csv('data/gsm8k_train.csv', index=False)
        test_df.to_csv('data/gsm8k_test.csv', index=False)
        print(f'✅ GSM8K: {len(train_df)} train, {len(test_df)} test samples')
        return True
    except Exception as e:
        print(f'❌ GSM8K download failed: {e}')
        return False

def download_mathqa():
    """Download MathQA's train split and save an 80/20 positional split as CSV.

    Only the hub's "train" split is loaded; its first 80% of rows go to
    data/mathqa_train.csv and the remainder to data/mathqa_test.csv. The
    original `Problem`, `Rationale`, `correct` and `options` fields are kept
    and constant `category`/`dataset`/`split` labels are added.

    Returns:
        bool: True once both files are written; False if any step raised
        (the exception is printed, not re-raised).
    """
    try:
        from datasets import load_dataset
        print("Downloading MathQA dataset...")
        # trust_remote_code=True lets the dataset's loading script execute locally.
        ds = load_dataset("math_qa", split="train", trust_remote_code=True)
        total = len(ds)
        cutoff = int(0.8 * total)

        def _as_frame(subset, split_label):
            # Copy the four source fields and tag every row with constant labels.
            row_count = len(subset)
            return pd.DataFrame({
                'Problem': subset['Problem'],
                'Rationale': subset['Rationale'],
                'correct': subset['correct'],
                'options': subset['options'],
                'category': ['math'] * row_count,
                'dataset': ['mathqa'] * row_count,
                'split': [split_label] * row_count
            })

        train_df = _as_frame(ds.select(range(cutoff)), 'train')
        test_df = _as_frame(ds.select(range(cutoff, total)), 'test')

        train_df.to_csv('data/mathqa_train.csv', index=False)
        test_df.to_csv('data/mathqa_test.csv', index=False)
        print(f'✅ MathQA: {len(train_df)} train, {len(test_df)} test samples')
        return True
    except Exception as e:
        print(f'❌ MathQA download failed: {e}')
        return False

def download_mawps():
    """Fetch the MAWPS word problems (or a built-in sample) and save them as CSV.

    Each candidate URL is tried in order and the first one that returns
    parseable JSON wins. If none succeeds, ten hard-coded sample problems are
    used instead, so a True return does NOT guarantee that the data came from
    the network. The first 80% of records (by position) are written to
    data/mawps_train.csv and the rest to data/mawps_test.csv.

    Returns:
        bool: True if both CSV files were written; False if anything raised
        while building or saving them (the error is printed, not re-raised).
    """
    try:
        print("Downloading MAWPS dataset...")
        # Try multiple MAWPS URLs since the original might be down
        urls = [
            'https://raw.githubusercontent.com/wang-research-lab/regal/main/data/mawps.json',
            'https://raw.githubusercontent.com/allenai/mawps/master/data/mawps_no_anonymized.json'
        ]

        data = None
        for url in urls:
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                data = response.json()
                print(f"Successfully fetched from: {url}")
                break
            except Exception as fetch_error:
                # Still best-effort, but say why this mirror was skipped.
                # Catching Exception rather than using a bare `except` keeps
                # Ctrl-C / kernel interrupts working during the 30 s timeout.
                print(f"⚠️ Could not fetch {url}: {fetch_error}")

        if data is None:
            # Create sample MAWPS data if download fails
            print("Creating sample MAWPS data...")
            data = [
                {'sQuestion': 'A train travels 120 miles in 2 hours. What is its speed?', 'lSolutions': ['60'], 'lEquations': ['120/2']},
                {'sQuestion': 'John has 15 apples. He gives 8 to Mary. How many does he have left?', 'lSolutions': ['7'], 'lEquations': ['15-8']},
                {'sQuestion': 'A rectangle has length 12 and width 8. What is its area?', 'lSolutions': ['96'], 'lEquations': ['12*8']},
                {'sQuestion': 'Sarah buys 5 packs of gum. Each pack has 12 pieces. How many pieces total?', 'lSolutions': ['60'], 'lEquations': ['5*12']},
                {'sQuestion': 'There are 28 students. 15 are girls. How many are boys?', 'lSolutions': ['13'], 'lEquations': ['28-15']},
                {'sQuestion': 'A car travels 180 miles in 3 hours. What is its speed?', 'lSolutions': ['60'], 'lEquations': ['180/3']},
                {'sQuestion': 'Mike has 24 stickers. He divides them equally among 4 friends. How many does each get?', 'lSolutions': ['6'], 'lEquations': ['24/4']},
                {'sQuestion': 'A box contains 36 chocolates arranged in 6 rows. How many chocolates per row?', 'lSolutions': ['6'], 'lEquations': ['36/6']},
                {'sQuestion': 'Lisa saves $5 per week for 8 weeks. How much does she save total?', 'lSolutions': ['40'], 'lEquations': ['5*8']},
                {'sQuestion': 'A pizza is cut into 8 slices. Tom eats 3 slices. How many are left?', 'lSolutions': ['5'], 'lEquations': ['8-3']}
            ]

        # Loop-invariant: records before this position are 'train', the rest 'test'.
        train_cutoff = int(0.8 * len(data))

        # Normalise every record to the same keys, filling defaults for missing ones.
        problems = [
            {
                'sQuestion': item.get('sQuestion', f'Problem {i+1}'),
                'lSolutions': item.get('lSolutions', ['0']),
                'lEquations': item.get('lEquations', []),
                'iIndex': item.get('iIndex', i),
                'category': item.get('category', 'word_problem'),
                'dataset': 'mawps',
                'split': 'train' if i < train_cutoff else 'test'
            }
            for i, item in enumerate(data)
        ]
        df = pd.DataFrame(problems)
        train_df = df[df['split'] == 'train']
        test_df = df[df['split'] == 'test']
        train_df.to_csv('data/mawps_train.csv', index=False)
        test_df.to_csv('data/mawps_test.csv', index=False)
        print(f'✅ MAWPS: {len(train_df)} train, {len(test_df)} test samples')
        return True
    except Exception as e:
        print(f'❌ MAWPS download failed: {e}')
        return False

# The download helpers write into ./data, so create it before calling them
Path('data').mkdir(exist_ok=True)

print("🚀 Starting dataset download...")

# Each helper returns True/False; run them in order and count the successes.
success_count = sum(
    1
    for fetch in (download_gsm8k, download_mathqa, download_mawps)
    if fetch()
)

print(f"\n🎉 Successfully downloaded {success_count}/3 datasets")
print("Now you have real data instead of sample data!")


🚀 Starting dataset download...


  from .autonotebook import tqdm as notebook_tqdm


Downloading GSM8K dataset...


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import custom data loader
from data_loader import MathDatasetLoader

# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6, while the
# install cell allows 3.5, where the same style sheet is called 'seaborn'.
# Pick whichever name this matplotlib actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")
print("Libraries imported successfully!")


In [None]:
# Build the project loader and pull every dataset it exposes.
# NOTE(review): presumably a mapping of dataset name -> pandas DataFrame; confirm in data_loader.
loader = MathDatasetLoader()
datasets = loader.get_all_datasets()

# Per dataset: row and column counts, the column names, then a blank separator line
for name, df in datasets.items():
    print(
        f'{name}: {df.shape[0]} samples, {df.shape[1]} features',
        f'Columns: {list(df.columns)}',
        '',
        sep='\n',
    )


In [None]:
def validate_dataset(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarise basic quality checks for one dataset.

    Duplicate rows are counted using only the columns whose values are all
    hashable. Every value of each object column is inspected (not just the
    first one), so a column that mixes scalars with lists/dicts is excluded
    instead of making `duplicated()` raise.

    Args:
        df: The dataset to inspect. It is not modified.

    Returns:
        A dict with these keys:
          - 'missing_values': null count per column.
          - 'duplicates': number of duplicate rows as a plain int, or an
            explanatory string if the check could not run.
          - 'duplicate_check_columns': columns used for the duplicate check.
          - 'data_types': dtype name per column.
          - 'shape': the (rows, columns) tuple.
    """
    def _is_hashable(value) -> bool:
        # Lists, dicts, sets, arrays, ... cannot be hashed and break duplicated().
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def _column_is_safe(series) -> bool:
        # Only object columns can hold arbitrary Python containers.
        if series.dtype != object:
            return True
        return bool(series.map(_is_hashable).all())

    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()

    # Handle duplicates safely - exclude columns holding unhashable objects
    try:
        safe_columns = [col for col in df.columns if _column_is_safe(df[col])]

        if safe_columns:
            # int(...) turns the numpy scalar into a plain, JSON-friendly int.
            result['duplicates'] = int(df[safe_columns].duplicated().sum())
            result['duplicate_check_columns'] = safe_columns
        else:
            result['duplicates'] = "Cannot check - all columns contain complex objects"
            result['duplicate_check_columns'] = []
    except Exception as e:
        result['duplicates'] = f"Error checking duplicates: {str(e)}"
        result['duplicate_check_columns'] = []

    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Run the validator over every loaded dataset, keyed by dataset name
print("Validating datasets...")
validation_results = {
    dataset_name: validate_dataset(frame)
    for dataset_name, frame in datasets.items()
}

# Pretty-print the nested report so each dataset's checks stay readable
import pprint
pprint.pprint(validation_results)


In [None]:
# FIXED VERSION - Copy this into your notebook cell
def validate_dataset_fixed(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarise basic quality checks, counting duplicates across all columns.

    Unhashable cell values (lists, dicts, sets, ...) are replaced by their
    string form on a working copy before `duplicated()` runs. Every cell of
    each object column is checked rather than just the first value, so mixed
    columns and dict-valued columns no longer make the duplicate check fail.

    Args:
        df: The dataset to inspect. It is not modified.

    Returns:
        A dict with these keys:
          - 'missing_values': null count per column.
          - 'duplicates': number of duplicate rows as a plain int, or an
            "Error: ..." string if the check raised.
          - 'data_types': dtype name per column.
          - 'shape': the (rows, columns) tuple.
    """
    def _stringify_if_unhashable(value):
        # Hashable values pass through untouched; containers become text.
        try:
            hash(value)
        except TypeError:
            return str(value)
        return value

    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()

    # Handle duplicates safely by converting unhashable cells to strings
    try:
        # Work on a copy so the caller's frame keeps its original objects
        df_check = df.copy()

        for col in df_check.columns:
            # Only object columns can hold arbitrary Python containers.
            if df_check[col].dtype == 'object':
                df_check[col] = df_check[col].map(_stringify_if_unhashable)

        # int(...) turns the numpy scalar into a plain, JSON-friendly int.
        result['duplicates'] = int(df_check.duplicated().sum())
    except Exception as e:
        result['duplicates'] = f"Error: {str(e)}"

    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Re-run validation with the list-tolerant validator, keyed by dataset name
print("Validating datasets with fixed function...")
validation_results = {
    dataset_name: validate_dataset_fixed(frame)
    for dataset_name, frame in datasets.items()
}

# Pretty-print the nested report so each dataset's checks stay readable
import pprint
pprint.pprint(validation_results)


In [None]:
# FIXED VERSION - Safe data cleaning for datasets with list columns
# Work on a copy so the frame stored in `datasets` is left untouched by the
# cleaning steps below.
# NOTE(review): assumes the loader exposes a 'custom' entry - confirm against data_loader.
df = datasets['custom'].copy()

# Clean text fields first
def clean_text(text):
    """Return `text` as a string with whitespace runs collapsed and ends trimmed.

    Missing values (None / NaN, as judged by `pd.isna`) become an empty string;
    any other value is passed through `str()` first.
    """
    if pd.isna(text):
        return ''
    # Any run of whitespace (spaces, tabs, newlines) becomes a single space.
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Normalise every problem statement element-wise (missing values become '')
df['problem_text'] = df['problem_text'].map(clean_text)

# Safe duplicate removal - only check specific columns that don't contain lists
def safe_drop_duplicates(df):
    """Drop duplicate rows, comparing only columns whose values are all hashable.

    `drop_duplicates` raises on unhashable cells (lists, dicts, sets, ...), so
    such columns are left out of the comparison. Every value of each object
    column is checked - not just the first - so a column that mixes scalars
    with containers is excluded as well, instead of triggering the error path
    and returning the frame with its duplicates still in place.

    Args:
        df: Frame to de-duplicate. It is not modified.

    Returns:
        A frame with duplicate rows removed (first occurrence kept). If no
        column is safe to compare, a copy of `df`. If the removal still fails,
        the error is printed and `df` itself is returned unchanged.
    """
    def _is_hashable(value):
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def _column_is_safe(series):
        # Only object columns can hold arbitrary Python containers.
        if series.dtype != object:
            return True
        return bool(series.map(_is_hashable).all())

    try:
        # Identify columns that are safe to check for duplicates
        safe_columns = [col for col in df.columns if _column_is_safe(df[col])]

        print(f"Checking duplicates on columns: {safe_columns}")

        if safe_columns:
            # Only check duplicates on safe columns
            df_clean = df.drop_duplicates(subset=safe_columns)
        else:
            print("No safe columns for duplicate checking - skipping duplicate removal")
            df_clean = df.copy()

        return df_clean
    except Exception as e:
        print(f"Error in duplicate removal: {e}")
        return df

# De-duplicate on the hashable columns, then discard rows where every field is missing
df = safe_drop_duplicates(df).dropna(how='all')

print(f"Dataset shape after cleaning: {df.shape}")
print(df.head())


In [None]:
# Install required packages using magic commands
import sys
!{sys.executable} -m pip install pandas>=1.5.0 numpy>=1.21.0 matplotlib>=3.5.0 seaborn>=0.11.0 scikit-learn>=1.1.0 --quiet

# Alternative magic command method
# %pip install pandas numpy matplotlib seaborn scikit-learn --quiet

print("✅ Packages installed successfully!")


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import custom data loader
from data_loader import MathDatasetLoader

# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6, while the
# install cell allows 3.5, where the same style sheet is called 'seaborn'.
# Pick whichever name this matplotlib actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")
print("Libraries imported successfully!")


In [None]:
# Build the project loader and pull every dataset it exposes.
# NOTE(review): presumably a mapping of dataset name -> pandas DataFrame; confirm in data_loader.
loader = MathDatasetLoader()
datasets = loader.get_all_datasets()

# Per dataset: row and column counts, the column names, then a blank separator line
for name, df in datasets.items():
    print(
        f'{name}: {df.shape[0]} samples, {df.shape[1]} features',
        f'Columns: {list(df.columns)}',
        '',
        sep='\n',
    )


In [None]:
def validate_dataset_fixed(df):
    """Summarise basic quality checks, counting duplicates across all columns.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; running this cell replaces that definition.

    Unhashable cell values (lists, dicts, sets, ...) are replaced by their
    string form on a working copy before `duplicated()` runs. Every cell of
    each object column is checked rather than just the first value, so mixed
    columns and dict-valued columns no longer make the duplicate check fail.

    Args:
        df: The dataset (pandas DataFrame) to inspect. It is not modified.

    Returns:
        A dict with these keys:
          - 'missing_values': null count per column.
          - 'duplicates': number of duplicate rows as a plain int, or an
            "Error: ..." string if the check raised.
          - 'data_types': dtype name per column.
          - 'shape': the (rows, columns) tuple.
    """
    def _stringify_if_unhashable(value):
        # Hashable values pass through untouched; containers become text.
        try:
            hash(value)
        except TypeError:
            return str(value)
        return value

    result = {}
    result['missing_values'] = df.isnull().sum().to_dict()

    # Handle duplicates safely by converting unhashable cells to strings
    try:
        # Work on a copy so the caller's frame keeps its original objects
        df_check = df.copy()
        for col in df_check.columns:
            # Only object columns can hold arbitrary Python containers.
            if df_check[col].dtype == 'object':
                df_check[col] = df_check[col].map(_stringify_if_unhashable)
        # int(...) turns the numpy scalar into a plain, JSON-friendly int.
        result['duplicates'] = int(df_check.duplicated().sum())
    except Exception as e:
        result['duplicates'] = f"Error: {str(e)}"

    result['data_types'] = df.dtypes.apply(str).to_dict()
    result['shape'] = df.shape
    return result

# Validate every loaded dataset with the list-tolerant validator, keyed by name
validation_results = {
    dataset_name: validate_dataset_fixed(frame)
    for dataset_name, frame in datasets.items()
}
# Pretty-print the nested report so each dataset's checks stay readable
import pprint
pprint.pprint(validation_results)

In [None]:
# FIXED VERSION - Safe data cleaning for datasets with list columns
# Work on a copy so the frame stored in `datasets` is left untouched by the
# cleaning steps below.
# NOTE(review): assumes the loader exposes a 'custom' entry - confirm against data_loader.
df = datasets['custom'].copy()

# Clean text fields first
def clean_text(text):
    """Return `text` as a string with whitespace runs collapsed and ends trimmed.

    Missing values (None / NaN, as judged by `pd.isna`) become an empty string;
    any other value is passed through `str()` first.
    """
    if pd.isna(text):
        return ''
    # Any run of whitespace (spaces, tabs, newlines) becomes a single space.
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Normalise every problem statement element-wise (missing values become '')
df['problem_text'] = df['problem_text'].map(clean_text)

# Safe duplicate removal - only check specific columns that don't contain lists
def safe_drop_duplicates(df):
    """Drop duplicate rows, comparing only columns whose values are all hashable.

    `drop_duplicates` raises on unhashable cells (lists, dicts, sets, ...), so
    such columns are left out of the comparison. Every value of each object
    column is checked - not just the first - so a column that mixes scalars
    with containers is excluded as well, instead of triggering the error path
    and returning the frame with its duplicates still in place.

    Args:
        df: Frame to de-duplicate. It is not modified.

    Returns:
        A frame with duplicate rows removed (first occurrence kept). If no
        column is safe to compare, a copy of `df`. If the removal still fails,
        the error is printed and `df` itself is returned unchanged.
    """
    def _is_hashable(value):
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def _column_is_safe(series):
        # Only object columns can hold arbitrary Python containers.
        if series.dtype != object:
            return True
        return bool(series.map(_is_hashable).all())

    try:
        # Identify columns that are safe to check for duplicates
        safe_columns = [col for col in df.columns if _column_is_safe(df[col])]

        print(f"Checking duplicates on columns: {safe_columns}")

        if safe_columns:
            # Only check duplicates on safe columns
            df_clean = df.drop_duplicates(subset=safe_columns)
        else:
            print("No safe columns for duplicate checking - skipping duplicate removal")
            df_clean = df.copy()

        return df_clean
    except Exception as e:
        print(f"Error in duplicate removal: {e}")
        return df

# De-duplicate on the hashable columns, then discard rows where every field is missing
df = safe_drop_duplicates(df).dropna(how='all')

print(f"Dataset shape after cleaning: {df.shape}")
print(df.head())

In [None]:
# Example: Extract text length and operator count
def extract_features(text):
    """Compute simple surface features for one problem statement.

    Args:
        text: The problem text; must support `len()` and `.count()` (a str).

    Returns:
        dict with 'text_length' (number of characters) and 'operator_count'
        (total occurrences of the characters + - * / = ^, so hyphens inside
        words are counted too).
    """
    operator_symbols = ('+', '-', '*', '/', '=', '^')
    operator_total = 0
    for symbol in operator_symbols:
        operator_total += text.count(symbol)
    return {
        'text_length': len(text),
        'operator_count': operator_total,
    }

# Build one feature row per problem. The list of dicts goes straight into the
# DataFrame constructor (far cheaper than `.apply(pd.Series)`, which builds a
# Series per row); the explicit index keeps rows aligned with `df` even after
# earlier cells dropped rows.
features_df = pd.DataFrame(
    df['problem_text'].map(extract_features).tolist(),
    index=df.index,
)

# Drop feature columns left over from a previous run of this cell first, so
# re-executing it replaces them instead of appending duplicate columns.
df = pd.concat(
    [df.drop(columns=features_df.columns, errors='ignore'), features_df],
    axis=1,
)

print(df.head())


In [None]:
# Save cleaned and feature-engineered data
# The file is written relative to the notebook's working directory, without the
# index column. Any list-valued cells are stored as their text form in CSV.
df.to_csv('preprocessed_custom_math_data.csv', index=False)
print("Saved preprocessed data to 'preprocessed_custom_math_data.csv'")
