# Test Notebook for PKL Extension

This notebook demonstrates data science workflows that the PKL Extension should detect.


In [1]:
# Cell 1: Data Loading
# Bring in the third-party libraries this notebook relies on. No data is read
# here; the cell only makes `pd`, `np` and `plt` available to the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Emit a confirmation line so the cell has visible output once the imports succeed.
print("Libraries imported successfully!")


Libraries imported successfully!


In [None]:
# Cell 2: Create sample data
# Seed NumPy's global generator so the synthetic rows are the same on every run.
np.random.seed(42)

n_rows = 100

# Each call below advances the shared random stream, so the draws are taken in
# a fixed order (x, y base, y noise, category, value); reordering them would
# change the generated numbers.
x_draws = np.random.randn(n_rows)
y_base = np.random.randn(n_rows)
y_noise = np.random.randn(n_rows)
category_draws = np.random.choice(['A', 'B', 'C'], n_rows)
value_draws = np.random.exponential(2, n_rows)

# Assemble the frame: two normal columns, a three-level label, an exponential
# column, and one calendar day per row starting on 2024-01-01.
data = pd.DataFrame({
    'x': x_draws,
    'y': y_base + 0.5 * y_noise,
    'category': category_draws,
    'value': value_draws,
    'timestamp': pd.date_range('2024-01-01', periods=n_rows, freq='D'),
})

# Report the frame's dimensions, column names and per-column dtypes.
print(f"Dataset created with shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print(f"Data types:\n{data.dtypes}")

# Last expression of the cell: the notebook renders the first five rows.
data.head()


In [None]:
# Cell 3: Data Exploration and Analysis
print("=== Data Exploration ===")

# Number of null entries in each column.
missing_per_column = data.isnull().sum()
print(f"Missing values:\n{missing_per_column}")

# Summary statistics as produced by DataFrame.describe().
summary_stats = data.describe()
print(f"\nBasic statistics:")
print(summary_stats)

# Correlation analysis: pairwise correlations between the three numeric columns.
numeric_columns = ['x', 'y', 'value']
correlation_matrix = data[numeric_columns].corr()
print(f"\nCorrelation matrix:")
print(correlation_matrix)


In [None]:
# Cell 4: Data Visualization
# Draw four panels on a single 12x8 figure, holding an explicit handle to the
# figure and to each panel instead of relying on pyplot's "current axes".
fig = plt.figure(figsize=(12, 8))

# Scatter plot: x against y, with marker colour driven by the 'value' column.
ax_scatter = fig.add_subplot(2, 2, 1)
scatter_points = ax_scatter.scatter(
    data['x'], data['y'], c=data['value'], cmap='viridis', alpha=0.7
)
ax_scatter.set_xlabel('X values')
ax_scatter.set_ylabel('Y values')
ax_scatter.set_title('Scatter Plot: X vs Y')
fig.colorbar(scatter_points, ax=ax_scatter, label='Value')

# Histogram of the 'value' column in 20 bins.
ax_hist = fig.add_subplot(2, 2, 2)
ax_hist.hist(data['value'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax_hist.set_xlabel('Value')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Values')

# Category distribution: share of rows per category as a pie chart.
ax_pie = fig.add_subplot(2, 2, 3)
category_counts = data['category'].value_counts()
ax_pie.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_pie.set_title('Category Distribution')

# Time series: 'value' plotted against 'timestamp'.
ax_series = fig.add_subplot(2, 2, 4)
ax_series.plot(data['timestamp'], data['value'], marker='o', markersize=3)
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Value')
ax_series.set_title('Time Series of Values')
# The panel just added is the current axes, so this rotates its x tick labels.
plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

print("Visualizations completed!")


In [None]:
# Cell 5: Machine Learning - Simple Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Features are the 'x' and 'y' columns; the target is 'value'.
feature_columns = ['x', 'y']
X = data[feature_columns].to_numpy()
y = data['value'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_:.4f}")


In [None]:
# Cell 6: Model Building
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

print("=== MACHINE LEARNING MODEL ===")

# Same inputs as the linear-regression cell: 'x' and 'y' predict 'value'.
feature_columns = ['x', 'y']
X = data[feature_columns].to_numpy()
y = data['value'].to_numpy()

# 80/20 train/test split with a fixed random_state for a repeatable partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a 100-tree random forest; fit() returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_test = rf_model.predict(X_test)

print(f"Model trained successfully!")
print(f"Test predictions shape: {y_pred_test.shape}")

print("\nModel training complete!")


## Summary

This notebook demonstrates a complete data science workflow:

1. **Data Loading**: Imported necessary libraries
2. **Data Creation**: Generated synthetic dataset with multiple features
3. **Data Exploration**: Analyzed data structure, missing values, and outliers
4. **Data Visualization**: Created comprehensive plots for analysis
5. **Statistical Analysis**: Performed correlation analysis and hypothesis testing
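6. **Machine Learning**: Built and trained a predictive model
7. **Feature Engineering**: Derived squared, interaction, log-transformed, time-based, and encoded categorical features
8. **Export**: Saved the dataset to CSV and an analysis summary to JSON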

The PKL Extension should detect this as an **EXPLORE** intent with **SUCCESS** outcome, as it follows a typical data science exploration pattern.


In [None]:
# Cell 8: Export Results and Save Data
# Writes two files into the working directory: the current contents of `data`
# as CSV, and a JSON summary of the dataset plus the regression metrics.
#
# NOTE(review): `mse` and `r2` must already exist in the kernel. In this
# notebook they are assigned by the linear-regression cell, and `r2` is
# reassigned by the later statistical-analysis cell, so the value reported
# here depends on which of those cells ran last.
import json

# Save processed data to CSV (index=False keeps the row index out of the file).
output_file = 'processed_data.csv'
data.to_csv(output_file, index=False)
print(f"Data saved to {output_file}")

# Create a summary report.
# Every value is converted to a JSON-native type up front (int counts, str
# dtype names, float metrics) so the report serialises as intended instead of
# depending on json's `default` fallback to stringify whatever it cannot encode.
summary_report = {
    'total_records': len(data),
    'columns': list(data.columns),
    'missing_values': {column: int(count) for column, count in data.isnull().sum().items()},
    'data_types': {column: str(dtype) for column, dtype in data.dtypes.items()},
    'correlation_matrix': data[['x', 'y', 'value']].corr().to_dict(),
    'model_performance': {
        'mse': float(mse),
        'r2_score': float(r2)
    }
}

# Save summary as JSON. `default=str` is kept only as a last-resort safety net
# so an unexpected value type is written as text rather than aborting the export.
with open('analysis_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, default=str)

print("Analysis summary saved to analysis_summary.json")
print(f"Final dataset shape: {data.shape}")
print("Notebook execution completed successfully!")


In [None]:
# Cell 7: Advanced Analysis - Statistical Tests
from scipy import stats

# Perform statistical tests
print("=== Statistical Analysis ===")

# Shapiro-Wilk test on the 'value' column (null hypothesis: normally distributed).
value_column = data['value']
shapiro_stat, shapiro_p = stats.shapiro(value_column)
print(f"Shapiro-Wilk normality test for 'value':")
print(f"  Statistic: {shapiro_stat:.4f}, p-value: {shapiro_p:.4f}")

# Pearson correlation between the 'x' and 'y' columns.
corr_coef, corr_p = stats.pearsonr(data['x'], data['y'])
print(f"\nPearson correlation between x and y:")
print(f"  Correlation coefficient: {corr_coef:.4f}, p-value: {corr_p:.4f}")

# One-way ANOVA on 'value' across the category groups. The grouped 'value'
# column is built once and reused for the per-category summary below.
value_by_category = data.groupby('category')['value']
category_groups = [values.values for _, values in value_by_category]
f_stat, f_p = stats.f_oneway(*category_groups)
print(f"\nANOVA test across categories:")
print(f"  F-statistic: {f_stat:.4f}, p-value: {f_p:.4f}")

# Count, mean, standard deviation, minimum and maximum of 'value' per category.
print(f"\nSummary statistics by category:")
category_stats = value_by_category.agg(['count', 'mean', 'std', 'min', 'max'])
print(category_stats)


In [None]:
# Cell 6: Data Processing and Feature Engineering
# Adds derived columns to `data` in place. Re-running the cell recomputes the
# same columns from 'x', 'y', 'value', 'timestamp' and 'category', none of
# which are modified here.

# Polynomial and interaction terms built from the two numeric inputs.
data['x_squared'] = data['x'] ** 2
data['y_squared'] = data['y'] ** 2
data['xy_interaction'] = data['x'] * data['y']

# log(1 + value): the +1 shift avoids log(0). np.log1p computes this directly
# and stays accurate when 'value' is close to zero, where forming `value + 1`
# first would lose precision.
data['value_log'] = np.log1p(data['value'])

# Calendar features taken from the timestamp through the .dt accessor.
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['month'] = data['timestamp'].dt.month
data['quarter'] = data['timestamp'].dt.quarter

# Integer code for each category label, as assigned by pd.Categorical.
data['category_encoded'] = pd.Categorical(data['category']).codes

print("Feature Engineering completed!")
print(f"New dataset shape: {data.shape}")
print(f"New columns: {list(data.columns)}")

# Show sample of processed data; the bare last expression is rendered by the notebook.
print("\nSample of processed data:")
data[['x', 'y', 'value', 'x_squared', 'xy_interaction', 'day_of_week', 'category_encoded']].head()


In [None]:
# Cell 5: Statistical Analysis
# Correlation matrix, two-sample t-tests between categories, and a linear
# regression of 'value' on 'x' and 'y'.
#
# NOTE(review): this cell reassigns `X`, `y`, `model`, `y_pred` and `r2`, which
# the earlier linear-regression cell also defines. Here the model is fitted and
# scored on the full dataset, so `r2` is an in-sample score rather than the
# held-out score computed earlier.
from scipy import stats

print("=== STATISTICAL ANALYSIS ===")

# Correlation analysis
correlation_matrix = data[['x', 'y', 'value']].corr()
print("Correlation Matrix:")
print(correlation_matrix)

# T-test between categories: pull out the 'value' observations for each label.
category_a = data[data['category'] == 'A']['value']
category_b = data[data['category'] == 'B']['value']
category_c = data[data['category'] == 'C']['value']

# Run each test once and unpack (statistic, p-value), instead of calling
# ttest_ind separately for every number that is printed.
t_stat_ab, p_value_ab = stats.ttest_ind(category_a, category_b)
t_stat_ac, p_value_ac = stats.ttest_ind(category_a, category_c)

print(f"\nT-test results:")
print(f"A vs B: t-statistic = {t_stat_ab:.4f}, p-value = {p_value_ab:.4f}")
print(f"A vs C: t-statistic = {t_stat_ac:.4f}, p-value = {p_value_ac:.4f}")

# Linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X = data[['x', 'y']].values
y = data['value'].values

# Fit and score on the same rows (no train/test split in this cell).
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)

print(f"\nLinear Regression Results:")
print(f"R-squared: {r2:.4f}")
print(f"Coefficients: x={model.coef_[0]:.4f}, y={model.coef_[1]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")


In [None]:
# Cell 4: Data Visualization
# NOTE(review): a visualization cell with the same title appears earlier in
# the notebook; this one differs in panels 3 and 4 (bar chart and pandas line plot).
# Each panel is drawn through its own Axes handle on one 12x8 figure.
fig = plt.figure(figsize=(12, 8))

# Subplot 1: scatter of x against y, coloured by 'value', with a colour bar.
ax_scatter = fig.add_subplot(2, 2, 1)
scatter_points = ax_scatter.scatter(
    data['x'], data['y'], c=data['value'], cmap='viridis', alpha=0.6
)
ax_scatter.set(xlabel='X values', ylabel='Y values', title='X vs Y colored by Value')
fig.colorbar(scatter_points, ax=ax_scatter)

# Subplot 2: distribution of 'value' in 20 bins.
ax_hist = fig.add_subplot(2, 2, 2)
ax_hist.hist(data['value'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax_hist.set(xlabel='Value', ylabel='Frequency', title='Distribution of Values')

# Subplot 3: number of rows per category as a bar chart (drawn by pandas onto this panel).
ax_bar = fig.add_subplot(2, 2, 3)
data['category'].value_counts().plot(kind='bar', color='lightcoral', ax=ax_bar)
ax_bar.set(xlabel='Category', ylabel='Count', title='Category Distribution')
# The panel just added is the current axes, so this rotates its x tick labels.
plt.xticks(rotation=45)

# Subplot 4: 'value' indexed by timestamp, drawn by pandas onto this panel.
ax_series = fig.add_subplot(2, 2, 4)
data.set_index('timestamp')['value'].plot(alpha=0.7, ax=ax_series)
# Labels are set after plotting so they are the ones left on the panel.
ax_series.set(xlabel='Date', ylabel='Value', title='Value Over Time')
plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

print("Visualization complete!")


In [None]:
# Cell 3: Data Exploration
print("=== DATA EXPLORATION ===")
print(f"Dataset shape: {data.shape}")

# Number of null entries in each column.
missing_per_column = data.isnull().sum()
print(f"Missing values:\n{missing_per_column}")

print(f"\nBasic statistics:")
print(data.describe())

# Check for outliers in 'value' using the 1.5 * IQR rule: a row is flagged
# when its value lies strictly below the lower fence or strictly above the
# upper fence.
value_column = data['value']
Q1 = value_column.quantile(0.25)
Q3 = value_column.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
is_outlier = (value_column < lower_fence) | (value_column > upper_fence)
outliers = data[is_outlier]
print(f"\nOutliers detected: {len(outliers)} rows")
