# Data Analysis Session
This notebook demonstrates real-time monitoring of the Cursor PKL Extension.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Announce the start of the session in the cell output.
print("Starting data analysis session...")


## User: I need to analyze this dataset and create visualizations

Can you help me explore the data structure and identify patterns?


In [None]:
# Load the dataset
#
# The "dataset" is synthetic: three independent standard-normal columns.
# Drawing from a seeded Generator makes every run produce the same numbers,
# so the printed statistics (and anything derived from `data`) are
# reproducible after a kernel restart.
SEED = 42          # change to get a different, but still repeatable, sample
N_SAMPLES = 100    # number of rows in the synthetic dataset
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    'x': rng.standard_normal(N_SAMPLES),
    'y': rng.standard_normal(N_SAMPLES),
    'value': rng.standard_normal(N_SAMPLES),
})

# Quick structural overview: size, dtypes and summary statistics.
print("Dataset shape:", data.shape)
print("Data types:")
print(data.dtypes)
print("\nBasic statistics:")
print(data.describe())


## Assistant: I'll help you analyze the dataset!

Let me start by examining the data structure and creating some exploratory visualizations. First, let's look at the basic statistics and data types.


In [None]:
# Create comprehensive visualization suite with grayscale palette
#
# Four panels on a 2x2 grid (one slot per plot, no empty cells), drawn
# through the explicit Figure/Axes interface so each call names the panel
# it targets.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Scatter plot: x against y, shaded by the 'value' column
ax = axes[0, 0]
points = ax.scatter(data['x'], data['y'], c=data['value'], cmap='gray', alpha=0.7)
ax.set_xlabel('X values')
ax.set_ylabel('Y values')
ax.set_title('Scatter Plot: X vs Y')
fig.colorbar(points, ax=ax, label='Value')

# Histogram of the x column
ax = axes[0, 1]
ax.hist(data['x'], bins=20, alpha=0.7, color='gray')
ax.set_xlabel('X values')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of X values')

# Time series (simulated): cumulative sum of standard-normal steps.
# A locally seeded generator keeps the curve identical between runs.
ax = axes[1, 0]
walk_rng = np.random.default_rng(0)
time_series = np.cumsum(walk_rng.standard_normal(100))
ax.plot(time_series, color='black', linewidth=1.5)
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('Time Series')

# Correlation heatmap
ax = axes[1, 1]
correlation_matrix = data.corr()
# Pin the colour scale to the full correlation range so shades are
# comparable between runs, and label the rows/columns so the matrix can be
# read without referring back to the code.
image = ax.imshow(correlation_matrix, cmap='gray', aspect='auto', vmin=-1, vmax=1)
tick_positions = range(len(correlation_matrix.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(correlation_matrix.columns)
ax.set_yticks(tick_positions)
ax.set_yticklabels(correlation_matrix.index)
fig.colorbar(image, ax=ax, label='Correlation')
ax.set_title('Correlation Matrix')

fig.tight_layout()
plt.show()


## User: Can you add some advanced visualizations for this data?

I'd like to see some more sophisticated plots that show the relationships between variables and help identify patterns.


In [None]:
# Advanced visualizations for deeper data insights
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Global plotting look: matplotlib style sheet first, then the seaborn
# colour cycle. The order matters because both calls write the default
# colour cycle, and the later one wins.
MPL_STYLE_SHEET = 'seaborn-v0_8'
GRAYSCALE_PALETTE = "gray"

plt.style.use(MPL_STYLE_SHEET)
sns.set_palette(GRAYSCALE_PALETTE)  # grayscale colour cycle for later plots

print("Creating advanced visualizations with grayscale palette...")


In [None]:
# 1. Pairwise relationships with regression lines (grayscale)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
(ax_scatter, ax_hist_x), (ax_hist_y, ax_box) = axes

# Top-left: x/y scatter shaded by 'value', with a fitted regression line
# overlaid on the same axes (the regplot draws the line only).
sns.scatterplot(x='x', y='y', hue='value', palette='gray', data=data, ax=ax_scatter)
sns.regplot(x='x', y='y', data=data, scatter=False, color='black', ax=ax_scatter)
ax_scatter.set_title('X vs Y with Regression Line')

# Top-right and bottom-left: histogram + KDE for each coordinate column.
for column, target_ax, heading in (
    ('x', ax_hist_x, 'Distribution of X values'),
    ('y', ax_hist_y, 'Distribution of Y values'),
):
    sns.histplot(data=data, x=column, kde=True, ax=target_ax, color='gray', alpha=0.7)
    target_ax.set_title(heading)

# Bottom-right: box plot of the 'value' column.
sns.boxplot(data=data, y='value', ax=ax_box, color='gray')
ax_box.set_title('Value Distribution')

fig.tight_layout()
plt.show()


In [None]:
# 2. Interactive 3D scatter plot with Plotly (grayscale)
# Display names handed to Plotly Express for the three plotted columns.
axis_labels = {'x': 'X Values', 'y': 'Y Values', 'value': 'Z Values'}

fig_3d = px.scatter_3d(
    data,
    x='x',
    y='y',
    z='value',
    color='value',
    color_continuous_scale='gray',
    labels=axis_labels,
    title='3D Interactive Scatter Plot (Grayscale)',
)

# Layout only: explicit scene axis titles and a fixed canvas size.
# These titles are set after the figure is built, so they are the ones the
# 3D axes end up showing.
scene_titles = dict(
    xaxis_title='X Values',
    yaxis_title='Y Values',
    zaxis_title='Value',
)
fig_3d.update_layout(scene=scene_titles, width=800, height=600)

fig_3d.show()


In [None]:
# 3. Statistical analysis and correlation heatmap (grayscale)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Correlation heatmap (top-left), annotated with the coefficient values
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='gray', center=0, ax=axes[0,0])
axes[0,0].set_title('Correlation Heatmap')

# Q-Q plots for normality testing (top-right: x, bottom-left: y).
# probplot draws two line artists on the axes: the sample points first,
# then the fitted reference line. Recolour both to stay in grayscale.
for column, qq_ax in (('x', axes[0,1]), ('y', axes[1,0])):
    stats.probplot(data[column], dist="norm", plot=qq_ax)
    sample_points, reference_line = qq_ax.get_lines()[:2]
    sample_points.set_color('black')
    reference_line.set_color('gray')
    qq_ax.set_title(f'Q-Q Plot: {column.upper()} values')

# Violin plot for distribution comparison (bottom-right).
# melt() must be given an explicit value_name: its default is 'value',
# which clashes with the existing 'value' column and is rejected by
# pandas >= 2.0 with a ValueError.
data_melted = data.melt(
    value_vars=['x', 'y', 'value'],
    var_name='variable',
    value_name='measurement',
)
sns.violinplot(data=data_melted, x='variable', y='measurement', ax=axes[1,1], color='gray')
axes[1,1].set_ylabel('value')  # keep the plain axis label on the figure
axes[1,1].set_title('Distribution Comparison')

fig.tight_layout()
plt.show()


In [None]:
# 4. Advanced statistical summary
#
# Text-only report in four parts: dataset size, descriptive statistics,
# Shapiro-Wilk normality tests, pairwise correlations and IQR outlier counts.
from scipy.stats import shapiro, normaltest

NUMERIC_COLUMNS = ['x', 'y', 'value']
n_rows = len(data)
memory_kb = data.memory_usage(deep=True).sum() / 1024

print("=== ADVANCED STATISTICAL ANALYSIS ===")
print(f"Dataset shape: {data.shape}")
print(f"Memory usage: {memory_kb:.2f} KB")
print()

# Descriptive statistics
print("Descriptive Statistics:")
print(data.describe())
print()

# Normality tests: a p-value above 0.05 is reported as "appears normal".
print("Normality Tests:")
for col in NUMERIC_COLUMNS:
    stat, p_value = shapiro(data[col])
    print(f"{col}: Shapiro-Wilk test - statistic={stat:.4f}, p-value={p_value:.4f}")
    verdict = (
        f"  → {col} appears to be normally distributed"
        if p_value > 0.05
        else f"  → {col} does not appear to be normally distributed"
    )
    print(verdict)
print()

# Correlation analysis: each unordered pair of columns is printed once
# (upper triangle of the correlation matrix, diagonal excluded).
print("Correlation Analysis:")
corr_matrix = data.corr()
column_names = list(corr_matrix.columns)
for position, first in enumerate(column_names):
    for second in column_names[position + 1:]:
        print(f"{first} vs {second}: {corr_matrix.loc[first, second]:.4f}")
print()

# Outlier detection using IQR method: a point is an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
print("Outlier Detection (IQR method):")
for col in NUMERIC_COLUMNS:
    series = data[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    is_outlier = (series < q1 - fence) | (series > q3 + fence)
    outliers = data[is_outlier]
    count = len(outliers)
    print(f"{col}: {count} outliers out of {n_rows} points ({count/n_rows*100:.1f}%)")


## Assistant: I've added comprehensive visualizations for your data!

Here's what I've created:

1. **Pairwise Relationships**: Scatter plots with regression lines and distribution plots
2. **Interactive 3D Visualization**: 3D scatter plot using Plotly for exploring data in three dimensions  
3. **Statistical Analysis**: Correlation heatmaps, Q-Q plots for normality testing, and violin plots
4. **Advanced Statistics**: Comprehensive statistical summary including normality tests and outlier detection

These visualizations will help you:
- Identify relationships between variables
- Check data distribution and normality
- Detect outliers and anomalies
- Understand correlation patterns
- Explore data interactively in 3D space


/

e I