# Sales Data Analysis Test Notebook

This notebook demonstrates a simple data science workflow for testing the context retrieval persona.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Build a reproducible synthetic sales dataset.
# The global NumPy seed is fixed, so the ORDER of the random draws below
# determines the generated values -- do not reorder them.
np.random.seed(42)
n_samples = 1000

data = dict(
    # Continuous spend, uniform on [1000, 50000)
    advertising_spend=np.random.uniform(1000, 50000, n_samples),
    # Integer head-count, 5..49 (upper bound of randint is exclusive)
    sales_team_size=np.random.randint(5, 50, n_samples),
    # Continuous market size, uniform on [100000, 1000000)
    market_size=np.random.uniform(100000, 1000000, n_samples),
    # Categorical quarter label, sampled uniformly
    season=np.random.choice(['Q1', 'Q2', 'Q3', 'Q4'], n_samples),
)

# Revenue is a linear combination of the three numeric drivers plus zero-mean
# Gaussian noise (std 10000). 'season' does not enter the formula.
noise = np.random.normal(0, 10000, n_samples)
data['revenue'] = (
    2.5 * data['advertising_spend']
    + 1000 * data['sales_team_size']
    + 0.1 * data['market_size']
    + noise
)

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Prepare data for modeling.
# The target column name is defined once so that feature selection, target
# extraction, and the printed summary cannot drift apart.
target_column = 'revenue'

# One-hot encode the categorical 'season' column; each distinct season value
# becomes its own 'season_<value>' indicator column.
df_encoded = pd.get_dummies(df, columns=['season'], prefix='season')

# Features are every encoded column except the target.
feature_columns = [col for col in df_encoded.columns if col != target_column]
X = df_encoded[feature_columns]
y = df_encoded[target_column]

print(f"Features: {X.columns.tolist()}")
print(f"Target: {target_column}")
print(f"Feature matrix shape: {X.shape}")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Fit a linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Mean squared error and R² for each split.
train_mse, train_r2 = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
test_mse, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print("Model Performance:")
print(f"Training MSE: {train_mse:,.2f}")
print(f"Test MSE: {test_mse:,.2f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")