# Azure GenAI Ops - Data Exploration

This notebook explores the sample dataset (label distribution and text lengths), applies the project's text preprocessing, and splits the result for use in the GenAI pipeline.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Locate the project: the root is the parent of the current working directory
# (the folder this notebook is run from), and the data lives in its "data" folder.
project_root = Path(".").resolve().parents[0]
data_dir = project_root.joinpath("data")

# Report both locations so a wrong working directory is obvious immediately.
print(
    f"Project root: {project_root}",
    f"Data directory: {data_dir}",
    sep="\n",
)

## Load Sample Data

In [None]:
# Read the raw sample CSV (data/raw/sample_data.csv) into a DataFrame.
sample_data_path = data_dir.joinpath("raw", "sample_data.csv")
df = pd.read_csv(sample_data_path)

# One-shot overview: dimensions, column names, then a preview header.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview table.
df.head()

## Data Analysis

In [None]:
# Headline numbers: row count, total missing cells across all columns,
# followed by how many rows carry each label.
print(
    "Dataset Info:",
    f"Total samples: {len(df)}",
    f"Missing values: {df.isnull().sum().sum()}",
    "\nLabel distribution:",
    sep="\n",
)
print(df["label"].value_counts())

In [None]:
# Bar chart of how many rows carry each label.
label_fig, label_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="label", ax=label_ax)
label_ax.set(
    title="Distribution of Labels",
    xlabel="Label",
    ylabel="Count",
)
# Tilt the category names so longer labels do not overlap.
plt.xticks(rotation=45)
label_fig.tight_layout()
plt.show()

In [None]:
# Character count of every text entry, stored as a new column on df.
df["text_length"] = df["text"].str.len()

print("Text length statistics:", df["text_length"].describe(), sep="\n")

# Histogram of text lengths, coloured by label, to compare the classes.
length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="text_length", hue="label", bins=20, ax=length_ax)
length_ax.set(
    title="Text Length Distribution by Label",
    xlabel="Text Length (characters)",
    ylabel="Frequency",
)
length_fig.tight_layout()
plt.show()

## Data Preprocessing

In [None]:
# Make the project's code importable from this notebook.
import sys

# `from src.data.make_dataset import ...` treats `src` as a top-level package,
# so the directory that CONTAINS `src` (the project root) must be on sys.path.
# Adding only `project_root / "src"` does not make `src.*` importable.
# The `src` directory itself is still added for backward compatibility, in case
# project modules import their siblings relative to it (TODO confirm; drop if not).
# The membership check keeps sys.path free of duplicates when the cell is re-run.
for import_root in (project_root, project_root / "src"):
    if str(import_root) not in sys.path:
        sys.path.append(str(import_root))

from src.data.make_dataset import DataProcessor

# Initialize processor
processor = DataProcessor()

# Run the project's text preprocessing on the "text" column.
# NOTE(review): the before/after counts below suggest rows may be dropped
# (e.g. empty or duplicate texts) — confirm in src/data/make_dataset.py.
processed_df = processor.preprocess_text_data(df, "text")
print(f"Original samples: {len(df)}")
print(f"After preprocessing: {len(processed_df)}")

In [None]:
# Partition the preprocessed data using the project's splitter.
# The ratios are named once here so they are easy to spot and adjust.
split_ratios = {"train_ratio": 0.6, "val_ratio": 0.2}
data_splits = processor.split_data(processed_df, **split_ratios)

# For every returned split, show its size and per-label counts,
# followed by a blank line to separate the entries.
print("Data splits:")
for split_name, split_df in data_splits.items():
    print(
        f"{split_name}: {len(split_df)} samples",
        f"  Label distribution: {split_df['label'].value_counts().to_dict()}",
        sep="\n",
        end="\n\n",
    )

## Azure OpenAI Integration Demo

In [None]:
# Optional demo: classify one sample sentence with Azure OpenAI.
# Note: This requires valid Azure OpenAI credentials.
# The demo is intentionally left commented out so the notebook runs end-to-end
# without them. To try it, uncomment every line of the block below and run
# the cell once your configuration is in place.

# from src.models.azure_openai import GenAIModel, PromptTemplate

# # Initialize model (requires valid Azure credentials)
# try:
#     model = GenAIModel()
#
#     # Example classification prompt
#     sample_text = "This is an amazing product!"
#     prompt = f"""
#     Classify the following text as positive, negative, or neutral:
#
#     Text: {sample_text}
#
#     Classification:
#     """
#
#     result = model.predict(prompt)
#     print(f"Input: {sample_text}")
#     print(f"Classification: {result}")
#
# except Exception as e:
#     print(f"Azure OpenAI not configured: {e}")
#     print("Please set up your Azure OpenAI credentials to test this functionality.")

# With the demo disabled, the cell only prints a pointer to the commented code above.
print("Azure OpenAI integration available - see comments in cell above for demo.")

## Next Steps

1. **Data Collection**: Gather more training data for your specific use case
2. **Feature Engineering**: Create additional features if needed
3. **Model Training**: Use the training pipeline to train your GenAI model
4. **Evaluation**: Assess model performance using the evaluation framework
5. **Deployment**: Deploy your trained model using the CI/CD pipeline

See the project README and other notebooks for more detailed workflows.