# C. acnes Promoter Data Exploration

This notebook helps you explore your promoter sequences and understand their characteristics.

In [None]:
# Import required libraries
import sys
sys.path.append('../scripts')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from utils import load_fasta_sequences, extract_promoter_features, analyze_promoter_dataset

## 1. Load Your Data

Replace the file path below with the path to your FASTA file containing C. acnes promoter sequences.

In [None]:
# Load your promoter sequences
fasta_file = '../data/c_acnes_promoters.fasta'  # Update this path

# Later cells decide whether to run by checking for `sequences` / `df` in the
# namespace. Drop anything left over from a previous run of this cell so that
# a failed (or different) load cannot be mistaken for the current one.
for stale_name in ('sequences', 'df'):
    globals().pop(stale_name, None)

# Only the load itself is guarded; a missing file is reported instead of raised
# so the rest of the notebook can still be read top to bottom.
try:
    sequences = load_fasta_sequences(fasta_file)
except FileNotFoundError:
    print(f"File not found: {fasta_file}")
    print("Please add your FASTA file to the data directory")
else:
    print(f"Loaded {len(sequences)} sequences")

    # Show first few sequence IDs
    for i, seq_id in enumerate(list(sequences.keys())[:5], start=1):
        print(f"  {i}. {seq_id}")

## 2. Analyze Sequence Features

In [None]:
# Build the per-sequence feature table, but only when the load cell produced `sequences`
if 'sequences' in locals():
    df = analyze_promoter_dataset(fasta_file)

    # Headline statistics, each computed right before it is reported
    print("Dataset overview:")
    n_sequences = len(df)
    print(f"Number of sequences: {n_sequences}")

    mean_length = df['length'].mean()
    print(f"Average sequence length: {mean_length:.1f} bp")

    mean_gc = df['gc_content'].mean()
    print(f"Average GC content: {mean_gc:.3f}")

    # A score above zero is what gets counted as "has a box" here
    n_with_minus_10 = df['minus_10_score'].gt(0).sum()
    print(f"Sequences with -10 boxes: {n_with_minus_10}")

    n_with_minus_35 = df['minus_35_score'].gt(0).sum()
    print(f"Sequences with -35 boxes: {n_with_minus_35}")

    # Rich preview of the key columns for the first few sequences
    preview_columns = [
        'sequence_id',
        'length',
        'gc_content',
        'minus_10_score',
        'minus_35_score',
    ]
    display(df[preview_columns].head())

## 3. Visualize Feature Distributions

In [None]:
# Draw a 2x2 grid of histograms for the feature table; skipped when `df` was never built
if 'df' in locals():
    # Set up plotting
    plt.style.use('default')
    sns.set_palette("husl")

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # The three continuous features share one histogram recipe:
    # (axis, column, title, x-label, colour)
    panels = [
        (axes[0, 0], 'gc_content', 'GC Content Distribution', 'GC Content', 'skyblue'),
        (axes[0, 1], 'minus_10_score', '-10 Box Consensus Scores', 'Consensus Score', 'lightcoral'),
        (axes[1, 0], 'minus_35_score', '-35 Box Consensus Scores', 'Consensus Score', 'lightgreen'),
    ]
    for ax, column, title, xlabel, color in panels:
        ax.hist(df[column], bins=20, alpha=0.7, color=color)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    # Spacer length distribution. The panel is always titled and labelled so the
    # figure stays readable even when there is nothing to plot.
    spacer_ax = axes[1, 1]
    spacer_ax.set_title('Spacer Length Distribution')
    spacer_ax.set_xlabel('Spacer Length (bp)')
    spacer_ax.set_ylabel('Frequency')

    # Only positive spacer lengths are plotted
    valid_spacers = df.loc[df['spacer_length'] > 0, 'spacer_length']
    if len(valid_spacers) > 0:
        # Centre one bin on every whole-bp value. With integer edges the last bin
        # is closed on both sides, so the two largest lengths would share a bar.
        # The 10-24 bp window is kept as the minimum range and widened when the
        # data reach beyond it, because values outside the edges are not drawn.
        # NOTE(review): assumes spacer lengths are whole numbers of bp - confirm.
        lowest = min(10, int(valid_spacers.min()))
        highest = max(24, int(valid_spacers.max()))
        bin_edges = [edge - 0.5 for edge in range(lowest, highest + 2)]
        spacer_ax.hist(valid_spacers, bins=bin_edges, alpha=0.7, color='gold')
    else:
        spacer_ax.text(
            0.5, 0.5, 'No spacer lengths available',
            ha='center', va='center', transform=spacer_ax.transAxes,
        )

    plt.tight_layout()
    plt.show()

## 4. Examine Top Scoring Promoters

In [None]:
# List the strongest -10 and -35 boxes; skipped when `df` was never built
if 'df' in locals():
    # Rank both box types up front, five rows each
    top_by_minus_10 = df.nlargest(5, 'minus_10_score')
    top_by_minus_35 = df.nlargest(5, 'minus_35_score')

    print("Top 5 promoters by -10 box score:")
    for _, hit in top_by_minus_10.iterrows():
        seq_id = hit['sequence_id']
        box = hit['minus_10_sequence']
        score = hit['minus_10_score']
        print(f"  {seq_id}: {box} (score: {score:.3f})")

    print("\nTop 5 promoters by -35 box score:")
    for _, hit in top_by_minus_35.iterrows():
        box = hit['minus_35_sequence']
        # Rows with an empty/missing -35 sequence are left out of this listing
        if not box:
            continue
        seq_id = hit['sequence_id']
        score = hit['minus_35_score']
        print(f"  {seq_id}: {box} (score: {score:.3f})")

## 5. Analyze Your P3 Promoter

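Compare your weak P3 promoter from S. aureus with the C. acnes promoters. Replace the placeholder value of `p3_sequence` in the cell below with the actual sequence; until you do, the cell only prints a reminder.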

In [None]:
# Add your P3 promoter sequence here
p3_sequence = "YOUR_P3_PROMOTER_SEQUENCE_HERE"  # Replace with actual sequence

if p3_sequence == "YOUR_P3_PROMOTER_SEQUENCE_HERE":
    # Placeholder untouched: nothing to analyze yet
    print("Please add your P3 promoter sequence to analyze it.")
else:
    # Extract the same features for P3 that the dataset cells report
    p3_features = extract_promoter_features(p3_sequence)

    print("P3 Promoter Analysis:")
    print(f"  Sequence: {p3_sequence}")

    p3_length = p3_features['length']
    print(f"  Length: {p3_length} bp")

    p3_gc = p3_features['gc_content']
    print(f"  GC content: {p3_gc:.3f}")

    p3_minus_10_seq = p3_features['minus_10_sequence']
    p3_minus_10 = p3_features['minus_10_score']
    print(f"  -10 box: {p3_minus_10_seq} (score: {p3_minus_10:.3f})")

    p3_minus_35_seq = p3_features['minus_35_sequence']
    p3_minus_35 = p3_features['minus_35_score']
    print(f"  -35 box: {p3_minus_35_seq} (score: {p3_minus_35:.3f})")

    p3_spacer = p3_features['spacer_length']
    print(f"  Spacer length: {p3_spacer} bp")

    # Compare with C. acnes promoters (only when the feature table exists)
    if 'df' in locals():
        print("\nComparison with C. acnes promoters:")

        dataset_minus_10 = df['minus_10_score'].mean()
        print(f"  P3 -10 score vs average: {p3_minus_10:.3f} vs {dataset_minus_10:.3f}")

        dataset_minus_35 = df['minus_35_score'].mean()
        print(f"  P3 -35 score vs average: {p3_minus_35:.3f} vs {dataset_minus_35:.3f}")

        dataset_gc = df['gc_content'].mean()
        print(f"  P3 GC content vs average: {p3_gc:.3f} vs {dataset_gc:.3f}")

## Next Steps

1. **Data Collection**: Make sure you have your C. acnes promoter sequences in FASTA format
2. **P3 Analysis**: Add your P3 promoter sequence for comparison
3. **Model Training**: Use notebook `03_model_training.ipynb` to train prediction models
4. **Optimization**: Use `04_promoter_optimization.ipynb` to design improved -10/-35 regions