## GPU Environment Verification

First, let's verify our GPU setup and PyTorch installation.

In [None]:
import torch
import platform

# Report interpreter / framework versions so environment problems are visible
# at the very top of the notebook output.
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")

# Query CUDA availability once and reuse the answer for both the report and
# the smoke test below.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if cuda_available:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

    # Test GPU with a simple operation.
    # Allocate directly on the device instead of building the tensor on the
    # CPU and copying it over.
    x = torch.randn(1000, 1000, device="cuda")
    # Perform a matrix multiplication
    y = torch.matmul(x, x)
    # CUDA kernels are launched asynchronously: wait for the matmul to finish
    # so that a failing kernel raises here, before success is reported.
    torch.cuda.synchronize()
    print("\nGPU test successful: Matrix multiplication completed")

# Getting Started with Genetics Analysis in Python

This notebook demonstrates basic DNA sequence analysis using Python and Biopython. We'll cover essential tools and techniques for genetic analysis.

## 1. Setup Python Environment for Genetics

First, we'll verify that our Python environment is set up and that the required packages are installed.

In [None]:
# Check Python version
import sys
print(f"Python version: {sys.version}")

# List installed packages.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API, so this cell no longer depends on setuptools.
from importlib import metadata

installed_packages = []
for dist in metadata.distributions():
    name = dist.metadata.get("Name")
    # A damaged installation can lack a Name field; skip it instead of crashing.
    if name:
        installed_packages.append(f"{name.lower()} {dist.version}")
# De-duplicate (a distribution can be visible on several paths) and sort so the
# report is stable from run to run.
installed_packages = sorted(set(installed_packages))

print("\nInstalled packages:")
for pkg in installed_packages:
    # Only show the packages relevant to this notebook.
    if any(name in pkg.lower() for name in ['bio', 'numpy', 'pandas', 'scipy']):
        print(pkg)

## 2. Import Bioinformatics Libraries

Now we'll import the essential libraries for genetic analysis.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt

# Bio.SeqUtils.GC was deprecated in Biopython 1.80 and removed in 1.82 in
# favour of gc_fraction. Import the old name when it exists; otherwise define
# a percentage-returning shim so code written against `GC` keeps working.
try:
    from Bio.SeqUtils import GC
except ImportError:
    from Bio.SeqUtils import gc_fraction

    def GC(seq):
        """Return the GC content of ``seq`` as a percentage (0-100)."""
        # ambiguous="ignore" counts G/C/S against the full sequence length,
        # which is intended to match how the removed GC() behaved.
        return 100 * gc_fraction(seq, ambiguous="ignore")

print("Libraries imported successfully!")

## 3. Basic DNA Sequence Functions

Let's create some basic functions for DNA sequence analysis.

In [None]:
def validate_dna(seq):
    """Return True when every character of ``seq`` is one of A, T, C or G.

    The check is case-insensitive. An empty sequence contains no offending
    character and therefore also yields True.
    """
    allowed = frozenset('ATCG')
    for nucleotide in seq.upper():
        if nucleotide not in allowed:
            return False
    return True

def calculate_gc_content(seq):
    """Return the percentage of G and C characters in ``seq``.

    Counting is case-insensitive. An empty sequence yields 0 rather than
    triggering a division by zero.
    """
    bases = seq.upper()
    length = len(bases)
    # Guard clause: nothing to measure in an empty sequence.
    if length <= 0:
        return 0
    gc_total = bases.count('G') + bases.count('C')
    return (gc_total / length) * 100

def get_nucleotide_frequency(seq):
    """Count the occurrences of A, T, G and C in ``seq``.

    Counting is case-insensitive and any other character is ignored. The
    result is a dict keyed 'A', 'T', 'G', 'C' (in that order) holding the
    absolute count of each base.
    """
    upper_seq = seq.upper()
    counts = {}
    for base in ('A', 'T', 'G', 'C'):
        counts[base] = upper_seq.count(base)
    return counts

# Exercise the three helpers on a short example sequence
test_seq = "ATGCTAGCTAGCTGATCG"
# One print call, one report line per argument.
print(
    f"Test sequence: {test_seq}",
    f"Is valid DNA? {validate_dna(test_seq)}",
    f"GC content: {calculate_gc_content(test_seq):.2f}%",
    f"Nucleotide frequency: {get_nucleotide_frequency(test_seq)}",
    sep="\n",
)

## 4. Working with FASTA Files

Now let's see how to work with FASTA files using Biopython's SeqIO module.

In [None]:
# Create a sample FASTA file
from pathlib import Path

sample_sequences = [
    (">sequence1", "ATGCTAGCTAGCTGATCG"),
    (">sequence2", "GCTAGCTAGCTAGCTAGT")
]

# Single definition of the file location, shared by the write and the read.
fasta_path = Path('../data/sample.fasta')
# On a fresh checkout ../data may not exist yet, and open(..., 'w') would then
# raise FileNotFoundError; create the directory first.
fasta_path.parent.mkdir(parents=True, exist_ok=True)

# Write sequences to a FASTA file (header line followed by the sequence line)
with open(fasta_path, 'w') as f:
    for header, seq in sample_sequences:
        f.write(f"{header}\n{seq}\n")

# Read and analyze the FASTA file
sequences = []
for record in SeqIO.parse(str(fasta_path), 'fasta'):
    sequence_text = str(record.seq)
    sequences.append({
        'id': record.id,
        'sequence': sequence_text,
        'length': len(record.seq),
        'gc_content': calculate_gc_content(sequence_text)
    })

# Display results as a DataFrame (one row per FASTA record)
df = pd.DataFrame(sequences)
print("\nSequence Analysis:")
print(df)

## 5. Sequence Analysis Tools

Let's add two more tools: a pattern finder that reports the start position of every match, and a bar chart of nucleotide counts.

In [None]:
# Pattern matching in sequences
def find_pattern(seq, pattern):
    """Return the 0-based start index of every match of ``pattern`` in ``seq``.

    Matching is case-insensitive, and overlapping matches are all reported.
    The result is a list of indices in ascending order; it is empty when the
    pattern does not occur or is longer than the sequence.
    """
    haystack = seq.upper()
    needle = pattern.upper()
    width = len(needle)
    # Last index at which a window of `width` characters still fits.
    last_start = len(haystack) - width
    return [
        start
        for start in range(last_start + 1)
        if haystack[start:start + width] == needle
    ]

# Analyze nucleotide distribution
def plot_nucleotide_distribution(seq):
    """Show a bar chart with one bar per nucleotide count of ``seq``.

    The counts come from ``get_nucleotide_frequency``; the figure is rendered
    with ``plt.show()`` and nothing is returned.
    """
    counts = get_nucleotide_frequency(seq)
    # Explicit figure/axes objects instead of the implicit pyplot state machine.
    _, ax = plt.subplots(figsize=(8, 6))
    ax.bar(counts.keys(), counts.values())
    ax.set_title('Nucleotide Distribution')
    ax.set_xlabel('Nucleotide')
    ax.set_ylabel('Frequency')
    plt.show()

# Try both tools on the example sequence: search for a motif, then plot counts
test_seq, pattern = "ATGCTAGCTAGCTGATCG", "TAG"
print(
    f"Pattern '{pattern}' found at positions: {find_pattern(test_seq, pattern)}"
)
plot_nucleotide_distribution(test_seq)