# Lecture 3: Fundamentals of Python - SOLUTION

**Course:** Single-Cell Neurogenomics  
**Date:** December 12, 2025  

---

## Task 1: Data Types and Variables (15 points)

In [None]:
# Scalar variables, one for each basic built-in type
n_cells, expression_level = 5000, 3.14  # integer, float
gene_name, is_expressed = "TP53", True  # string, boolean

# Report every variable's type next to its value
print("Variable Types:")
for label, value in (
    ("n_cells", n_cells),
    ("expression_level", expression_level),
    ("gene_name", gene_name),
    ("is_expressed", is_expressed),
):
    print(f"{label}: {type(value)} = {value}")
print()

# Gene symbols stored in a list
genes = ['TP53', 'BRCA1', 'EGFR', 'MYC', 'KRAS']
print(f"Genes: {genes}")
print(f"Type: {type(genes)}")
print()

# Pair each gene symbol with its chromosomal location (same order as `genes`)
locations = ['17p13.1', '17q21.31', '7p11.2', '8q24.21', '12p12.1']
gene_chr = dict(zip(genes, locations))
print("Gene Chromosomal Locations:")
for gene in genes:
    loc = gene_chr[gene]
    print(f"  {gene}: {loc}")
print()

# Convert a numeric string to int and float, then report with f-strings
count_str = "1500"
count_int, count_float = int(count_str), float(count_str)
print(f"String '{count_str}' → int {count_int} → float {count_float}")
first_gene = genes[0]
print(f"The gene {first_gene} is located on chromosome {gene_chr[first_gene]}")

---

## Task 2: Lists and List Operations (20 points)

In [None]:
# Starting expression measurements
expression = [2.5, 3.1, 1.8, 4.2, 2.9, 3.7, 1.5]
print(f"Original expression values: {expression}")

# Append one more measurement to the end of the list
expression += [3.3]
print(f"After adding 3.3: {expression}")

# Drop the first occurrence of the smallest measurement
min_val = min(expression)
del expression[expression.index(min_val)]
print(f"After removing minimum ({min_val}): {expression}")
print()

# Summary statistics of the remaining values
n_values = len(expression)
mean_expr = sum(expression) / n_values
print("Statistics:")
print(f"  Min: {min(expression)}")
print(f"  Max: {max(expression)}")
print(f"  Mean: {mean_expr:.2f}")
print()

# Order the values ascending
expression = sorted(expression)
print(f"Sorted: {expression}")

# Slices: the three lowest and the three highest values of the sorted list
first_three, last_three = expression[:3], expression[-3:]
print(f"First 3: {first_three}")
print(f"Last 3: {last_three}")
print()

# Scale every value by the maximum with a list comprehension
max_expr = max(expression)
normalized = [value / max_expr for value in expression]
formatted = [f'{value:.3f}' for value in normalized]
print(f"Normalized values: {formatted}")

---

## Task 3: Dictionaries and Control Flow (25 points)

In [None]:
# Number of cells observed for each cell type
cell_counts = {
    'T_cells': 1500,
    'B_cells': 800,
    'NK_cells': 300,
    'Monocytes': 600,
}

# Register one more cell type after the dictionary was created
cell_counts.update({'Dendritic_cells': 150})

# Grand total, used as the denominator for the percentages below
total_cells = sum(cell_counts.values())
print(f"Total cells: {total_cells}")
print("\nCell Type Distribution:")
print("=" * 50)

# One report line per cell type: count, share of the total, abundance label
for cell_type in cell_counts:
    count = cell_counts[cell_type]
    percentage = (count / total_cells) * 100

    # More than 1000 -> abundant, 500 to 1000 -> moderate, below 500 -> rare
    category = (
        "abundant" if count > 1000
        else "moderate" if count >= 500
        else "rare"
    )

    print(f"{cell_type:20s}: {count:5d} cells ({percentage:5.1f}%) - {category}")

---

## Task 4: Writing Functions (25 points)

In [None]:
def calculate_qc_metrics(counts_list):
    """Calculate QC statistics for a collection of counts.

    Args:
        counts_list: Iterable of numeric counts. Lists, tuples and one-shot
            iterables such as generators are all accepted, because the
            values are copied into a list before any statistic is computed.

    Returns:
        A dict with the keys 'mean', 'median', 'min', 'max' and 'range',
        or None when the input holds no values.
    """
    # Materialize once: the emptiness check, len() and the several passes
    # over the data below do not work on an already-consumed iterator.
    values = list(counts_list)
    if not values:
        return None

    n = len(values)
    sorted_counts = sorted(values)

    # Median: the middle element for odd n, the mean of the two middle
    # elements for even n.
    mid = n // 2
    if n % 2 == 0:
        median = (sorted_counts[mid - 1] + sorted_counts[mid]) / 2
    else:
        median = sorted_counts[mid]

    # Compute the extremes once and reuse them for the range.
    lowest = min(values)
    highest = max(values)

    return {
        'mean': sum(values) / n,
        'median': median,
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
    }

def filter_genes(gene_dict, min_expression):
    """Return the genes whose expression reaches a minimum level.

    Args:
        gene_dict: Mapping of gene name to expression value.
        min_expression: Inclusive lower bound; a gene is kept when its
            expression is greater than or equal to this value.

    Returns:
        A new dict holding the qualifying entries in their original order.
    """
    kept = {}
    for name, level in gene_dict.items():
        if level >= min_expression:
            kept[name] = level
    return kept

# Exercise calculate_qc_metrics on a small sample of counts
test_counts = [10, 25, 15, 30, 20, 18, 22]
qc_results = calculate_qc_metrics(test_counts)
print("QC Metrics:")
for metric in qc_results:
    value = qc_results[metric]
    print(f"  {metric}: {value:.2f}")
print()

# Exercise filter_genes: keep only genes with expression of at least 20
gene_expression = dict(TP53=45, BRCA1=12, EGFR=38, MYC=8, KRAS=52)
threshold = 20
filtered = filter_genes(gene_expression, min_expression=threshold)
print(f"Original genes: {gene_expression}")
print(f"Filtered genes (>= 20): {filtered}")

---

## Task 5: Working with Nested Data Structures (15 points)

In [None]:
# One record per cell: its cell type plus a gene -> count mapping
single_cell_data = {
    'CELL_001': dict(
        cell_type='T cell',
        gene_counts={'CD3D': 45, 'CD8A': 38, 'IL7R': 12},
    ),
    'CELL_002': dict(
        cell_type='B cell',
        gene_counts={'CD79A': 52, 'CD19': 48, 'MS4A1': 41},
    ),
    'CELL_003': dict(
        cell_type='Monocyte',
        gene_counts={'CD14': 67, 'LYZ': 55, 'CST3': 43},
    ),
}

# Walk the outer dictionary and show each record
print("Single-Cell Data Structure:")
print("=" * 60)
for cell_id in single_cell_data:
    data = single_cell_data[cell_id]
    print(f"{cell_id}: {data['cell_type']}")
    print(f"  Genes: {data['gene_counts']}")
print()

# Insert a new gene count two levels deep
first_cell_counts = single_cell_data['CELL_001']['gene_counts']
first_cell_counts['CCR7'] = 35
print("Added CCR7 to CELL_001:")
print(f"  {first_cell_counts}")
print()

# Sum the gene counts of every cell, then report the totals
print("Total Expression per Cell:")
print("=" * 60)
cell_totals = {
    cid: sum(record['gene_counts'].values())
    for cid, record in single_cell_data.items()
}
for cell_id, total in cell_totals.items():
    print(f"{cell_id}: {total} (Type: {single_cell_data[cell_id]['cell_type']})")
print()

# Pick the cell with the largest total (the first one wins on a tie)
highest_cell, highest_total = max(cell_totals.items(), key=lambda item: item[1])
print(f"Cell with highest total expression: {highest_cell}")
print(f"Total expression: {highest_total}")
print(f"Cell type: {single_cell_data[highest_cell]['cell_type']}")

---

## Reflection Questions (Bonus: 10 points)

**1. Why are dictionaries particularly useful for biological data analysis?**

Dictionaries are ideal for biological data because:
- **Named access:** Access data by meaningful identifiers (gene names, cell IDs) rather than numeric indices
- **Flexible structure:** Can store heterogeneous data types (metadata, counts, annotations)
- **Fast lookups:** O(1) average time complexity for accessing elements
- **Natural representation:** Maps biological relationships (gene→expression, cell→type)
- **Easy to update:** Add/remove genes or cells dynamically

**2. How does list comprehension improve code readability and efficiency?**

List comprehension benefits:
- **Conciseness:** One-line expression vs multi-line loops
- **Readability:** Intent is clear and Pythonic
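- **Performance:** Usually faster than an equivalent `for` loop with `.append()`, because the interpreter uses a dedicated bytecode and skips the per-item method lookup and call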
- **Functional style:** Builds a new list instead of modifying the original in place, which keeps the input data intact and the code cleaner
- Example: `[x/max_val for x in data]` vs explicit loop with append

**3. What are the advantages of using functions in data analysis?**

Functions provide:
- **Reusability:** Write once, use multiple times
- **Organization:** Break complex analysis into logical steps
- **Testing:** Easier to test and debug isolated units
- **Documentation:** Docstrings explain purpose and usage
- **Maintenance:** Changes in one place propagate everywhere
- **Abstraction:** Hide implementation details, expose clean interfaces