# 🧬 Notebook 1: Data Loading and Preprocessing

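This notebook prepares a gene expression matrix for dimensionality reduction and clustering. The target dataset is GSE25066; for demonstration purposes, the notebook currently generates a simulated expression matrix as a stand-in, then runs basic cleaning checks and exploration on it.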

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide presentation settings: seaborn grid style for every plot,
# and allow up to 100 columns when a DataFrame is displayed.
sns.set(style='whitegrid')
pd.set_option('display.max_columns', 100)

# Ensure the output folder for processed data is present
# (exist_ok=True makes this a no-op when it already exists).
os.makedirs('../data/processed', exist_ok=True)


## 🔽 Load Dataset

In [None]:
# NOTE: the matrix built here is synthetic, not real GEO data.
# To work with the actual dataset, download it from GEO manually
# (or use GEOparse if available) and replace this cell.

# Fixed seed so the simulated values are identical on every run.
np.random.seed(42)

# Standard-normal draws laid out as 1000 genes (rows) x 50 samples (columns).
simulated_values = np.random.randn(1000, 50)
gene_labels = [f"Gene_{i}" for i in range(simulated_values.shape[0])]
sample_labels = [f"Sample_{j}" for j in range(simulated_values.shape[1])]

expression_data = pd.DataFrame(
    simulated_values,
    index=gene_labels,
    columns=sample_labels,
)

# Persist the matrix so later steps can read it back from disk.
expression_data.to_csv('../data/processed/gene_expression_matrix.csv')

# Show the first rows as the cell output.
expression_data.head()


## 🧼 Data Cleaning & Exploration

In [None]:
# Total number of missing entries in the matrix: the first .sum() counts
# nulls per sample column, the second adds those counts together.
n_missing = expression_data.isnull().sum().sum()
# The newline must be written as the escape "\n"; a literal line break inside
# a single-quoted string is a SyntaxError and stops the cell from running.
print("Missing values:\n", n_missing)

# Basic stats
print("Shape:", expression_data.shape)
print("Mean expression per gene (first 5):")
# axis=1 averages across the sample columns, giving one mean per gene row;
# leaving this as the last expression displays it as the cell output.
expression_data.mean(axis=1).head()


## 🔥 Heatmap of Sample Correlations

In [None]:
# Pairwise correlation between the columns of the expression matrix
# (DataFrame.corr works column-wise, and each column is one sample).
corr_matrix = expression_data.corr()

# Draw the heatmap on an explicitly created axes rather than relying on
# pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', ax=ax)
ax.set_title("Sample Correlation Heatmap")
plt.show()
