# Cancer Gene Expression Classifier — EDA & Baseline Model

This notebook performs:
1) Basic EDA on RNA-seq expression matrix  
2) Preprocessing (log transform, scaling, feature selection)  
3) Baseline model training (Logistic Regression + Random Forest)  
4) Evaluation (accuracy, classification report, ROC-AUC, confusion matrix)  
5) Save artifacts (processed data, trained model, figures)

> **Data expectation:** A CSV in `data/raw/` with shape `(samples × features)` where one column is the cancer **label** (e.g., `label`), and all other columns are gene expression features. Update `RAW_DATA_PATH` and `LABEL_COL` below if your file name or label column name differs.


In [None]:
# Project layout and global settings
from pathlib import Path

PROJECT_DIR = Path("/mnt/data/Cancer_Gene_Expression_Classifier")

# Input: the raw expression CSV (put your Kaggle CSV at this location)
RAW_DATA_PATH = PROJECT_DIR.joinpath("data", "raw", "tcga_expression.csv")

# Outputs: processed splits, figures and fitted models
PROCESSED_DIR = PROJECT_DIR.joinpath("data", "processed")
RESULTS_FIG_DIR = PROJECT_DIR.joinpath("results", "figures")
MODEL_DIR = PROJECT_DIR.joinpath("results", "models")

LABEL_COL = "label"  # name of the label column; change it to match your CSV
TEST_SIZE = 0.2      # passed as test_size to the train/test split
RANDOM_STATE = 42    # seed shared by the split, PCA and the random forest

# Create every output directory up front (no error if it already exists)
for _out_dir in (PROCESSED_DIR, RESULTS_FIG_DIR, MODEL_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved paths as the cell output
RAW_DATA_PATH, PROCESSED_DIR, RESULTS_FIG_DIR, MODEL_DIR


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib


In [None]:
# Read the raw CSV into a DataFrame and report its dimensions
df = pd.read_csv(RAW_DATA_PATH)
n_rows, n_columns = df.shape
print((n_rows, n_columns))
df.head(n=5)  # preview the first rows as the cell output

## Quick EDA

In [None]:
# Number of rows per label value
class_counts = df[LABEL_COL].value_counts()
print(class_counts)

# Bar chart of the class distribution, saved to the figures directory
class_counts.plot.bar(title='Class counts')
plt.tight_layout()
plt.savefig(RESULTS_FIG_DIR.joinpath("class_counts.png"), dpi=200)
plt.show()

In [None]:
# Summary statistics of the numeric features; only the first 10 columns
# are described to keep the output short
num_df = (
    df.drop(columns=[LABEL_COL], errors='ignore')
      .select_dtypes(include=[np.number])
)
desc = num_df.iloc[:, :10].describe().transpose()
desc

In [None]:
# Count missing entries per column, largest counts first
missing = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
)
missing.head(20)  # the 20 columns with the most missing values

## Preprocessing
- Split train/test (stratified)
- Log1p transform expression (optional)
- Variance filter to drop constant (zero-variance) genes
- Standard scaling
- Save processed splits


In [None]:
# Split
# Use only numeric columns as features, as the EDA cell does: a non-numeric
# column (for example a sample identifier) cannot be log-transformed or scaled
# and would make the steps below raise.
X = df.drop(columns=[LABEL_COL]).select_dtypes(include=[np.number])
y = df[LABEL_COL]

# Stratify on the label so train and test keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Log1p transform numeric features
# NOTE(review): log1p yields NaN for values below -1; this assumes the
# expression values are non-negative (counts/TPM-like) — confirm for your data.
X_train = np.log1p(X_train)
X_test  = np.log1p(X_test)

# Variance threshold: threshold=0.0 removes only features that are constant in
# the training split; raise the threshold to also drop near-constant features
vt = VarianceThreshold(threshold=0.0)
X_train_vt = vt.fit_transform(X_train)
X_test_vt  = vt.transform(X_test)

# Standardize: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train_vt)
X_test_s  = scaler.transform(X_test_vt)

# Show the resulting matrix shapes as the cell output
X_train_s.shape, X_test_s.shape

In [None]:
# Write the processed splits to CSV (optional previews; no index column).
# Each entry is (pandas wrapper, values, output file name).
for _wrap, _values, _filename in (
    (pd.DataFrame, X_train_s, "X_train.csv"),
    (pd.DataFrame, X_test_s, "X_test.csv"),
    (pd.Series, y_train, "y_train.csv"),
    (pd.Series, y_test, "y_test.csv"),
):
    _wrap(_values).to_csv(PROCESSED_DIR / _filename, index=False)

# Persist the fitted transformers next to the processed data
joblib.dump(vt, PROCESSED_DIR.joinpath("variance_threshold.pkl"))
joblib.dump(scaler, PROCESSED_DIR.joinpath("scaler.pkl"))

## Dimensionality reduction (PCA) for visualization

In [None]:
# Project the standardized training matrix onto its first two principal components
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_s)
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]

# Scatter plot of the 2-D projection, saved to the figures directory
plt.figure()
plt.scatter(pc1, pc2)
plt.title("PCA (train)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.savefig(RESULTS_FIG_DIR.joinpath("pca_train.png"), dpi=200)
plt.show()

# Share of the total variance captured by the two components together
explained_2pc = pca.explained_variance_ratio_.sum()
print("Explained variance (2 PCs):", explained_2pc)

## Baseline models

In [None]:
# --- Baseline 1: Logistic Regression ---
# fit() returns the estimator itself, so construction and training are chained
logreg = LogisticRegression(max_iter=2000, n_jobs=None).fit(X_train_s, y_train)
y_pred_lr = logreg.predict(X_test_s)
acc_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression accuracy:", acc_lr)

# --- Baseline 2: Random Forest (400 trees, all CPU cores) ---
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train_s, y_train)
y_pred_rf = rf.predict(X_test_s)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest accuracy:", acc_rf)

In [None]:
# Per-class precision/recall/F1 on the test split, one report per model
for _heading, _predictions in (
    ("== Logistic Regression ==", y_pred_lr),
    ("\n== Random Forest ==", y_pred_rf),
):
    print(_heading)
    print(classification_report(y_test, _predictions))

In [None]:
# Confusion matrix (RF)
import itertools

# One fixed, sorted class order shared by the matrix rows/columns and the ticks
cm_labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred_rf, labels=cm_labels)

plt.figure()
im = plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix (Random Forest)")
plt.colorbar()
tick_marks = range(len(cm_labels))
plt.xticks(tick_marks, cm_labels, rotation=90)
plt.yticks(tick_marks, cm_labels)

# Annotate every cell with its count. Cells below the midpoint are labelled in
# the colormap's top color and cells above it in the bottom color, so the text
# contrasts with the cell background whatever colormap is active.
thresh = cm.max() / 2.
low_color, high_color = im.cmap(0.0), im.cmap(1.0)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             verticalalignment="center",
             color=high_color if cm[i, j] < thresh else low_color)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig(RESULTS_FIG_DIR / "confusion_matrix_rf.png", dpi=200)
plt.show()

In [None]:
# Multi-class ROC-AUC (One-vs-Rest)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Sorted class order; assumed to match the column order of rf.predict_proba
# (i.e. rf.classes_) — holds when every class is present in the training split
classes = sorted(y.unique())
y_test_bin = label_binarize(y_test, classes=classes)

# For RF, need predict_proba (one probability column per class)
y_proba_rf = rf.predict_proba(X_test_s)
try:
    if len(classes) == 2:
        # Two classes: label_binarize returns a single 0/1 column, so it is
        # scored against the probability of the second class only. Passing
        # both probability columns would fail with a shape error.
        auc_ovr = roc_auc_score(y_test_bin.ravel(), y_proba_rf[:, 1])
    else:
        # One indicator column per class, macro-averaged one-vs-rest AUC
        auc_ovr = roc_auc_score(y_test_bin, y_proba_rf, average="macro", multi_class="ovr")
    print("Random Forest ROC-AUC (macro OVR):", auc_ovr)
except ValueError as e:
    # roc_auc_score raises ValueError when the score is undefined, e.g. a class
    # has no samples in the test split; report it without stopping the notebook
    print("ROC-AUC could not be computed:", e)

## Save models

In [None]:
# Serialize both fitted baseline models into the models directory
for _estimator, _filename in (
    (logreg, "logreg.joblib"),
    (rf, "random_forest.joblib"),
):
    joblib.dump(_estimator, MODEL_DIR / _filename)
print("Saved models to:", MODEL_DIR)

## Next steps
- Hyperparameter tuning (GridSearchCV / Optuna)
- Add class-weighting if class imbalance is large
- Try gradient boosting models
- Feature importance analysis and pathway enrichment (e.g., using top genes by importance)
- Export a clean `train.py` CLI script for reproducible runs
