In [None]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split 

In [None]:
import os

# Show what is available under the Kaggle input mount: each directory path on
# its own line, followed by the files it contains, indented beneath it.
for folder, _, files in os.walk('/kaggle/input'):
    listing = [folder] + ["    " + name for name in files]
    print("\n".join(listing))

In [None]:
data_path = "/kaggle/input/gene-expression-cancer-rna-seq/TCGA-PANCAN-HiSeq-801x20531/TCGA-PANCAN-HiSeq-801x20531/data.csv"
labels_path = "/kaggle/input/gene-expression-cancer-rna-seq/TCGA-PANCAN-HiSeq-801x20531/TCGA-PANCAN-HiSeq-801x20531/labels.csv"

# Read the feature table as-is, then pull the target out of the label table:
# only its 'Class' column is used as y.
X_full = pd.read_csv(data_path)
labels_df = pd.read_csv(labels_path)
y = labels_df['Class']

# Quick sanity report on what was loaded.
print("Success! Files loaded correctly.")
for caption, value in (
    ("X_full shape:", X_full.shape),
    ("y shape:", y.shape),
    ("First few y values:", y.head().tolist()),
):
    print(caption, value)

In [None]:
# 'Unnamed: 0' is just an index column, not a real gene, so it is left out of
# the feature matrix. X_full itself is not modified.
X = X_full.drop('Unnamed: 0', axis=1)

print("Cleaned X shape:", X.shape)
leading_columns = list(X.columns[:3])
print("First 3 column names:", leading_columns)

In [None]:
from sklearn.preprocessing import StandardScaler

# Genes have very different ranges, so every gene (column) is rescaled to
# mean 0 and standard deviation 1.
# Note: the scaler here is fitted on every row of X, so its statistics
# describe the complete dataset.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print("X_scaled shape:", X_scaled.shape)
first_sample_preview = X_scaled[0, :5].round(4)
print("First sample, first 5 genes after scaling:", first_sample_preview)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the RAW feature matrix X, not X_scaled. X_scaled was standardized with
# statistics computed over all samples, so splitting it would leak test-set
# means/variances into the training features (data leakage).
#
# split: 80% for training, 20% for testing
# random_state=42 gives the same split every time
# stratify=y keeps the same proportion of cancer types in train and test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Learn the per-gene mean/std from the training rows only, then apply that
# same transformation to both partitions. `scaler` is rebound to this
# train-only fit so that any later use of it is consistent with X_train/X_test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Sanity checks: no sample lost or duplicated, features and labels aligned.
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

print("Training set:")
print("  X_train shape:", X_train.shape)
print("  y_train shape:", y_train.shape)
print("\nTest set:")
print("  X_test shape:", X_test.shape)
print("  y_test shape:", y_test.shape)

print("\nClass distribution in train:\n", y_train.value_counts(normalize=True).round(3))
print("Class distribution in test:\n", y_test.value_counts(normalize=True).round(3))

## Summary of Preprocessing

- Loaded `data.csv` (features) and `labels.csv` (target taken from its `Class` column) from the nested Kaggle input directory
- Features: removed 'Unnamed: 0' column → final X shape **(801 samples, 20531 genes)**
- Standardized all gene expression values using StandardScaler (mean=0, std=1 per gene)
- Split data into train/test sets:
  - **Train**: 640 samples (80%)
  - **Test**: 161 samples (20%)
- Used `stratify=y` → class distributions very similar in train and test:
  - BRCA ≈37.4–37.5%
  - KIRC ≈18.1–18.6%
  - LUAD ≈17.4–17.7%
  - PRAD ≈16.8–17.0%
  - COAD ≈9.7–9.9%

Data is now clean, scaled, and ready for dimensionality reduction (PCA) and model training.