In [11]:
import pandas as pd  # For loading data
from sklearn.preprocessing import StandardScaler  # For standardizing the data
from sklearn.model_selection import train_test_split  # For splitting into train/test

In [12]:
import os

# Walk the Kaggle input tree and echo its layout: each directory path on its
# own line, followed by the files it contains (indented) so the exact
# location of the dataset files can be read off the output.
for folder, _, files_here in os.walk('/kaggle/input'):
    print(folder)
    for entry in files_here:
        print("   ", entry)

/kaggle/input
/kaggle/input/gene-expression-cancer-rna-seq
/kaggle/input/gene-expression-cancer-rna-seq/TCGA-PANCAN-HiSeq-801x20531
/kaggle/input/gene-expression-cancer-rna-seq/TCGA-PANCAN-HiSeq-801x20531/TCGA-PANCAN-HiSeq-801x20531
    data.csv
    labels.csv


In [13]:
# The os.walk listing shows both CSVs inside a directory that is repeated
# twice in the path, so the shared prefix is spelled out once and reused.
dataset_dir = "/kaggle/input/gene-expression-cancer-rna-seq/TCGA-PANCAN-HiSeq-801x20531/TCGA-PANCAN-HiSeq-801x20531"
data_path = dataset_dir + "/data.csv"
labels_path = dataset_dir + "/labels.csv"

# Expression matrix: one row per sample (still includes the CSV's index column).
X_full = pd.read_csv(data_path)

# Labels file: only the 'Class' column (cancer type) is kept as the target.
labels_frame = pd.read_csv(labels_path)
y = labels_frame['Class']

# Confirm the load worked and that features and labels have matching row counts.
print("Success! Files loaded correctly.")
print("X_full shape:", X_full.shape)
print("y shape:", y.shape)
print("First few y values:", y.head().tolist())

Success! Files loaded correctly.
X_full shape: (801, 20532)
y shape: (801,)
First few y values: ['PRAD', 'LUAD', 'PRAD', 'PRAD', 'BRCA']


In [14]:
# 'Unnamed: 0' holds a row identifier rather than a gene measurement, so it
# is excluded from the feature matrix. X_full itself is left untouched.
X = X_full.drop('Unnamed: 0', axis=1)

# Sanity check: one column fewer than X_full, and the first columns are genes.
print("Cleaned X shape:", X.shape)          # Should be (801, 20531)
print("First 3 column names:", list(X.columns[:3]))

Cleaned X shape: (801, 20531)
First 3 column names: ['gene_0', 'gene_1', 'gene_2']


In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize every gene column to zero mean and unit variance so that genes
# measured on very different ranges contribute on a comparable scale.
#
# NOTE(review): the scaler below is fit on *all* samples, so X_scaled carries
# statistics from every row. That is fine for whole-dataset exploration, but
# any model evaluated on a held-out split should be scaled with statistics
# computed from the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Quick look: shape is unchanged and values are now centred around zero.
print("X_scaled shape:", X_scaled.shape)
print("First sample, first 5 genes after scaling:", X_scaled[0, :5].round(4))

X_scaled shape: (801, 20531)
First sample, first 5 genes after scaling: [-0.1948 -0.828   0.1598 -1.9483  1.2216]


In [16]:
from sklearn.model_selection import train_test_split

# Split: 80% for training, 20% for testing
# random_state=42 → same split every time (good for reproducibility)
# stratify=y → keeps the same proportion of cancer types in train and test
#
# The split is taken on the *unscaled* matrix X. Splitting a matrix that was
# standardized with statistics from all samples would leak test-set means and
# variances into the training features and make test scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Fit the scaler on the training rows only, then apply those same training
# statistics to the test rows. X_train / X_test are NumPy arrays, as before.
train_scaler = StandardScaler()
X_train = train_scaler.fit_transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

# Every sample must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X), "train/test sizes do not add up"

# Show sizes
print("Training set:")
print("  X_train shape:", X_train.shape)
print("  y_train shape:", y_train.shape)
print("\nTest set:")
print("  X_test shape:", X_test.shape)
print("  y_test shape:", y_test.shape)

# Optional: check class distribution is similar
print("\nClass distribution in train:\n", y_train.value_counts(normalize=True).round(3))
print("Class distribution in test:\n", y_test.value_counts(normalize=True).round(3))

Training set:
  X_train shape: (640, 20531)
  y_train shape: (640,)

Test set:
  X_test shape: (161, 20531)
  y_test shape: (161,)

Class distribution in train:
 Class
BRCA    0.375
KIRC    0.181
LUAD    0.177
PRAD    0.170
COAD    0.097
Name: proportion, dtype: float64
Class distribution in test:
 Class
BRCA    0.373
KIRC    0.186
LUAD    0.174
PRAD    0.168
COAD    0.099
Name: proportion, dtype: float64


## Summary of Preprocessing

- Successfully loaded data from nested Kaggle input paths
- Features: removed 'Unnamed: 0' column → final X shape **(801 samples, 20531 genes)**
- Standardized all gene expression values using StandardScaler (mean=0, std=1 per gene)
- Split data into train/test sets:
  - **Train**: 640 samples (80%)
  - **Test**: 161 samples (20%)
- Used `stratify=y` → class distributions very similar in train and test:
  - BRCA ≈37.3–37.5%
  - KIRC ≈18.1–18.6%
  - LUAD ≈17.4–17.7%
  - PRAD ≈16.8–17.0%
  - COAD ≈9.7–9.9%

Data is now clean, scaled, and ready for dimensionality reduction (PCA) and model training.