# 🎯 SOLUSI PRAKTIKUM 2: NUMPY

### DATA PREPROCESSING DENGAN NUMPY ###
<a href="https://colab.research.google.com/github/pakizhan-ump/ml-umpontianak/blob/main/Modules/Week-02/Praktikum-02/Solusi/solusi_praktikum_2_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
'''SOLUSI: Implementasi Preprocessing Pipeline'''
# Simulated dataset: 100 samples, 5 features
# Seed NumPy's global RNG so every run of this cell draws the same values.
np.random.seed(42)
# Standard-normal draws scaled by 10 and shifted by 5 (target mean=5, std=10).
X = np.random.randn(100, 5) * 10 + 5  # Mean=5, Std=10

# SOLUSI 1: Normalisasi Z-score: (x - mean) / std
def z_score_normalization(data):
    """Standardize each feature of ``data`` to zero mean and unit variance.

    Applies ``(x - mean) / std`` where the mean and the population standard
    deviation (NumPy's default ``ddof=0``) are computed along ``axis=0``,
    i.e. independently for every column of a 2-D array.

    Args:
        data: Array-like of numeric values; rows are samples, columns are
            features.

    Returns:
        ndarray with the same shape as ``data``. A constant feature
        (standard deviation of exactly zero) is returned as all zeros
        instead of NaN.
    """
    data = np.asarray(data)
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # A constant column has std == 0; dividing by it would produce NaN and a
    # RuntimeWarning. Its centered values are already 0, so dividing by 1
    # leaves the column at 0 without affecting any other column.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

# Standardize the simulated feature matrix column by column.
X_normalized = z_score_normalization(X)

# SOLUSI 2: Handle outliers - replace values beyond 3 std with boundaries
def handle_outliers(data, std_threshold=3):
    """Clamp values lying too far from their feature's mean.

    For every column (``axis=0``) the allowed interval is
    ``mean ± std_threshold * std``; anything outside it is replaced by the
    nearest interval boundary, values inside are left untouched.

    Args:
        data: Numeric array; statistics are computed along axis 0.
        std_threshold: Half-width of the allowed interval, expressed in
            standard deviations. Defaults to 3.

    Returns:
        Array of the same shape as ``data`` with out-of-range values clipped.
    """
    center = np.mean(data, axis=0)
    # Maximum permitted distance from the mean for each feature.
    spread = std_threshold * np.std(data, axis=0)
    return np.clip(data, center - spread, center + spread)

# Clip the standardized features using the default threshold of 3 standard deviations.
X_cleaned = handle_outliers(X_normalized)

# SOLUSI 3: One-hot encoding untuk label kategorikal
def one_hot_encoding(labels):
    """Convert a 1-D sequence of categorical labels into a one-hot matrix.

    Columns correspond to the sorted unique label values, so the labels do
    not have to be the contiguous integers ``0..k-1``: gaps (``[1, 3, 5]``)
    and non-integer labels (e.g. strings) are supported. For labels that
    already are ``0..k-1`` the column index equals the label value.

    Args:
        labels: 1-D array-like of class labels.

    Returns:
        Float ndarray of shape ``(n_samples, n_classes)`` holding exactly one
        ``1`` per row, placed in the column of that sample's class.
    """
    labels = np.asarray(labels)
    # ``class_idx[i]`` is the position of ``labels[i]`` within the sorted
    # unique values, which is always a valid column index. Indexing by the
    # raw label value instead would overflow for non-contiguous labels.
    classes, class_idx = np.unique(labels, return_inverse=True)
    n_samples = labels.shape[0]

    one_hot = np.zeros((n_samples, classes.size))
    # Vectorised assignment: row i receives a 1 in column class_idx[i].
    one_hot[np.arange(n_samples), class_idx.reshape(-1)] = 1
    return one_hot

# Example labels: seven samples spread over three classes (0, 1, 2).
labels = np.array([0, 1, 2, 0, 1, 2, 0])
one_hot_labels = one_hot_encoding(labels)

# SOLUSI 4: Train-test split manual
def train_test_split_numpy(X, y, test_size=0.2, random_state=None):
    """Randomly split features and labels into train and test subsets.

    Sample indices are shuffled once and the same permutation is applied to
    both ``X`` and ``y``, so rows and labels stay aligned.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        y: Array-like of labels with the same number of entries as ``X``.
        test_size: Fraction of samples (between 0 and 1) placed in the test
            set. The test count is ``int(n_samples * test_size)``, i.e. it is
            rounded down.
        random_state: Optional seed for a private ``RandomState`` that makes
            the split reproducible without touching the global RNG. When
            ``None`` (default) the global ``np.random`` state is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``test_size`` is
            outside ``[0, 1]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]

    # A longer ``y`` would otherwise be accepted silently and leave the
    # surplus labels out of both subsets.
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {y.shape[0]}"
        )
    # A negative fraction would turn the slices below into a meaningless
    # split; a fraction above 1 would leave the training set empty.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    n_test = int(n_samples * test_size)

    # Shuffle indices
    if random_state is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.RandomState(random_state).permutation(n_samples)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Generate sample labels: 100 random integer classes drawn from {0, 1, 2}
# (the upper bound 3 is exclusive).
y = np.random.randint(0, 3, 100)
# NOTE(review): the split is applied to the raw X, not to X_cleaned —
# confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split_numpy(X, y, test_size=0.2)

# Summary of each preprocessing step's output.
print("Original data shape:", X.shape)
print("Normalized data mean:", np.mean(X_normalized, axis=0).round(3))
print("Normalized data std:", np.std(X_normalized, axis=0).round(3))
print("One-hot labels shape:", one_hot_labels.shape)
print("One-hot labels:\n", one_hot_labels)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# TEST ASSERTIONS
# Z-score normalization must keep the shape and give every column a mean of
# ~0 and a standard deviation of ~1 (within floating-point tolerance).
assert X_normalized.shape == X.shape, "Shape should remain same"
assert np.allclose(X_normalized.mean(axis=0), 0, atol=1e-10), "Mean should be ~0 after z-score"
assert np.allclose(X_normalized.std(axis=0), 1, atol=1e-10), "Std should be ~1 after z-score"

print("\n✅ Semua assertions berhasil! Solusi benar.")