# Persiapan

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the dataset into a DataFrame. The path is relative to the working
# directory (normally this notebook's folder): the CSV sits two directories
# up, inside the Data/ folder, not next to the notebook itself.
file_path = '../../Data/wbc.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
print("Data loaded successfully.")

Data loaded successfully.


# No. 1

In [2]:
# Step 1: Remove the 'id' and 'Unnamed: 32' columns (the latter is mostly empty).
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
# second execution the columns are already gone and a plain drop() would raise
# KeyError.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
print("Removed 'id' and 'Unnamed: 32' columns.")
print(data.head())  # Output

Removed 'id' and 'Unnamed: 32' columns.
  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M        17.99         10.38          122.80     1001.0   
1         M        20.57         17.77          132.90     1326.0   
2         M        19.69         21.25          130.00     1203.0   
3         M        11.42         20.38           77.58      386.1   
4         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0

# No. 2

In [18]:
# Step 2: Encode the 'diagnosis' column (M -> 1, B -> 0).
# Only map while the column still holds the original letter labels. `data` is
# updated in place, so re-running an unguarded map() on the already-encoded 0/1
# values would find no matching key and turn the whole column into NaN.
if data['diagnosis'].isin(['M', 'B']).any():
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
print("Encoded 'diagnosis' column.")
print(data.head())  # Output

Encoded 'diagnosis' column.
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419 

# No. 3

In [21]:
# Step 3: Standardize all feature columns (everything except 'diagnosis').
X = data.drop(columns=['diagnosis'])  # All features
y = data['diagnosis']  # Target

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split performed later, so held-out rows contribute to
# the mean and standard deviation. Confirm this is acceptable for the exercise;
# otherwise fit on the training rows only and reuse the fitted scaler.
scaler = StandardScaler()
# By default fit_transform returns a plain NumPy array; restore the column
# names and the original row index so X_scaled stays label-aligned with y.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
print("Standardized numerical columns.")
print(X_scaled.head())  # Output

Standardized numerical columns.
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   fractal_dimension_mean  ...  radius_worst  texture_worst 

# No. 4

In [30]:
# Step 4: Stratified 80/10/10 split into train, validation and test sets.
# First hold out 20% of the rows, then cut that hold-out in half. Both calls
# stratify on the label so every subset keeps the original class ratio, and
# both use a fixed random_state so the partition is reproducible.
# (train_test_split is already imported in the first cell.)
X_train, X_unseen, y_train, y_unseen = train_test_split(
    X_scaled, y, test_size=0.2, random_state=0, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_unseen, y_unseen, test_size=0.5, random_state=0, stratify=y_unseen
)

# Sanity check: the three subsets together must account for every row.
assert len(X_train) + len(X_val) + len(X_test) == len(X_scaled)

print(f'Jumlah label data asli:\n{y.value_counts()}')
print(f'Jumlah label data train:\n{y_train.value_counts()}')
print(f'Jumlah label data val:\n{y_val.value_counts()}')
print(f'Jumlah label data test:\n{y_test.value_counts()}')

Jumlah label data asli:
diagnosis
0    357
1    212
Name: count, dtype: int64
Jumlah label data train:
diagnosis
0    285
1    170
Name: count, dtype: int64
Jumlah label data val:
diagnosis
0    36
1    21
Name: count, dtype: int64
Jumlah label data test:
diagnosis
0    36
1    21
Name: count, dtype: int64
