<a href="https://colab.research.google.com/github/hammcoding/data-mining/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the raw tab-separated log. The file has no header row, so the four
# column names are supplied explicitly.
file_path = 'data-01'
column_names = ['Date', 'Time', 'Code', 'Value']
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=column_names,
    skipinitialspace=True,
)

# Force the measurement column to a numeric dtype; entries that cannot be
# parsed become NaN instead of raising.
df = df.assign(Value=pd.to_numeric(df['Value'], errors='coerce'))

print(f"Bentuk awal data: {df.shape}")
print("Contoh data mentah:\n", df.head())
print("-" * 50)

# STEP 1: FEATURE ENGINEERING (turn the timestamp into features)

# Join the date and time strings and parse them; rows that cannot be parsed
# become NaT instead of raising.
timestamps = pd.to_datetime(df['Date'] + ' ' + df['Time'], errors='coerce')

# Keep only two derived time features: day of week (Monday=0) and hour of day.
df['DayOfWeek'] = timestamps.dt.dayofweek
df['HourOfDay'] = timestamps.dt.hour

# The raw date/time strings are no longer needed once the features exist.
df.drop(columns=['Date', 'Time'], inplace=True)

print("Fitur waktu baru ditambahkan.")
print("Contoh data setelah feature engineering:\n", df.head())
print("-" * 50)



# STEP 2: DATA CLEANING (drop rows with missing values)

# Remove every row that has a missing value in any column.
df.dropna(inplace=True)

# Feature matrix and target vector as plain NumPy arrays.
feature_columns = ['Code', 'DayOfWeek', 'HourOfDay']
X = df[feature_columns].to_numpy()
y = df['Value'].to_numpy()

print(f"Jumlah baris setelah dibersihkan: {df.shape[0]}")
print("-" * 50)



# STEP 3: ENCODING CATEGORICAL DATA (attributes)

# One-hot encode columns 0 and 1 of X. The remaining column is passed through
# unchanged and ends up after the encoded columns in the output.
encoder = OneHotEncoder(handle_unknown='ignore')
ct = ColumnTransformer(
    transformers=[('onehot', encoder, [0, 1])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

# The transformer may hand back a sparse matrix; densify it when that happens.
X = X.toarray() if hasattr(X, 'toarray') else X

print(f"Bentuk X setelah One-Hot Encoding: {X.shape}")
print("-" * 50)



# STEP 4: TRAIN/TEST SPLIT

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Train Set Size (X_train): {n_train_rows} baris")
print(f"Test Set Size (X_test): {n_test_rows} baris")
print("-" * 50)


# STEP 5: FEATURE SCALING

# Use one scaler for the feature column and a separate one for the target.
# Re-fitting a single scaler on y would overwrite the statistics it learned
# from X, leaving it unusable for transforming or inverse-transforming the
# feature afterwards.
sc = StandardScaler()
sc_y = StandardScaler()

# Only the last column of X is standardized (labelled HourOfDay in the print
# below); every column before it is left untouched.
last_col_idx = X_train.shape[1] - 1

# Fit on the training rows only, then apply the same statistics to the test
# rows so no information from the test set is used for fitting.
X_train[:, last_col_idx:] = sc.fit_transform(X_train[:, last_col_idx:])
X_test[:, last_col_idx:] = sc.transform(X_test[:, last_col_idx:])

# The scaler expects 2-D input, so the target is reshaped into a single
# column and flattened back to 1-D afterwards.
y_train_scaled = sc_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = sc_y.transform(y_test.reshape(-1, 1)).flatten()

print("Scaling selesai. Distribusi data siap untuk model.")
print(f"Contoh HourOfDay (Scaled) di X_train:\n{X_train[:5, last_col_idx:]}")
print(f"Contoh Target Value (Scaled) di y_train:\n{y_train_scaled[:5]}")
# =====================================================================

Bentuk awal data: (943, 4)
Contoh data mentah:
          Date   Time  Code  Value
0  04-21-1991   9:09    58    100
1  04-21-1991   9:09    33      9
2  04-21-1991   9:09    34     13
3  04-21-1991  17:08    62    119
4  04-21-1991  17:08    33      7
--------------------------------------------------
Fitur waktu baru ditambahkan.
Contoh data setelah feature engineering:
    Code  Value  DayOfWeek  HourOfDay
0    58    100          6          9
1    33      9          6          9
2    34     13          6          9
3    62    119          6         17
4    33      7          6         17
--------------------------------------------------
Jumlah baris setelah dibersihkan: 943
--------------------------------------------------
Bentuk X setelah One-Hot Encoding: (943, 15)
--------------------------------------------------
Train Set Size (X_train): 754 baris
Test Set Size (X_test): 189 baris
--------------------------------------------------
Scaling selesai. Distribusi data siap untuk mo