In [None]:
# Import library
from google.colab import drive

# Mount  Google Drive /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import library Pandas
import pandas as pd

In [None]:
#read file CSV
data = pd.read_csv('/content/drive/MyDrive/Dataset/data_normalization.csv')
data.head()

Unnamed: 0,pH,TSS,DO,BOD,COD,Nitrat,FecalColiform,Fosfat,IP,Class
0,0.578956,0.019169,0.783274,0.030936,0.003576,0.205533,0.001842,0.002419,5.12,3
1,0.623277,0.025559,0.794816,0.068394,0.038253,0.352586,0.003038,0.000419,5.91,3
2,0.512476,0.031949,0.462673,0.107343,0.097854,0.237999,0.009777,0.01411,7.76,3
3,0.54941,0.038339,0.406247,0.120947,0.119528,0.23036,0.032603,0.003295,9.61,3
4,0.431222,0.003195,0.874324,0.023295,0.0,0.197894,0.000321,0.006608,2.44,2


In [None]:
import statsmodels.api as sm
import pandas as pd

def forward_selection(X, y, alpha=0.05):
    selected_features = []
    pvalues_at_each_step = []  # Added to store p-values at each step
    remaining_features = set(X.columns)

    while remaining_features:
        best_pvalue = float('inf')
        best_feature = None

        for feature in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[selected_features + [feature]])).fit()
            pvalue = model.pvalues[feature]

            if pvalue < best_pvalue:
                best_pvalue = pvalue
                best_feature = feature

        if best_pvalue < alpha:  # Allow customization of significance level
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)

            # Store p-value at each step
            pvalues_at_each_step.append((best_feature, best_pvalue))

            # Print the output at each step
            print(f"Selected Features: {selected_features}")
            print(f"Best Feature: {best_feature}")
            print(f"Best P-value: {best_pvalue}")
            print("----")
        else:
            break

    return selected_features, pvalues_at_each_step

# Your data
X = data[['pH', 'TSS', 'DO', 'BOD', 'COD', 'Nitrat', 'FecalColiform', 'Fosfat', 'IP']]
y = data['Class']

# Example usage
selected_features, pvalues_at_each_step = forward_selection(X, y, alpha=0.01)  # Example with alpha=0.01
print("Final Selected Features:", selected_features)
print("P-values at Each Step:", pvalues_at_each_step)


Selected Features: ['IP']
Best Feature: IP
Best P-value: 0.0
----
Selected Features: ['IP', 'BOD']
Best Feature: BOD
Best P-value: 1.6323411796494766e-26
----
Selected Features: ['IP', 'BOD', 'COD']
Best Feature: COD
Best P-value: 1.3790394904855748e-29
----
Selected Features: ['IP', 'BOD', 'COD', 'FecalColiform']
Best Feature: FecalColiform
Best P-value: 3.740826979044184e-22
----
Final Selected Features: ['IP', 'BOD', 'COD', 'FecalColiform']
P-values at Each Step: [('IP', 0.0), ('BOD', 1.6323411796494766e-26), ('COD', 1.3790394904855748e-29), ('FecalColiform', 3.740826979044184e-22)]


In [None]:
import statsmodels.api as sm
import pandas as pd

def backward_selection(X, y, alpha=0.05):
    features = list(X.columns)

    while len(features) > 0:
        model = sm.OLS(y, sm.add_constant(X[features])).fit()
        max_pvalue = model.pvalues.drop('const').max()

        if max_pvalue >= alpha:  # Hapus fitur jika p-value terbesar >= alpha
            remove_feature = model.pvalues.idxmax()
            features.remove(remove_feature)
        else:
            break

    return features

# Your data
X = data[['pH', 'TSS', 'DO', 'BOD', 'COD', 'Nitrat', 'FecalColiform', 'Fosfat', 'IP']]
y = data['Class']

# Example usage
selected_features_backward = backward_selection(X, y, alpha=0.01)  # Example with alpha=0.01
X_backward_selected = X[selected_features_backward]
print("Final Selected Features:", selected_features_backward)
print("X_backward_selected:")
print(X_backward_selected.head())


Final Selected Features: ['BOD', 'COD', 'FecalColiform', 'IP']
X_backward_selected:
        BOD       COD  FecalColiform    IP
0  0.030936  0.003576       0.001842  5.12
1  0.068394  0.038253       0.003038  5.91
2  0.107343  0.097854       0.009777  7.76
3  0.120947  0.119528       0.032603  9.61
4  0.023295  0.000000       0.000321  2.44


In [None]:
X = data[['BOD', 'COD', 'FecalColiform', 'IP']]
y = data['Class']

# Menampilkan beberapa baris pertama dari X dan y untuk melihat data
print("Fitur (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())

Fitur (X):
        BOD       COD  FecalColiform    IP
0  0.030936  0.003576       0.001842  5.12
1  0.068394  0.038253       0.003038  5.91
2  0.107343  0.097854       0.009777  7.76
3  0.120947  0.119528       0.032603  9.61
4  0.023295  0.000000       0.000321  2.44

Target (y):
0    3
1    3
2    3
3    3
4    2
Name: Class, dtype: int64


In [None]:
# Import library untuk pembagian data menjadi data pelatihan dan pengujian
from sklearn.model_selection import train_test_split

# Memisahkan data menjadi data pelatihan dan pengujian dengan rasio 10% pengujian dan 90% pelatihan
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)

# Menampilkan distribusi kelas dalam set pelatihan
print("Distribusi Kelas dalam Set Pelatihan:")
print(y_train.value_counts())

# Menampilkan distribusi kelas dalam set pengujian
print("\nDistribusi Kelas dalam Set Pengujian:")
print(y_test.value_counts())

Distribusi Kelas dalam Set Pelatihan:
Class
2    142
1    103
3      5
Name: count, dtype: int64

Distribusi Kelas dalam Set Pengujian:
Class
2    1354
1     845
3      34
4      17
Name: count, dtype: int64


In [None]:
# Save features X to CSV
X.to_csv('X.csv', index=False)

# Save target y to CSV
y.to_csv('y.csv', index=False, header=True)
