In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [2]:
# Helper that reads a CSV file into a DataFrame
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return its contents.

    Parameters
    ----------
    file_path
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the file, using its defaults.
    """
    return pd.read_csv(file_path)

# Load the raw dataset.
# The path is a raw string so the backslashes are taken literally; in a normal
# string "\d" is an invalid escape sequence that newer Python versions warn about.
# NOTE(review): backslash separators make this path Windows-specific.
dataframe = load_data(r"..\data\data_1.csv")

In [3]:
# Separate the label column from the features.
# "Id" is dropped together with the label, so it is not used as a feature.
target = "quality"
Y = dataframe[target]
X = dataframe.drop(columns=[target, "Id"])


Standardization: rescale each feature to mean 0 and variance 1

In [4]:
# Learn the scaling parameters from the feature matrix, then apply them to
# that same matrix to obtain the standardized features.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

Data Augmentation: SMOTE (Synthetic Minority Over-sampling Technique)

In [5]:
# Oversample with SMOTE so that the classes end up balanced.
# The resampling runs on the full standardized feature matrix and label column;
# random_state is fixed so the synthetic samples are reproducible.
# NOTE(review): no train/test split is visible before this step - if a held-out
# set is later carved from the saved file, synthetic samples may leak into it.
smote = SMOTE(
    random_state=42,
    sampling_strategy='auto',
)
X_resampled, Y_resampled = smote.fit_resample(X_scaled, Y)

In [6]:
# Tally the labels before and after oversampling, then report both counts
original_counts = Counter(Y)
resampled_counts = Counter(Y_resampled)
print("Original class distribution:", original_counts)
print("Class distribution after SMOTE:", resampled_counts)

Original class distribution: Counter({5: 483, 6: 462, 7: 143, 4: 33, 8: 16, 3: 6})
Class distribution after SMOTE: Counter({5: 483, 6: 483, 7: 483, 4: 483, 8: 483, 3: 483})


In [7]:
# Rebuild a labelled DataFrame: the resampled features under the original
# feature column names, with the resampled target appended as the last column.
augmented_data = pd.DataFrame(data=X_resampled, columns=X.columns).assign(
    **{target: Y_resampled}
)

In [8]:
# Randomly permute all rows (fixed seed for reproducibility) and renumber the
# index from 0 so the old row order is not kept in the index.
augmented_data = (
    augmented_data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [9]:
# Save the augmented DataFrame to a new CSV file (without the index column).
# The destination is defined once so the confirmation message always reports
# the path that was actually written, instead of a second hand-typed copy.
# NOTE(review): backslash separators make this path Windows-specific.
output_path = r'..\data\data_1_processed.csv'
augmented_data.to_csv(output_path, index=False)
print(f"Augmented data saved to {output_path}")

Augmented data saved to ..\data\data_1_processed.csv
