In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif

def calculate_gain_ratio(X, y, n_bins=10, random_state=None):
    """
    Calculate the Gain Ratio for each feature.

    Gain ratio is the information gain of a feature with respect to the
    target, divided by the feature's intrinsic value (split information),
    i.e. the Shannon entropy of the feature's own value distribution.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class labels, forwarded unchanged to ``mutual_info_classif``.
    n_bins : int, default=10
        A feature with more than ``n_bins`` distinct values is grouped into
        ``n_bins`` equal-width bins before its entropy is computed; a feature
        with fewer distinct values uses its exact value frequencies.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_classif`` so that its estimate (and hence
        the resulting feature ranking) can be made reproducible.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Gain ratio per feature. Features whose intrinsic value is zero
        (a single distinct value / single occupied bin) get 0.
    """
    X = np.asarray(X, dtype=float)

    # scikit-learn documents the mutual information estimate as being in nats;
    # convert to bits so numerator and denominator use the same unit.
    info_gain = mutual_info_classif(X, y, random_state=random_state) / np.log(2)

    n_samples, n_features = X.shape
    intrinsic_value = np.zeros(n_features)
    for j in range(n_features):
        column = X[:, j]
        distinct, counts = np.unique(column, return_counts=True)
        if distinct.size > n_bins:
            # Many distinct values: exact frequencies would make the entropy
            # approach log2(n_samples), so bin the values first.
            counts, _ = np.histogram(column, bins=n_bins)
        # Empty bins contribute nothing to the entropy (0 * log 0 := 0).
        probs = counts[counts > 0] / n_samples
        intrinsic_value[j] = -np.sum(probs * np.log2(probs))

    # A feature with zero split information cannot separate anything; report 0
    # instead of dividing an MI estimate by (almost) zero.
    gain_ratio = np.zeros_like(info_gain)
    informative = intrinsic_value > 0
    gain_ratio[informative] = info_gain[informative] / intrinsic_value[informative]
    return gain_ratio

In [None]:
# Load the Large Class smell dataset: every column except the last is a
# candidate feature, the last column is the label.
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Dataset/Python_LargeClassSmell_Dataset.csv')
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Score each feature column and tabulate the scores next to the column names.
gain_ratio = calculate_gain_ratio(X.values, y)
features_gain_ratio = pd.DataFrame({'Feature': X.columns, 'Gain_Ratio': gain_ratio})

# Keep only the features whose score is at least the mean score.
average_gain_ratio = gain_ratio.mean()
above_average = features_gain_ratio['Gain_Ratio'] >= average_gain_ratio
selected_features = features_gain_ratio[above_average]
selected_columns = selected_features['Feature'].values
X_selected = X[selected_columns]

# Min-max scale the retained features and rebuild a labelled frame.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_selected)
X_scaled_df = pd.DataFrame(X_scaled, columns=selected_columns)

# Re-attach the label under the column name 'target' and write the result.
X_scaled_df['target'] = y.values
output_csv = 'Python_LargeClassSmell_Dataset_NormalizedAndGainRatio.csv'
X_scaled_df.to_csv(output_csv, index=False)
print(f"Scaled dataset saved as '{output_csv}'")

In [None]:
# Load the Long Method smell dataset and split it positionally:
# all leading columns are features, the final column is the label.
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Dataset/Python_LongMethodSmell_Dataset.csv')
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# One gain-ratio score per feature column, paired with the column name.
gain_ratio = calculate_gain_ratio(X.values, y)
features_gain_ratio = pd.DataFrame({'Feature': X.columns, 'Gain_Ratio': gain_ratio})

# Selection threshold is the mean score; features at or above it are kept.
average_gain_ratio = gain_ratio.mean()
meets_threshold = features_gain_ratio['Gain_Ratio'] >= average_gain_ratio
selected_features = features_gain_ratio.loc[meets_threshold]
kept_columns = selected_features['Feature'].values
X_selected = X[kept_columns]

# Fit the min-max scaler on the kept features and wrap the result in a frame
# that carries the original column names.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_selected)
X_scaled_df = pd.DataFrame(X_scaled, columns=kept_columns)

# Append the label as 'target', save to CSV, and report the file name.
X_scaled_df['target'] = y.values
output_csv = 'Python_LongMethodSmell_Dataset_NormalizedAndGainRatio.csv'
X_scaled_df.to_csv(output_csv, index=False)
print(f"Scaled dataset saved as '{output_csv}'")