# CNN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Input, BatchNormalization, GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from uuid import uuid4

# Thiết lập seed để tái lập
np.random.seed(42)
tf.random.set_seed(42)

# Hàm tính các chỉ số đánh giá
def calculate_mape(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_mmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs(((y_true[mask] - y_pred[mask]) / y_true[mask])))

def calculate_mdmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_pred25(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    mre = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(mre <= 0.25) * 100

# Đọc dữ liệu đã tiền xử lý
df = pd.read_csv('albrecht_cleaned.csv')

# Kiểm tra dữ liệu
print("=== Kiểm tra dữ liệu ===")
print("Kích thước:", df.shape)
print("Các cột:", df.columns.tolist())
print("Mẫu 5 hàng đầu tiên:")
print(df.head())
print("\nThông tin dữ liệu:")
print(df.info())

# Chọn đặc trưng và biến mục tiêu
features = [col for col in df.columns if col not in ['Effort']]
X = df[features].values
y = df['Effort'].values  # Sử dụng Effort đã được scale từ file test.csv

# Tăng cường dữ liệu bằng nhiễu Gaussian
def add_gaussian_noise(X, noise_factor=0.05):
    noise = np.random.normal(loc=0, scale=noise_factor, size=X.shape)
    return X + noise

X_augmented = X.copy()
y_augmented = y.copy()
for _ in range(2):  # Tạo thêm 2 bản sao với nhiễu
    X_noisy = add_gaussian_noise(X, noise_factor=0.05)
    X_augmented = np.vstack((X_augmented, X_noisy))
    y_augmented = np.hstack((y_augmented, y))

print("\n=== Sau khi tăng cường dữ liệu bằng nhiễu Gaussian ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Reshape dữ liệu thành dạng (samples, features, 1) cho Conv1D
X_augmented = X_augmented.reshape(X_augmented.shape[0], X_augmented.shape[1], 1)

print("\n=== Kích thước dữ liệu sau reshape ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Chia tập train/test
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.15, random_state=42)

print(f"\n✅ Kích thước dữ liệu CNN:")
print(f" - X_train: {X_train.shape}")
print(f" - X_test : {X_test.shape}")

# Xây dựng mô hình CNN với Conv1D, BatchNormalization, GlobalAveragePooling1D
def build_cnn_model(filters=8, kernel_size=2, l2_reg=0.01, dense_units=16, dropout_rate=0.3, learning_rate=0.001):
    l2_reg = max(l2_reg, 0.001)
    model = Sequential([
        Input(shape=(X_train.shape[1], X_train.shape[2])),
        Conv1D(filters, kernel_size, activation='relu', padding='same', kernel_regularizer=l2(l2_reg)),
        BatchNormalization(),
        Conv1D(filters, kernel_size, activation='relu', padding='same', kernel_regularizer=l2(l2_reg)),
        BatchNormalization(),
        GlobalAveragePooling1D(),
        Dense(dense_units, activation='relu', kernel_regularizer=l2(l2_reg)),
        Dropout(dropout_rate),
        Dense(1, activation='linear')
    ])
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    return model

# Không gian siêu tham số
param_bounds = {
    'filters': (4, 32),
    'kernel_size': (1, 3),
    'l2_reg': (0.001, 0.1),
    'dense_units': (8, 64),
    'dropout_rate': (0.2, 0.5),
    'learning_rate': (1e-4, 1e-2),
    'batch_size': (8, 32),
    'epochs': (50, 150)
}

# Hàm mã hóa & giải mã particle
def random_particle():
    return np.array([
        np.random.randint(param_bounds['filters'][0], param_bounds['filters'][1] + 1),
        np.random.randint(param_bounds['kernel_size'][0], param_bounds['kernel_size'][1] + 1),
        np.random.uniform(param_bounds['l2_reg'][0], param_bounds['l2_reg'][1]),
        np.random.randint(param_bounds['dense_units'][0], param_bounds['dense_units'][1] + 1),
        np.random.uniform(param_bounds['dropout_rate'][0], param_bounds['dropout_rate'][1]),
        np.random.uniform(param_bounds['learning_rate'][0], param_bounds['learning_rate'][1]),
        np.random.randint(param_bounds['batch_size'][0], param_bounds['batch_size'][1] + 1),
        np.random.randint(param_bounds['epochs'][0], param_bounds['epochs'][1] + 1)
    ])

def decode_particle(particle):
    params = {
        'filters': int(particle[0]),
        'kernel_size': int(particle[1]),
        'l2_reg': particle[2],
        'dense_units': int(particle[3]),
        'dropout_rate': particle[4],
        'learning_rate': particle[5],
        'batch_size': int(particle[6]),
        'epochs': int(particle[7])
    }
    # Đảm bảo l2_reg không âm
    params['l2_reg'] = max(params['l2_reg'], 0.001)
    params['l2_reg'] = min(params['l2_reg'], param_bounds['l2_reg'][1])  # Giới hạn trên
    return params

# Hàm fitness cho PSO
def fitness_function(particle):
    params = decode_particle(particle)
    model = build_cnn_model(**{k: v for k, v in params.items() if k != 'batch_size' and k != 'epochs'})
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        
        model.fit(X_tr, y_tr, epochs=params['epochs'], batch_size=params['batch_size'], 
                validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
        y_pred = model.predict(X_val, verbose=0)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

# Triển khai PSO
def run_pso_cnn(num_particles=10, max_iter=15):
    dim = len(param_bounds)
    bounds_array = np.array(list(param_bounds.values()))
    
    particles = [random_particle() for _ in range(num_particles)]
    velocities = [np.zeros(dim) for _ in range(num_particles)]
    
    p_best_positions = particles.copy()
    p_best_scores = [fitness_function(p) for p in particles]
    
    g_best_index = np.argmin(p_best_scores)
    g_best_position = p_best_positions[g_best_index]
    g_best_score = p_best_scores[g_best_index]
    
    w, c1, c2 = 0.5, 1.5, 1.5
    
    for iter in range(max_iter):
        print(f"\n🔁 Iteration {iter + 1}/{max_iter}")
        for i in range(num_particles):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)
            
            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (p_best_positions[i] - particles[i])
                + c2 * r2 * (g_best_position - particles[i])
            )
            
            particles[i] += velocities[i]
            particles[i] = np.clip(particles[i], bounds_array[:, 0], bounds_array[:, 1])
            # Đảm bảo l2_reg không âm và trong giới hạn
            particles[i][2] = max(particles[i][2], param_bounds['l2_reg'][0])
            particles[i][2] = min(particles[i][2], param_bounds['l2_reg'][1])
            particles[i][4] = np.clip(particles[i][4], param_bounds['dropout_rate'][0], param_bounds['dropout_rate'][1])
            
            score = fitness_function(particles[i])
            
            if score < p_best_scores[i]:
                p_best_scores[i] = score
                p_best_positions[i] = particles[i]
                
            if score < g_best_score:
                g_best_score = score
                g_best_position = particles[i]
                print(f"✅ Cập nhật g_best: Score = {g_best_score:.4f}")
    
    return g_best_position, g_best_score

# Chạy PSO
print("🚀 Chạy PSO để tìm siêu tham số tối ưu...")
best_particle, best_score = run_pso_cnn(num_particles=10, max_iter=15)
best_params = decode_particle(best_particle)
print(f"🏆 Siêu tham số tốt nhất: {best_params}")
print(f"📉 Score tốt nhất: {best_score:.4f}")

# Huấn luyện mô hình tối ưu
model_optimal = build_cnn_model(**{k: v for k, v in best_params.items() if k != 'batch_size' and k != 'epochs'})
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores_optimal = []
history = None

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n📂 Fold {fold + 1}/5")
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    history = model_optimal.fit(X_tr, y_tr, epochs=best_params['epochs'], batch_size=best_params['batch_size'], 
                            validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
    y_pred = model_optimal.predict(X_val, verbose=0)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores_optimal.append(rmse)
    print(f"✅ Fold {fold + 1} RMSE: {rmse:.4f}")

print(f"\n📊 RMSE trung bình qua 5 folds: {np.mean(rmse_scores_optimal):.4f}")

# Đánh giá trên tập test
y_pred = model_optimal.predict(X_test, verbose=0).flatten()

# Tính các chỉ số đánh giá trên giá trị đã scale
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = calculate_mape(y_test, y_pred)
mmre = calculate_mmre(y_test, y_pred)
mdmre = calculate_mdmre(y_test, y_pred)
pred25 = calculate_pred25(y_test, y_pred)

# Đánh giá bootstrap
n_bootstraps = 500
bootstrap_metrics = {'mse': [], 'mae': [], 'r2': [], 'mape': [], 'mmre': [], 'mdmre': [], 'pred25': []}

for _ in range(n_bootstraps):
    indices = np.random.choice(len(y_test), len(y_test), replace=True)
    y_test_boot = y_test[indices]
    y_pred_boot = y_pred[indices]
    bootstrap_metrics['mse'].append(mean_squared_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['mae'].append(mean_absolute_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['r2'].append(r2_score(y_test_boot, y_pred_boot))
    bootstrap_metrics['mape'].append(calculate_mape(y_test_boot, y_pred_boot))
    bootstrap_metrics['mmre'].append(calculate_mmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['mdmre'].append(calculate_mdmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['pred25'].append(calculate_pred25(y_test_boot, y_pred_boot))

# In kết quả
print("\n📈 Kết quả đánh giá bootstrap (trên giá trị đã scale):")
print(f"📌 MSE     : {np.mean(bootstrap_metrics['mse']):.4f} ± {np.std(bootstrap_metrics['mse']):.4f}")
print(f"📌 RMSE    : {np.mean(np.sqrt(bootstrap_metrics['mse'])):.4f} ± {np.std(np.sqrt(bootstrap_metrics['mse'])):.4f}")
print(f"📌 MAE     : {np.mean(bootstrap_metrics['mae']):.4f} ± {np.std(bootstrap_metrics['mae']):.4f}")
print(f"📌 R²      : {np.mean(bootstrap_metrics['r2']):.4f} ± {np.std(bootstrap_metrics['r2']):.4f}")
print(f"📌 MAPE    : {np.mean(bootstrap_metrics['mape']):.2f}% ± {np.std(bootstrap_metrics['mape']):.2f}%")
print(f"📌 MMRE    : {np.mean(bootstrap_metrics['mmre']):.4f} ± {np.std(bootstrap_metrics['mmre']):.4f}")
print(f"📌 MdMRE   : {np.mean(bootstrap_metrics['mdmre']):.4f} ± {np.std(bootstrap_metrics['mdmre']):.4f}")
print(f"📌 PRED(25): {np.mean(bootstrap_metrics['pred25']):.2f}% ± {np.std(bootstrap_metrics['pred25']):.2f}%")

# Lưu kết quả đánh giá
results = {
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'R2': r2,
    'MAPE': mape,
    'MMRE': mmre,
    'MdMRE': mdmre,
    'PRED(25)': pred25,
    'Bootstrap_MSE_Mean': np.mean(bootstrap_metrics['mse']),
    'Bootstrap_MSE_Std': np.std(bootstrap_metrics['mse']),
    'Bootstrap_MAE_Mean': np.mean(bootstrap_metrics['mae']),
    'Bootstrap_MAE_Std': np.std(bootstrap_metrics['mae']),
    'Bootstrap_R2_Mean': np.mean(bootstrap_metrics['r2']),
    'Bootstrap_R2_Std': np.std(bootstrap_metrics['r2']),
    'Bootstrap_MAPE_Mean': np.mean(bootstrap_metrics['mape']),
    'Bootstrap_MAPE_Std': np.std(bootstrap_metrics['mape']),
    'Bootstrap_MMRE_Mean': np.mean(bootstrap_metrics['mmre']),
    'Bootstrap_MMRE_Std': np.std(bootstrap_metrics['mmre']),
    'Bootstrap_MdMRE_Mean': np.mean(bootstrap_metrics['mdmre']),
    'Bootstrap_MdMRE_Std': np.std(bootstrap_metrics['mdmre']),
    'Bootstrap_PRED25_Mean': np.mean(bootstrap_metrics['pred25']),
    'Bootstrap_PRED25_Std': np.std(bootstrap_metrics['pred25'])
}

results_df = pd.DataFrame([results])
results_df.to_csv('cnn_evaluation_results_scaled.csv', index=False)
print("\nĐã lưu kết quả đánh giá vào 'cnn_evaluation_results_scaled.csv'")

# Trực quan hóa kết quả
plt.figure(figsize=(15, 12))

# Loss
plt.subplot(2, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Huber Loss')
plt.legend()

# Predicted vs Actual
plt.subplot(2, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.title('Predicted vs Actual Effort (Scaled)')
plt.xlabel('Actual Effort (Scaled)')
plt.ylabel('Predicted Effort (Scaled)')

# Error Distribution
errors = y_test - y_pred
plt.subplot(2, 2, 3)
sns.histplot(errors, kde=True)
plt.title('Error Distribution')
plt.xlabel('Prediction Error (Scaled)')
plt.ylabel('Frequency')

# Bootstrap RMSE
plt.subplot(2, 2, 4)
sns.boxplot(y=np.sqrt(bootstrap_metrics['mse']))
plt.title('Bootstrap RMSE Distribution (Scaled)')
plt.ylabel('RMSE (Scaled)')

plt.tight_layout()
plt.savefig('cnn_visualization_results_scaled.png')
plt.close()
print("\nĐã lưu hình ảnh trực quan hóa vào 'cnn_visualization_results_scaled.png'")

=== Kiểm tra dữ liệu ===
Kích thước: (24, 7)
Các cột: ['Input', 'Output', 'Inquiry', 'File', 'FPAdj', 'RawFPcounts', 'Effort']
Mẫu 5 hàng đầu tiên:
      Input    Output   Inquiry      File     FPAdj  RawFPcounts    Effort
0 -0.544978  2.676655  2.425098  2.235928  0.078752     2.030029  1.905014
1  2.060336  1.583672  2.425098  1.423982  0.078752     2.030029  1.905014
2  1.835512 -0.602296 -1.141891 -0.347538 -1.433293    -0.158757 -0.364603
3  0.248519  0.413717  0.417448 -0.347538  1.212786     0.236318  0.476385
4 -1.338474  0.690812 -1.063924 -0.568978 -0.677270    -0.336098  1.123946

Thông tin dữ liệu:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Input        24 non-null     float64
 1   Output       24 non-null     float64
 2   Inquiry      24 non-null     float64
 3   File         24 non-null     float64
 4   FPAdj        24 non-nul

: 

# MLP



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from uuid import uuid4

# Thiết lập seed để tái lập
np.random.seed(42)
tf.random.set_seed(42)

# Hàm tính các chỉ số đánh giá
def calculate_mape(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_mmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs(((y_true[mask] - y_pred[mask]) / y_true[mask])))

def calculate_mdmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_pred25(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    mre = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(mre <= 0.25) * 100

# Đọc dữ liệu đã tiền xử lý
df = pd.read_csv('albrecht_cleaned.csv')

# Kiểm tra dữ liệu
print("=== Kiểm tra dữ liệu ===")
print("Kích thước:", df.shape)
print("Các cột:", df.columns.tolist())
print("Mẫu 5 hàng đầu tiên:")
print(df.head())
print("\nThông tin dữ liệu:")
print(df.info())

# Chọn đặc trưng và biến mục tiêu
features = [col for col in df.columns if col not in ['Project', 'Effort', 'Effort_log']]
X = df[features].values
y = df['Effort'].values  # Sử dụng Effort đã được scale từ file test.csv

# Tăng cường dữ liệu bằng nhiễu Gaussian
def add_gaussian_noise(X, noise_factor=0.05):
    noise = np.random.normal(loc=0, scale=noise_factor, size=X.shape)
    return X + noise

X_augmented = X.copy()
y_augmented = y.copy()
for _ in range(2):  # Tạo thêm 2 bản sao với nhiễu
    X_noisy = add_gaussian_noise(X, noise_factor=0.05)
    X_augmented = np.vstack((X_augmented, X_noisy))
    y_augmented = np.hstack((y_augmented, y))

print("\n=== Sau khi tăng cường dữ liệu bằng nhiễu Gaussian ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Không cần reshape cho MLP vì MLP không yêu cầu dữ liệu 3D
print("\n=== Kích thước dữ liệu sau tăng cường ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Chia tập train/test
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.15, random_state=42)

print(f"\n✅ Kích thước dữ liệu MLP:")
print(f" - X_train: {X_train.shape}")
print(f" - X_test : {X_test.shape}")

# Xây dựng mô hình MLP với Dense, BatchNormalization, Dropout
def build_mlp_model(hidden_layers=2, units_per_layer=32, l2_reg=0.01, dropout_rate=0.3, learning_rate=0.001):
    # Đảm bảo l2_reg không âm
    l2_reg = max(l2_reg, 0.001)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    
    # Thêm các tầng ẩn
    for _ in range(int(hidden_layers)):
        model.add(Dense(int(units_per_layer), activation='relu', kernel_regularizer=l2(l2_reg)))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    
    # Tầng đầu ra
    model.add(Dense(1, activation='linear'))
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    return model

# Không gian siêu tham số cho MLP
param_bounds = {
    'hidden_layers': (1, 4),  # Số tầng ẩn
    'units_per_layer': (16, 128),  # Số đơn vị mỗi tầng
    'l2_reg': (0.001, 0.1),
    'dropout_rate': (0.2, 0.5),
    'learning_rate': (1e-4, 1e-2),
    'batch_size': (16, 64),
    'epochs': (50, 150)
}

# Hàm mã hóa & giải mã particle
def random_particle():
    return np.array([
        np.random.randint(param_bounds['hidden_layers'][0], param_bounds['hidden_layers'][1] + 1),
        np.random.randint(param_bounds['units_per_layer'][0], param_bounds['units_per_layer'][1] + 1),
        np.random.uniform(param_bounds['l2_reg'][0], param_bounds['l2_reg'][1]),
        np.random.uniform(param_bounds['dropout_rate'][0], param_bounds['dropout_rate'][1]),
        np.random.uniform(param_bounds['learning_rate'][0], param_bounds['learning_rate'][1]),
        np.random.randint(param_bounds['batch_size'][0], param_bounds['batch_size'][1] + 1),
        np.random.randint(param_bounds['epochs'][0], param_bounds['epochs'][1] + 1)
    ])

def decode_particle(particle):
    params = {
        'hidden_layers': int(particle[0]),
        'units_per_layer': int(particle[1]),
        'l2_reg': particle[2],
        'dropout_rate': particle[3],
        'learning_rate': particle[4],
        'batch_size': int(particle[5]),
        'epochs': int(particle[6])
    }
    # Đảm bảo l2_reg không âm
    params['l2_reg'] = max(params['l2_reg'], 0.001)
    params['l2_reg'] = min(params['l2_reg'], param_bounds['l2_reg'][1])
    return params

# Hàm fitness cho PSO
def fitness_function(particle):
    params = decode_particle(particle)
    model = build_mlp_model(**{k: v for k, v in params.items() if k != 'batch_size' and k != 'epochs'})
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        
        model.fit(X_tr, y_tr, epochs=params['epochs'], batch_size=params['batch_size'], 
                  validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
        y_pred = model.predict(X_val, verbose=0)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

# Triển khai PSO
def run_pso_mlp(num_particles=10, max_iter=10):
    dim = len(param_bounds)
    bounds_array = np.array(list(param_bounds.values()))
    
    particles = [random_particle() for _ in range(num_particles)]
    velocities = [np.zeros(dim) for _ in range(num_particles)]
    
    p_best_positions = particles.copy()
    p_best_scores = [fitness_function(p) for p in particles]
    
    g_best_index = np.argmin(p_best_scores)
    g_best_position = p_best_positions[g_best_index]
    g_best_score = p_best_scores[g_best_index]
    
    w, c1, c2 = 0.5, 1.5, 1.5
    
    for iter in range(max_iter):
        print(f"\n🔁 Iteration {iter + 1}/{max_iter}")
        for i in range(num_particles):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)
            
            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (p_best_positions[i] - particles[i])
                + c2 * r2 * (g_best_position - particles[i])
            )
            
            particles[i] += velocities[i]
            particles[i] = np.clip(particles[i], bounds_array[:, 0], bounds_array[:, 1])
            # Đảm bảo l2_reg không âm và trong giới hạn
            particles[i][2] = max(particles[i][2], param_bounds['l2_reg'][0])
            particles[i][2] = min(particles[i][2], param_bounds['l2_reg'][1])
            particles[i][3] = np.clip(particles[i][3], param_bounds['dropout_rate'][0], param_bounds['dropout_rate'][1])
            
            score = fitness_function(particles[i])
            
            if score < p_best_scores[i]:
                p_best_scores[i] = score
                p_best_positions[i] = particles[i]
                
            if score < g_best_score:
                g_best_score = score
                g_best_position = particles[i]
                print(f"✅ Cập nhật g_best: Score = {g_best_score:.4f}")
    
    return g_best_position, g_best_score

# Chạy PSO
print("🚀 Chạy PSO để tìm siêu tham số tối ưu...")
best_particle, best_score = run_pso_mlp()
best_params = decode_particle(best_particle)
print(f"🏆 Siêu tham số tốt nhất: {best_params}")
print(f"📉 Score tốt nhất: {best_score:.4f}")

# Huấn luyện mô hình tối ưu
model_optimal = build_mlp_model(**{k: v for k, v in best_params.items() if k != 'batch_size' and k != 'epochs'})
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores_optimal = []
history = None

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n📂 Fold {fold + 1}/5")
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    history = model_optimal.fit(X_tr, y_tr, epochs=best_params['epochs'], batch_size=best_params['batch_size'], 
                               validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
    y_pred = model_optimal.predict(X_val, verbose=0)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores_optimal.append(rmse)
    print(f"✅ Fold {fold + 1} RMSE: {rmse:.4f}")

print(f"\n📊 RMSE trung bình qua 5 folds: {np.mean(rmse_scores_optimal):.4f}")

# Đánh giá trên tập test
y_pred = model_optimal.predict(X_test, verbose=0).flatten()

# Tính các chỉ số đánh giá trên giá trị đã scale
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = calculate_mape(y_test, y_pred)
mmre = calculate_mmre(y_test, y_pred)
mdmre = calculate_mdmre(y_test, y_pred)
pred25 = calculate_pred25(y_test, y_pred)

# Đánh giá bootstrap
n_bootstraps = 500
bootstrap_metrics = {'mse': [], 'mae': [], 'r2': [], 'mape': [], 'mmre': [], 'mdmre': [], 'pred25': []}

for _ in range(n_bootstraps):
    indices = np.random.choice(len(y_test), len(y_test), replace=True)
    y_test_boot = y_test[indices]
    y_pred_boot = y_pred[indices]
    bootstrap_metrics['mse'].append(mean_squared_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['mae'].append(mean_absolute_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['r2'].append(r2_score(y_test_boot, y_pred_boot))
    bootstrap_metrics['mape'].append(calculate_mape(y_test_boot, y_pred_boot))
    bootstrap_metrics['mmre'].append(calculate_mmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['mdmre'].append(calculate_mdmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['pred25'].append(calculate_pred25(y_test_boot, y_pred_boot))

# In kết quả
print("\n📈 Kết quả đánh giá bootstrap (trên giá trị đã scale):")
print(f"📌 MSE     : {np.mean(bootstrap_metrics['mse']):.4f} ± {np.std(bootstrap_metrics['mse']):.4f}")
print(f"📌 RMSE    : {np.mean(np.sqrt(bootstrap_metrics['mse'])):.4f} ± {np.std(np.sqrt(bootstrap_metrics['mse'])):.4f}")
print(f"📌 MAE     : {np.mean(bootstrap_metrics['mae']):.4f} ± {np.std(bootstrap_metrics['mae']):.4f}")
print(f"📌 R²      : {np.mean(bootstrap_metrics['r2']):.4f} ± {np.std(bootstrap_metrics['r2']):.4f}")
print(f"📌 MAPE    : {np.mean(bootstrap_metrics['mape']):.2f}% ± {np.std(bootstrap_metrics['mape']):.2f}%")
print(f"📌 MMRE    : {np.mean(bootstrap_metrics['mmre']):.4f} ± {np.std(bootstrap_metrics['mmre']):.4f}")
print(f"📌 MdMRE   : {np.mean(bootstrap_metrics['mdmre']):.4f} ± {np.std(bootstrap_metrics['mdmre']):.4f}")
print(f"📌 PRED(25): {np.mean(bootstrap_metrics['pred25']):.2f}% ± {np.std(bootstrap_metrics['pred25']):.2f}%")

# Lưu kết quả đánh giá
results = {
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'R2': r2,
    'MAPE': mape,
    'MMRE': mmre,
    'MdMRE': mdmre,
    'PRED(25)': pred25,
    'Bootstrap_MSE_Mean': np.mean(bootstrap_metrics['mse']),
    'Bootstrap_MSE_Std': np.std(bootstrap_metrics['mse']),
    'Bootstrap_MAE_Mean': np.mean(bootstrap_metrics['mae']),
    'Bootstrap_MAE_Std': np.std(bootstrap_metrics['mae']),
    'Bootstrap_R2_Mean': np.mean(bootstrap_metrics['r2']),
    'Bootstrap_R2_Std': np.std(bootstrap_metrics['r2']),
    'Bootstrap_MAPE_Mean': np.mean(bootstrap_metrics['mape']),
    'Bootstrap_MAPE_Std': np.std(bootstrap_metrics['mape']),
    'Bootstrap_MMRE_Mean': np.mean(bootstrap_metrics['mmre']),
    'Bootstrap_MMRE_Std': np.std(bootstrap_metrics['mmre']),
    'Bootstrap_MdMRE_Mean': np.mean(bootstrap_metrics['mdmre']),
    'Bootstrap_MdMRE_Std': np.std(bootstrap_metrics['mdmre']),
    'Bootstrap_PRED25_Mean': np.mean(bootstrap_metrics['pred25']),
    'Bootstrap_PRED25_Std': np.std(bootstrap_metrics['pred25'])
}

results_df = pd.DataFrame([results])
results_df.to_csv('mlp_evaluation_results_scaled.csv', index=False)
print("\nĐã lưu kết quả đánh giá vào 'mlp_evaluation_results_scaled.csv'")

# Trực quan hóa kết quả
plt.figure(figsize=(15, 12))

# Loss
plt.subplot(2, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Huber Loss')
plt.legend()

# Predicted vs Actual
plt.subplot(2, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.title('Predicted vs Actual Effort (Scaled)')
plt.xlabel('Actual Effort (Scaled)')
plt.ylabel('Predicted Effort (Scaled)')

# Error Distribution
errors = y_test - y_pred
plt.subplot(2, 2, 3)
sns.histplot(errors, kde=True)
plt.title('Error Distribution')
plt.xlabel('Prediction Error (Scaled)')
plt.ylabel('Frequency')

# Bootstrap RMSE
plt.subplot(2, 2, 4)
sns.boxplot(y=np.sqrt(bootstrap_metrics['mse']))
plt.title('Bootstrap RMSE Distribution (Scaled)')
plt.ylabel('RMSE (Scaled)')

plt.tight_layout()
plt.savefig('mlp_visualization_results_scaled.png')
plt.close()
print("\nĐã lưu hình ảnh trực quan hóa vào 'mlp_visualization_results_scaled.png'")

=== Kiểm tra dữ liệu ===
Kích thước: (24, 7)
Các cột: ['Input', 'Output', 'Inquiry', 'File', 'FPAdj', 'RawFPcounts', 'Effort']
Mẫu 5 hàng đầu tiên:
      Input    Output   Inquiry      File     FPAdj  RawFPcounts    Effort
0 -0.544978  2.676655  2.425098  2.235928  0.078752     2.030029  1.905014
1  2.060336  1.583672  2.425098  1.423982  0.078752     2.030029  1.905014
2  1.835512 -0.602296 -1.141891 -0.347538 -1.433293    -0.158757 -0.364603
3  0.248519  0.413717  0.417448 -0.347538  1.212786     0.236318  0.476385
4 -1.338474  0.690812 -1.063924 -0.568978 -0.677270    -0.336098  1.123946

Thông tin dữ liệu:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Input        24 non-null     float64
 1   Output       24 non-null     float64
 2   Inquiry      24 non-null     float64
 3   File         24 non-null     float64
 4   FPAdj        24 non-nul

# LSTM

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns

# Thiết lập seed để tái lập
np.random.seed(42)
tf.random.set_seed(42)

# Hàm tính các chỉ số đánh giá
def calculate_mape(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_mmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_mdmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_pred25(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    mre = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(mre <= 0.25) * 100

# Tăng cường dữ liệu bằng nhiễu Gaussian
def add_gaussian_noise(X, y, noise_factor=0.01):
    X_noise = X + np.random.normal(loc=0, scale=noise_factor * np.std(X, axis=0), size=X.shape)
    y_noise = y + np.random.normal(loc=0, scale=noise_factor * np.std(y), size=y.shape)
    return X_noise, y_noise

# Đọc dữ liệu
df = pd.read_csv('albrecht_cleaned.csv')

# Xử lý ngoại lai với RobustScaler
feature_scaler = RobustScaler()
target_scaler = RobustScaler()
features = [col for col in df.columns if col != 'Effort']
X = feature_scaler.fit_transform(df[features].values)
y = target_scaler.fit_transform(df[['Effort']].values).flatten()

# Tăng cường dữ liệu
X_augmented = X.copy()
y_augmented = y.copy()
for _ in range(2):  # Tạo 2 bản sao với nhiễu, tăng từ 24 lên 72 mẫu
    X_noise, y_noise = add_gaussian_noise(X, y, noise_factor=0.01)
    X_augmented = np.vstack((X_augmented, X_noise))
    y_augmented = np.hstack((y_augmented, y_noise))

# Reshape dữ liệu cho LSTM: (samples, 1, features)
X_augmented = X_augmented.reshape(X_augmented.shape[0], 1, X_augmented.shape[1])

print("=== Kiểm tra dữ liệu ===")
print("Kích thước ban đầu:", df.shape)
print("Các cột:", df.columns.tolist())
print("Mẫu 5 hàng đầu tiên:")
print(df.head())
print("\n=== Kích thước dữ liệu sau tăng cường và reshape ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Chia tập train/test
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

print(f"\n✅ Kích thước dữ liệu LSTM:")
print(f" - X_train: {X_train.shape}")
print(f" - X_test : {X_test.shape}")

# Xây dựng mô hình LSTM đơn giản
def build_lstm_model(units=16, dropout_rate=0.1, learning_rate=0.001):
    model = Sequential([
        Input(shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(units // 2, activation='relu'),
        Dense(1, activation='linear')
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Không gian siêu tham số
param_bounds = {
    'units': (16, 64),
    'dropout_rate': (0.0, 0.2),
    'learning_rate': (1e-4, 1e-2),
    'batch_size': (4, 16),
    'epochs': (50, 100)
}

# Hàm mã hóa & giải mã particle
def random_particle():
    return np.array([
        np.random.randint(param_bounds['units'][0], param_bounds['units'][1] + 1),
        np.random.uniform(param_bounds['dropout_rate'][0], param_bounds['dropout_rate'][1]),
        np.random.uniform(param_bounds['learning_rate'][0], param_bounds['learning_rate'][1]),
        np.random.randint(param_bounds['batch_size'][0], param_bounds['batch_size'][1] + 1),
        np.random.randint(param_bounds['epochs'][0], param_bounds['epochs'][1] + 1)
    ])

def decode_particle(particle):
    return {
        'units': int(particle[0]),
        'dropout_rate': particle[1],
        'learning_rate': particle[2],
        'batch_size': int(particle[3]),
        'epochs': int(particle[4])
    }

# Hàm fitness cho PSO
def fitness_function(particle):
    params = decode_particle(particle)
    model = build_lstm_model(**{k: v for k, v in params.items() if k != 'batch_size' and k != 'epochs'})
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    pred25_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        
        model.fit(X_tr, y_tr, epochs=params['epochs'], batch_size=params['batch_size'], 
                  validation_data=(X_val, y_val), verbose=0, callbacks=[early_stopping, reduce_lr])
        y_pred = model.predict(X_val, verbose=0).flatten()
        pred25 = calculate_pred25(y_val, y_pred)
        pred25_scores.append(pred25)
    
    return -np.mean(pred25_scores)  # Tối ưu hóa PRED(25)

# Triển khai PSO
def run_pso_lstm(num_particles=10, max_iter=15):
    dim = len(param_bounds)
    bounds_array = np.array(list(param_bounds.values()))
    
    particles = [random_particle() for _ in range(num_particles)]
    velocities = [np.zeros(dim) for _ in range(num_particles)]
    
    p_best_positions = particles.copy()
    p_best_scores = [fitness_function(p) for p in particles]
    
    g_best_index = np.argmin(p_best_scores)
    g_best_position = p_best_positions[g_best_index]
    g_best_score = p_best_scores[g_best_index]
    
    w, c1, c2 = 0.7, 1.4, 1.4
    
    for iter in range(max_iter):
        print(f"\n🔁 Iteration {iter + 1}/{max_iter}")
        for i in range(num_particles):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)
            
            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (p_best_positions[i] - particles[i])
                + c2 * r2 * (g_best_position - particles[i])
            )
            
            particles[i] += velocities[i]
            particles[i] = np.clip(particles[i], bounds_array[:, 0], bounds_array[:, 1])
            
            score = fitness_function(particles[i])
            
            if score < p_best_scores[i]:
                p_best_scores[i] = score
                p_best_positions[i] = particles[i]
                
            if score < g_best_score:
                g_best_score = score
                g_best_position = particles[i]
                print(f"✅ Cập nhật g_best: PRED(25) = {-g_best_score:.4f}%")
    
    return g_best_position, g_best_score

# Chạy PSO
print("🚀 Chạy PSO để tìm siêu tham số tối ưu...")
best_particle, best_score = run_pso_lstm(num_particles=10, max_iter=15)
best_params = decode_particle(best_particle)
print(f"🏆 Siêu tham số tốt nhất: {best_params}")
print(f"📉 PRED(25) tốt nhất: {-best_score:.4f}%")

# Huấn luyện mô hình tối ưu
model_optimal = build_lstm_model(**{k: v for k, v in best_params.items() if k != 'batch_size' and k != 'epochs'})
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
pred25_scores_optimal = []
history = None

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n📂 Fold {fold + 1}/5")
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    history = model_optimal.fit(X_tr, y_tr, epochs=best_params['epochs'], batch_size=best_params['batch_size'], 
                               validation_data=(X_val, y_val), verbose=0, callbacks=[early_stopping, reduce_lr])
    y_pred = model_optimal.predict(X_val, verbose=0).flatten()
    pred25 = calculate_pred25(y_val, y_pred)
    pred25_scores_optimal.append(pred25)
    print(f"✅ Fold {fold + 1} PRED(25): {pred25:.4f}%")

print(f"\n📊 PRED(25) trung bình qua 5 folds: {np.mean(pred25_scores_optimal):.4f}%")

# Đánh giá trên tập test
y_pred = model_optimal.predict(X_test, verbose=0).flatten()

# Chuyển ngược về thang gốc
y_test_orig = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
y_pred_orig = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

# Tính các chỉ số đánh giá
mse = mean_squared_error(y_test_orig, y_pred_orig)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_orig, y_pred_orig)
r2 = r2_score(y_test_orig, y_pred_orig)
mape = calculate_mape(y_test_orig, y_pred_orig)
mmre = calculate_mmre(y_test_orig, y_pred_orig)
mdmre = calculate_mdmre(y_test_orig, y_pred_orig)
pred25 = calculate_pred25(y_test_orig, y_pred_orig)

# Đánh giá bootstrap
n_bootstraps = 500
bootstrap_metrics = {'mse': [], 'mae': [], 'r2': [], 'mape': [], 'mmre': [], 'mdmre': [], 'pred25': []}

for _ in range(n_bootstraps):
    indices = np.random.choice(len(y_test_orig), len(y_test_orig), replace=True)
    y_test_boot = y_test_orig[indices]
    y_pred_boot = y_pred_orig[indices]
    bootstrap_metrics['mse'].append(mean_squared_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['mae'].append(mean_absolute_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['r2'].append(r2_score(y_test_boot, y_pred_boot))
    bootstrap_metrics['mape'].append(calculate_mape(y_test_boot, y_pred_boot))
    bootstrap_metrics['mmre'].append(calculate_mmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['mdmre'].append(calculate_mdmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['pred25'].append(calculate_pred25(y_test_boot, y_pred_boot))

# In kết quả
print("\n📈 Kết quả đánh giá bootstrap (thang gốc):")
print(f"📌 MSE     : {np.mean(bootstrap_metrics['mse']):.4f} ± {np.std(bootstrap_metrics['mse']):.4f}")
print(f"📌 RMSE    : {np.mean(np.sqrt(bootstrap_metrics['mse'])):.4f} ± {np.std(np.sqrt(bootstrap_metrics['mse'])):.4f}")
print(f"📌 MAE     : {np.mean(bootstrap_metrics['mae']):.4f} ± {np.std(bootstrap_metrics['mae']):.4f}")
print(f"📌 R²      : {np.mean(bootstrap_metrics['r2']):.4f} ± {np.std(bootstrap_metrics['r2']):.4f}")
print(f"📌 MAPE    : {np.mean(bootstrap_metrics['mape']):.2f}% ± {np.std(bootstrap_metrics['mape']):.2f}%")
print(f"📌 MMRE    : {np.mean(bootstrap_metrics['mmre']):.4f} ± {np.std(bootstrap_metrics['mmre']):.4f}")
print(f"📌 MdMRE   : {np.mean(bootstrap_metrics['mdmre']):.4f} ± {np.std(bootstrap_metrics['mdmre']):.4f}")
print(f"📌 PRED(25): {np.mean(bootstrap_metrics['pred25']):.2f}% ± {np.std(bootstrap_metrics['pred25']):.2f}%")

# Lưu kết quả đánh giá
results = {
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'R2 NRC': r2,
    'MAPE': mape,
    'MMRE': mmre,
    'MdMRE': mdmre,
    'PRED(25)': pred25,
    'Bootstrap_MSE_Mean': np.mean(bootstrap_metrics['mse']),
    'Bootstrap_MSE_Std': np.std(bootstrap_metrics['mse']),
    'Bootstrap_MAE_Mean': np.mean(bootstrap_metrics['mae']),
    'Bootstrap_MAE_Std': np.std(bootstrap_metrics['mae']),
    'Bootstrap_R2_Mean': np.mean(bootstrap_metrics['r2']),
    'Bootstrap_R2_Std': np.std(bootstrap_metrics['r2']),
    'Bootstrap_MAPE_Mean': np.mean(bootstrap_metrics['mape']),
    'Bootstrap_MAPE_Std': np.std(bootstrap_metrics['mape']),
    'Bootstrap_MMRE_Mean': np.mean(bootstrap_metrics['mmre']),
    'Bootstrap_MMRE_Std': np.std(bootstrap_metrics['mmre']),
    'Bootstrap_MdMRE_Mean': np.mean(bootstrap_metrics['mdmre']),
    'Bootstrap_MdMRE_Std': np.std(bootstrap_metrics['mdmre']),
    'Bootstrap_PRED25_Mean': np.mean(bootstrap_metrics['pred25']),
    'Bootstrap_PRED25_Std': np.std(bootstrap_metrics['pred25'])
}

results_df = pd.DataFrame([results])
results_df.to_csv('lstm_evaluation_results.csv', index=False)
print("\nĐã lưu kết quả đánh giá vào 'lstm_evaluation_results.csv'")

# Trực quan hóa kết quả
plt.figure(figsize=(15, 12))

# Loss
plt.subplot(2, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()

# Predicted vs Actual
plt.subplot(2, 2, 2)
plt.scatter(y_test_orig, y_pred_orig, alpha=0.5)
plt.plot([y_test_orig.min(), y_test_orig.max()], [y_test_orig.min(), y_test_orig.max()], 'r--')
plt.title('Predicted vs Actual Effort (Original Scale)')
plt.xlabel('Actual Effort')
plt.ylabel('Predicted Effort')

# Error Distribution
errors = y_test_orig - y_pred_orig
plt.subplot(2, 2, 3)
sns.histplot(errors, kde=True)
plt.title('Error Distribution')
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')

# Bootstrap PRED(25)
plt.subplot(2, 2, 4)
sns.boxplot(y=bootstrap_metrics['pred25'])
plt.title('Bootstrap PRED(25) Distribution')
plt.ylabel('PRED(25) (%)')

plt.tight_layout()
plt.savefig('lstm_visualization_results.png')
plt.close()
print("\nĐã lưu hình ảnh trực quan hóa vào 'lstm_visualization_results.png'")

=== Kiểm tra dữ liệu ===
Kích thước ban đầu: (24, 7)
Các cột: ['Input', 'Output', 'Inquiry', 'File', 'FPAdj', 'RawFPcounts', 'Effort']
Mẫu 5 hàng đầu tiên:
      Input    Output   Inquiry      File     FPAdj  RawFPcounts    Effort
0 -0.544978  2.676655  2.425098  2.235928  0.078752     2.030029  1.905014
1  2.060336  1.583672  2.425098  1.423982  0.078752     2.030029  1.905014
2  1.835512 -0.602296 -1.141891 -0.347538 -1.433293    -0.158757 -0.364603
3  0.248519  0.413717  0.417448 -0.347538  1.212786     0.236318  0.476385
4 -1.338474  0.690812 -1.063924 -0.568978 -0.677270    -0.336098  1.123946

=== Kích thước dữ liệu sau tăng cường và reshape ===
X_augmented shape: (72, 1, 6)
y_augmented shape: (72,)

✅ Kích thước dữ liệu LSTM:
 - X_train: (57, 1, 6)
 - X_test : (15, 1, 6)
🚀 Chạy PSO để tìm siêu tham số tối ưu...

🔁 Iteration 1/15
✅ Cập nhật g_best: PRED(25) = 77.2727%
✅ Cập nhật g_best: PRED(25) = 80.9091%

🔁 Iteration 2/15
✅ Cập nhật g_best: PRED(25) = 82.8788%

🔁 Iteration 3/15

# RFBN

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Layer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.cluster import KMeans
from uuid import uuid4

# Thiết lập seed để tái lập
np.random.seed(42)
tf.random.set_seed(42)

# Hàm tính các chỉ số đánh giá
def calculate_mape(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_mmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.mean(np.abs(((y_true[mask] - y_pred[mask]) / y_true[mask])))

def calculate_mdmre(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_pred25(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return np.nan
    mre = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(mre <= 0.25) * 100

# Định nghĩa tầng RBF tùy chỉnh
class RBFLayer(Layer):
    def __init__(self, n_centers, centers=None, sigma=1.0, **kwargs):
        super(RBFLayer, self).__init__(**kwargs)
        self.n_centers = n_centers
        self.centers = centers  # Trung tâm được truyền từ KMeans
        self.sigma = sigma

    def build(self, input_shape):
        if self.centers is None:
            # Nếu không có trung tâm, khởi tạo ngẫu nhiên
            self.centers = self.add_weight(name='centers',
                                           shape=(self.n_centers, input_shape[-1]),
                                           initializer='uniform',
                                           trainable=False)
        else:
            # Sử dụng trung tâm từ KMeans
            self.centers = self.add_weight(name='centers',
                                           shape=(self.n_centers, input_shape[-1]),
                                           initializer=tf.keras.initializers.Constant(self.centers),
                                           trainable=False)
        self.sigma = self.add_weight(name='sigma',
                                     shape=(),
                                     initializer=tf.keras.initializers.Constant(self.sigma),
                                     trainable=True)
        super(RBFLayer, self).build(input_shape)

    def call(self, inputs):
        diff = tf.expand_dims(inputs, axis=1) - self.centers  # (samples, n_centers, features)
        l2 = tf.reduce_sum(tf.square(diff), axis=-1)  # (samples, n_centers)
        return tf.exp(-l2 / (2.0 * tf.square(self.sigma)))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.n_centers)

    def get_config(self):
        config = super(RBFLayer, self).get_config()
        config.update({
            'n_centers': self.n_centers,
            'centers': self.centers.numpy() if isinstance(self.centers, tf.Variable) else self.centers,
            'sigma': self.sigma.numpy() if isinstance(self.sigma, tf.Variable) else self.sigma
        })
        return config

# Đọc dữ liệu đã tiền xử lý
df = pd.read_csv('albrecht_cleaned.csv')

# Kiểm tra dữ liệu
print("=== Kiểm tra dữ liệu ===")
print("Kích thước:", df.shape)
print("Các cột:", df.columns.tolist())
print("Mẫu 5 hàng đầu tiên:")
print(df.head())
print("\nThông tin dữ liệu:")
print(df.info())

# Chọn đặc trưng và biến mục tiêu
features = [col for col in df.columns if col not in ['Project', 'Effort', 'Effort_log']]
X = df[features].values
y = df['Effort'].values  # Sử dụng Effort đã được scale từ file test.csv

# Tăng cường dữ liệu bằng nhiễu Gaussian
def add_gaussian_noise(X, noise_factor=0.05):
    noise = np.random.normal(loc=0, scale=noise_factor, size=X.shape)
    return X + noise

X_augmented = X.copy()
y_augmented = y.copy()
for _ in range(2):  # Tạo thêm 2 bản sao với nhiễu
    X_noisy = add_gaussian_noise(X, noise_factor=0.05)
    X_augmented = np.vstack((X_augmented, X_noisy))
    y_augmented = np.hstack((y_augmented, y))

print("\n=== Sau khi tăng cường dữ liệu bằng nhiễu Gaussian ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Không cần reshape cho RBFN vì dữ liệu là 2D
print("\n=== Kích thước dữ liệu sau tăng cường ===")
print("X_augmented shape:", X_augmented.shape)
print("y_augmented shape:", y_augmented.shape)

# Chia tập train/test
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.15, random_state=42)

print(f"\n✅ Kích thước dữ liệu RBFN:")
print(f" - X_train: {X_train.shape}")
print(f" - X_test : {X_test.shape}")

# Xây dựng mô hình RBFN
def build_rbfn_model(n_centers=10, sigma=1.0, l2_reg=0.01, learning_rate=0.001):
    # Đảm bảo l2_reg không âm và sigma dương
    l2_reg = max(l2_reg, 0.001)
    sigma = max(sigma, 0.1)
    
    # Sử dụng KMeans để khởi tạo centers
    kmeans = KMeans(n_clusters=int(n_centers), random_state=42).fit(X_train)
    centers = kmeans.cluster_centers_
    
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        RBFLayer(n_centers, centers=centers, sigma=sigma),
        Dense(1, activation='linear', kernel_regularizer=l2(l2_reg))
    ])
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    return model

# Không gian siêu tham số cho RBFN
param_bounds = {
    'n_centers': (5, 50),  # Số trung tâm (nơ-ron ẩn)
    'sigma': (0.1, 5.0),  # Độ rộng của hàm Gaussian
    'l2_reg': (0.001, 0.1),
    'learning_rate': (1e-4, 1e-2),
    'batch_size': (16, 64),
    'epochs': (50, 150)
}

# Hàm mã hóa & giải mã particle
def random_particle():
    return np.array([
        np.random.randint(param_bounds['n_centers'][0], param_bounds['n_centers'][1] + 1),
        np.random.uniform(param_bounds['sigma'][0], param_bounds['sigma'][1]),
        np.random.uniform(param_bounds['l2_reg'][0], param_bounds['l2_reg'][1]),
        np.random.uniform(param_bounds['learning_rate'][0], param_bounds['learning_rate'][1]),
        np.random.randint(param_bounds['batch_size'][0], param_bounds['batch_size'][1] + 1),
        np.random.randint(param_bounds['epochs'][0], param_bounds['epochs'][1] + 1)
    ])

def decode_particle(particle):
    params = {
        'n_centers': int(particle[0]),
        'sigma': particle[1],
        'l2_reg': particle[2],
        'learning_rate': particle[3],
        'batch_size': int(particle[4]),
        'epochs': int(particle[5])
    }
    # Đảm bảo l2_reg không âm và sigma dương
    params['l2_reg'] = max(params['l2_reg'], 0.001)
    params['l2_reg'] = min(params['l2_reg'], param_bounds['l2_reg'][1])
    params['sigma'] = max(params['sigma'], 0.1)  # Sigma phải dương
    return params

# Hàm fitness cho PSO
def fitness_function(particle):
    params = decode_particle(particle)
    model = build_rbfn_model(**{k: v for k, v in params.items() if k != 'batch_size' and k != 'epochs'})
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        
        model.fit(X_tr, y_tr, epochs=params['epochs'], batch_size=params['batch_size'], 
                  validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
        y_pred = model.predict(X_val, verbose=0)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

# Triển khai PSO
def run_pso_rbfn(num_particles=10, max_iter=10):
    dim = len(param_bounds)
    bounds_array = np.array(list(param_bounds.values()))
    
    particles = [random_particle() for _ in range(num_particles)]
    velocities = [np.zeros(dim) for _ in range(num_particles)]
    
    p_best_positions = particles.copy()
    p_best_scores = [fitness_function(p) for p in particles]
    
    g_best_index = np.argmin(p_best_scores)
    g_best_position = p_best_positions[g_best_index]
    g_best_score = p_best_scores[g_best_index]
    
    w, c1, c2 = 0.5, 1.5, 1.5
    
    for iter in range(max_iter):
        print(f"\n🔁 Iteration {iter + 1}/{max_iter}")
        for i in range(num_particles):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)
            
            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (p_best_positions[i] - particles[i])
                + c2 * r2 * (g_best_position - particles[i])
            )
            
            particles[i] += velocities[i]
            particles[i] = np.clip(particles[i], bounds_array[:, 0], bounds_array[:, 1])
            # Đảm bảo l2_reg không âm và sigma dương
            particles[i][1] = max(particles[i][1], param_bounds['sigma'][0])  # sigma
            particles[i][1] = min(particles[i][1], param_bounds['sigma'][1])
            particles[i][2] = max(particles[i][2], param_bounds['l2_reg'][0])  # l2_reg
            particles[i][2] = min(particles[i][2], param_bounds['l2_reg'][1])
            
            score = fitness_function(particles[i])
            
            if score < p_best_scores[i]:
                p_best_scores[i] = score
                p_best_positions[i] = particles[i]
                
            if score < g_best_score:
                g_best_score = score
                g_best_position = particles[i]
                print(f"✅ Cập nhật g_best: Score = {g_best_score:.4f}")
    
    return g_best_position, g_best_score

# Chạy PSO
print("🚀 Chạy PSO để tìm siêu tham số tối ưu...")
best_particle, best_score = run_pso_rbfn(num_particles=10, max_iter=15)
best_params = decode_particle(best_particle)
print(f"🏆 Siêu tham số tốt nhất: {best_params}")
print(f"📉 Score tốt nhất: {best_score:.4f}")

# Huấn luyện mô hình tối ưu
model_optimal = build_rbfn_model(**{k: v for k, v in best_params.items() if k != 'batch_size' and k != 'epochs'})
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores_optimal = []
history = None

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n📂 Fold {fold + 1}/5")
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    history = model_optimal.fit(X_tr, y_tr, epochs=best_params['epochs'], batch_size=best_params['batch_size'], 
                               validation_split=0.2, verbose=0, callbacks=[early_stopping, reduce_lr])
    y_pred = model_optimal.predict(X_val, verbose=0)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores_optimal.append(rmse)
    print(f"✅ Fold {fold + 1} RMSE: {rmse:.4f}")

print(f"\n📊 RMSE trung bình qua 5 folds: {np.mean(rmse_scores_optimal):.4f}")

# Đánh giá trên tập test
y_pred = model_optimal.predict(X_test, verbose=0).flatten()

# Tính các chỉ số đánh giá trên giá trị đã scale
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = calculate_mape(y_test, y_pred)
mmre = calculate_mmre(y_test, y_pred)
mdmre = calculate_mdmre(y_test, y_pred)
pred25 = calculate_pred25(y_test, y_pred)

# Đánh giá bootstrap
n_bootstraps = 500
bootstrap_metrics = {'mse': [], 'mae': [], 'r2': [], 'mape': [], 'mmre': [], 'mdmre': [], 'pred25': []}

for _ in range(n_bootstraps):
    indices = np.random.choice(len(y_test), len(y_test), replace=True)
    y_test_boot = y_test[indices]
    y_pred_boot = y_pred[indices]
    bootstrap_metrics['mse'].append(mean_squared_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['mae'].append(mean_absolute_error(y_test_boot, y_pred_boot))
    bootstrap_metrics['r2'].append(r2_score(y_test_boot, y_pred_boot))
    bootstrap_metrics['mape'].append(calculate_mape(y_test_boot, y_pred_boot))
    bootstrap_metrics['mmre'].append(calculate_mmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['mdmre'].append(calculate_mdmre(y_test_boot, y_pred_boot))
    bootstrap_metrics['pred25'].append(calculate_pred25(y_test_boot, y_pred_boot))

# In kết quả
print("\n📈 Kết quả đánh giá bootstrap (trên giá trị đã scale):")
print(f"📌 MSE     : {np.mean(bootstrap_metrics['mse']):.4f} ± {np.std(bootstrap_metrics['mse']):.4f}")
print(f"📌 RMSE    : {np.mean(np.sqrt(bootstrap_metrics['mse'])):.4f} ± {np.std(np.sqrt(bootstrap_metrics['mse'])):.4f}")
print(f"📌 MAE     : {np.mean(bootstrap_metrics['mae']):.4f} ± {np.std(bootstrap_metrics['mae']):.4f}")
print(f"📌 R²      : {np.mean(bootstrap_metrics['r2']):.4f} ± {np.std(bootstrap_metrics['r2']):.4f}")
print(f"📌 MAPE    : {np.mean(bootstrap_metrics['mape']):.2f}% ± {np.std(bootstrap_metrics['mape']):.2f}%")
print(f"📌 MMRE    : {np.mean(bootstrap_metrics['mmre']):.4f} ± {np.std(bootstrap_metrics['mmre']):.4f}")
print(f"📌 MdMRE   : {np.mean(bootstrap_metrics['mdmre']):.4f} ± {np.std(bootstrap_metrics['mdmre']):.4f}")
print(f"📌 PRED(25): {np.mean(bootstrap_metrics['pred25']):.2f}% ± {np.std(bootstrap_metrics['pred25']):.2f}%")

# Lưu kết quả đánh giá
results = {
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'R2': r2,
    'MAPE': mape,
    'MMRE': mmre,
    'MdMRE': mdmre,
    'PRED(25)': pred25,
    'Bootstrap_MSE_Mean': np.mean(bootstrap_metrics['mse']),
    'Bootstrap_MSE_Std': np.std(bootstrap_metrics['mse']),
    'Bootstrap_MAE_Mean': np.mean(bootstrap_metrics['mae']),
    'Bootstrap_MAE_Std': np.std(bootstrap_metrics['mae']),
    'Bootstrap_R2_Mean': np.mean(bootstrap_metrics['r2']),
    'Bootstrap_R2_Std': np.std(bootstrap_metrics['r2']),
    'Bootstrap_MAPE_Mean': np.mean(bootstrap_metrics['mape']),
    'Bootstrap_MAPE_Std': np.std(bootstrap_metrics['mape']),
    'Bootstrap_MMRE_Mean': np.mean(bootstrap_metrics['mmre']),
    'Bootstrap_MMRE_Std': np.std(bootstrap_metrics['mmre']),
    'Bootstrap_MdMRE_Mean': np.mean(bootstrap_metrics['mdmre']),
    'Bootstrap_MdMRE_Std': np.std(bootstrap_metrics['mdmre']),
    'Bootstrap_PRED25_Mean': np.mean(bootstrap_metrics['pred25']),
    'Bootstrap_PRED25_Std': np.std(bootstrap_metrics['pred25'])
}

results_df = pd.DataFrame([results])
results_df.to_csv('rbfn_evaluation_results_scaled.csv', index=False)
print("\nĐã lưu kết quả đánh giá vào 'rbfn_evaluation_results_scaled.csv'")

# Trực quan hóa kết quả
plt.figure(figsize=(15, 12))

# Loss
plt.subplot(2, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Huber Loss')
plt.legend()

# Predicted vs Actual
plt.subplot(2, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.title('Predicted vs Actual Effort (Scaled)')
plt.xlabel('Actual Effort (Scaled)')
plt.ylabel('Predicted Effort (Scaled)')

# Error Distribution
errors = y_test - y_pred
plt.subplot(2, 2, 3)
sns.histplot(errors, kde=True)
plt.title('Error Distribution')
plt.xlabel('Prediction Error (Scaled)')
plt.ylabel('Frequency')

# Bootstrap RMSE
plt.subplot(2, 2, 4)
sns.boxplot(y=np.sqrt(bootstrap_metrics['mse']))
plt.title('Bootstrap RMSE Distribution (Scaled)')
plt.ylabel('RMSE (Scaled)')

plt.tight_layout()
plt.savefig('rbfn_visualization_results_scaled.png')
plt.close()
print("\nĐã lưu hình ảnh trực quan hóa vào 'rbfn_visualization_results_scaled.png'")

=== Kiểm tra dữ liệu ===
Kích thước: (24, 7)
Các cột: ['Input', 'Output', 'Inquiry', 'File', 'FPAdj', 'RawFPcounts', 'Effort']
Mẫu 5 hàng đầu tiên:
      Input    Output   Inquiry      File     FPAdj  RawFPcounts    Effort
0 -0.544978  2.676655  2.425098  2.235928  0.078752     2.030029  1.905014
1  2.060336  1.583672  2.425098  1.423982  0.078752     2.030029  1.905014
2  1.835512 -0.602296 -1.141891 -0.347538 -1.433293    -0.158757 -0.364603
3  0.248519  0.413717  0.417448 -0.347538  1.212786     0.236318  0.476385
4 -1.338474  0.690812 -1.063924 -0.568978 -0.677270    -0.336098  1.123946

Thông tin dữ liệu:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Input        24 non-null     float64
 1   Output       24 non-null     float64
 2   Inquiry      24 non-null     float64
 3   File         24 non-null     float64
 4   FPAdj        24 non-nul