In [3]:
!pip install xgboost
!pip install tensorflow
# Import lại các thư viện cần thiết
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2
import joblib
import tensorflow as tf
from scipy.stats import mode

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/fe/df/e3a1f3f008db8d2b199ded2168014c7784b8027714b74d802c892815fd72/xgboost-2.1.2-py3-none-win_amd64.whl.metadata
  Using cached xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 325.1 kB/s eta 0:06:25
   ---------------------------------------- 0.0/124.9 MB 279.3 kB/s eta 0:07:28
   ---------------------------------------- 0.1/124.9 MB 476.3 kB/s eta 0:04:23
   ---------------------------------------- 0.1/124.9 MB 655.8 kB/s eta 0:03:11
   ---------------------------------------- 0.2/124.9 MB 697.2 kB/s eta 0:02:59
   ---------------------------------------- 0.2/124.9 MB 697.2 kB/s eta 0:02:59
   ---------------------

In [4]:
# Load lại bộ dữ liệu
file_path = r'C:\Users\ADMIN\Documents\AWS_2024\Loan.csv'
data = pd.read_csv(file_path)

In [5]:
# Mã hóa các biến phân loại sử dụng LabelEncoder
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
original_columns = data.columns
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Chia dữ liệu thành phần đặc trưng (X) và nhãn mục tiêu (y)
X = data.drop(columns=['LoanApproved'])
y = data['LoanApproved']

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chuẩn hóa dữ liệu (Standardization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Áp dụng PCA để giảm chiều dữ liệu (giữ lại 90% phương sai)
pca = PCA(n_components=0.90)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Áp dụng Random Under Sampling để xử lý mất cân bằng dữ liệu
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train_pca, y_train)

# Kết hợp với Tomek Links để làm sạch dữ liệu
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_resampled_rus, y_resampled_rus)

# Thiết lập mô hình Logistic Regression, Random Forest và XGBoost để so sánh
rf_model = RandomForestClassifier(random_state=42, max_depth=10, min_samples_leaf=4)

# Thiết lập k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Thiết lập các tham số để tinh chỉnh cho Random Forest
rf_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

xgb_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1]
}

# Sử dụng RandomizedSearchCV để tìm kiếm tham số tốt nhất cho Random Forest
rf_random_search = RandomizedSearchCV(rf_model, rf_param_grid, n_iter=10, cv=cv, random_state=42, n_jobs=-1, verbose=2)
rf_random_search.fit(X_resampled, y_resampled)

# Đánh giá mô hình tốt nhất trên tập kiểm tra
print("Random Forest Classification Report:")
y_pred_rf = rf_random_search.best_estimator_.predict(X_test_pca)
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Accuracy Score:", accuracy_score(y_test, y_pred_rf))

# Tạo mô hình LSTM với điều chỉnh để giảm overfitting
X_resampled_lstm = X_resampled.reshape((X_resampled.shape[0], 1, X_resampled.shape[1]))
X_test_lstm = X_test_pca.reshape((X_test_pca.shape[0], 1, X_test_pca.shape[1]))

lstm_model = Sequential()
lstm_model.add(Input(shape=(1, X_resampled.shape[1])))
lstm_model.add(LSTM(8, return_sequences=False, kernel_regularizer=l2(0.1)))  # Giảm số neurons và thêm L2 regularization lớn hơn
lstm_model.add(BatchNormalization())  # Thêm Batch Normalization để ổn định mạng
lstm_model.add(Dropout(0.7))  # Tăng Dropout để giảm overfitting
lstm_model.add(Dense(4, activation='relu', kernel_regularizer=l2(0.1)))  # Thêm lớp Dense với L2 regularization lớn hơn
lstm_model.add(Dense(1, activation='sigmoid'))

# Compile mô hình LSTM
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Huấn luyện mô hình LSTM với số epoch giảm và early stopping nghiêm ngặt hơn
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_lstm_model.keras', save_best_only=True)

lstm_model.fit(X_resampled_lstm, y_resampled, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

# Đánh giá trên tập kiểm tra
y_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5).astype(int)
print("LSTM Classification Report:")
print(classification_report(y_test, y_pred_lstm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lstm))
print("Accuracy Score:", accuracy_score(y_test, y_pred_lstm))

# Lưu các mô hình để sử dụng
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(rf_random_search.best_estimator_, 'rf_random_search.best_estimator_.pkl')
lstm_model.save('best_lstm_model.keras')

# Tạo giao diện chỉ nhập các giá trị sau khi áp dụng PCA
inputs = {}
for i in range(X_train_pca.shape[1]):
    inputs[original_columns[i]] = widgets.FloatText(value=0.0, description=original_columns[i])

predict_button = widgets.Button(description='Predict')
output = widgets.Output()

# Định nghĩa hàm dự đoán
def on_predict_clicked(b):
    with output:
        clear_output()
        input_data = np.array([inputs[original_columns[i]].value for i in range(X_train_pca.shape[1])]).reshape(1, -1)

        # Dự đoán với các mô hình
        rf_random_search.best_estimator_ = joblib.load('rf_random_search.best_estimator_.pkl')
        lstm_model = tf.keras.models.load_model('best_lstm_model.keras')

        input_lstm = input_data.reshape((input_data.shape[0], 1, input_data.shape[1]))

        y_pred_input_rf = rf_random_search.best_estimator_.predict(input_data)
        y_pred_input_lstm = lstm_model.predict(input_lstm)
        y_pred_input_lstm_class = (y_pred_input_lstm > 0.5).astype(int)

        # Ensemble kết quả từ các mô hình
        y_input_preds = np.array([y_pred_input_rf, y_pred_input_lstm_class.ravel()])
        y_input_ensemble_pred = mode(y_input_preds, axis=0).mode.ravel()
        result = 'Approved' if y_input_ensemble_pred[0] == 1 else 'Not Approved'

        print(f'Ensemble Model Prediction: {result}')

predict_button.on_click(on_predict_clicked)

# Hiển thị giao diện
input_widgets = widgets.VBox(list(inputs.values()) + [predict_button, output])
display(input_widgets)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2983
           1       0.84      0.94      0.89      1017

    accuracy                           0.94      4000
   macro avg       0.91      0.94      0.92      4000
weighted avg       0.94      0.94      0.94      4000

Confusion Matrix:
[[2802  181]
 [  58  959]]
Accuracy Score: 0.94025
Epoch 1/10
[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5988 - loss: 2.8058 - val_accuracy: 0.6748 - val_loss: 0.9825
Epoch 2/10
[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8388 - loss: 0.6975 - val_accuracy: 0.9512 - val_loss: 0.4789
Epoch 3/10
[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8941 - loss: 0.4098 - val_accuracy: 0.9492 - val_loss: 0.3402
Epoch 4/10
[1m369/3

VBox(children=(FloatText(value=0.0, description='ApplicationDate'), FloatText(value=0.0, description='Age'), F…

In [6]:
# Hiển thị thông tin mã hóa từ LabelEncoder
for col in categorical_columns:
    print(f"Mã hóa cho cột '{col}':")
    for index, class_label in enumerate(label_encoders[col].classes_):
        print(f"  {class_label} -> {index}")

Mã hóa cho cột 'ApplicationDate':
  2018-01-01 -> 0
  2018-01-02 -> 1
  2018-01-03 -> 2
  2018-01-04 -> 3
  2018-01-05 -> 4
  2018-01-06 -> 5
  2018-01-07 -> 6
  2018-01-08 -> 7
  2018-01-09 -> 8
  2018-01-10 -> 9
  2018-01-11 -> 10
  2018-01-12 -> 11
  2018-01-13 -> 12
  2018-01-14 -> 13
  2018-01-15 -> 14
  2018-01-16 -> 15
  2018-01-17 -> 16
  2018-01-18 -> 17
  2018-01-19 -> 18
  2018-01-20 -> 19
  2018-01-21 -> 20
  2018-01-22 -> 21
  2018-01-23 -> 22
  2018-01-24 -> 23
  2018-01-25 -> 24
  2018-01-26 -> 25
  2018-01-27 -> 26
  2018-01-28 -> 27
  2018-01-29 -> 28
  2018-01-30 -> 29
  2018-01-31 -> 30
  2018-02-01 -> 31
  2018-02-02 -> 32
  2018-02-03 -> 33
  2018-02-04 -> 34
  2018-02-05 -> 35
  2018-02-06 -> 36
  2018-02-07 -> 37
  2018-02-08 -> 38
  2018-02-09 -> 39
  2018-02-10 -> 40
  2018-02-11 -> 41
  2018-02-12 -> 42
  2018-02-13 -> 43
  2018-02-14 -> 44
  2018-02-15 -> 45
  2018-02-16 -> 46
  2018-02-17 -> 47
  2018-02-18 -> 48
  2018-02-19 -> 49
  2018-02-20 -> 50
  2018-

In [7]:
import joblib
import json

# Lưu scaler, PCA, và mô hình
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(rf_random_search.best_estimator_, 'rf_model.pkl')
lstm_model.save('lstm_model.keras')  # Sử dụng định dạng hiện đại thay vì HDF5

# Lưu thông tin LabelEncoder
label_encoder_mapping = {}
for col, le in label_encoders.items():
    label_encoder_mapping[col] = {str(key): int(value) for key, value in zip(le.classes_, le.transform(le.classes_))}

with open('label_encoders.json', 'w') as f:
    json.dump(label_encoder_mapping, f)

# Lưu thông tin về đặc trưng
with open('feature_info.json', 'w') as f:
    json.dump(list(X.columns), f)

print("Lưu tất cả thông tin thành công!")

Lưu tất cả thông tin thành công!


In [8]:
!pip install gradio
import gradio as gr
import numpy as np
import joblib
import json
import tensorflow as tf

# Load các mô hình và scaler
scaler = joblib.load('scaler.pkl')
pca = joblib.load('pca.pkl')
rf_model = joblib.load('rf_model.pkl')
lstm_model = tf.keras.models.load_model('lstm_model.keras', compile=False)

with open('label_encoders.json', 'r') as f:
    label_encoder_mapping = json.load(f)

with open('feature_info.json', 'r') as f:
    features = json.load(f)

# Hàm dự đoán
def predict(*inputs):
    input_data = []
    for value, feature in zip(inputs, features):
        if feature in label_encoder_mapping:
            value = label_encoder_mapping[feature][value]  # Mã hóa giá trị phân loại
        input_data.append(value)

    input_data = np.array(input_data).reshape(1, -1)
    input_scaled = scaler.transform(input_data)
    input_pca = pca.transform(input_scaled)
    input_lstm = input_pca.reshape((input_pca.shape[0], 1, input_pca.shape[1]))

    # Dự đoán
    pred_rf = rf_model.predict(input_pca)[0]
    pred_lstm = (lstm_model.predict(input_lstm) > 0.5).astype(int)[0][0]

    # Ensemble
    final_prediction = "Có khả năng hoàn trả" if (pred_rf + pred_lstm) >= 1 else "Không có khả năng hoàn trả"
    return final_prediction

# Tạo giao diện với gradio
inputs = [
    gr.Dropdown(choices=list(label_encoder_mapping[feature].keys()), label=feature) if feature in label_encoder_mapping else gr.Number(label=feature)
    for feature in features
]

interface = gr.Interface(fn=predict, inputs=inputs, outputs="text", title="Dự đoán khả năng hoàn trả")
interface.launch()

Collecting gradio
  Obtaining dependency information for gradio from https://files.pythonhosted.org/packages/19/c6/bfab4a8c332e7c1f0c6c73213335a4980b516508a3900ce34a5d0400ae0b/gradio-5.6.0-py3-none-any.whl.metadata
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Obtaining dependency information for aiofiles<24.0,>=22.0 from https://files.pythonhosted.org/packages/c5/19/5af6804c4cc0fed83f47bff6e413a98a36618e7d40185cd36e69737f3b0e/aiofiles-23.2.1-py3-none-any.whl.metadata
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Obtaining dependency information for fastapi<1.0,>=0.115.2 from https://files.pythonhosted.org/packages/54/c4/148d5046a96c428464557264877ae5a9338a83bbe0df045088749ec89820/fastapi-0.115.5-py3-none-any.whl.metadata
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Obtaining dependency information for ffm

