# 1. Data Loading

In [5]:
import pandas as pd
train_dt = pd.read_csv('cityu10c_train_dataset.csv')
test_dt = pd.read_csv('cityu10c_test_dataset.csv')
train_dt.head()

Unnamed: 0,ID,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,1,2018-01-01,45,39948,617,Employed,Master,22,13152,48,...,19183,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0
1,2,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,...,9595,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0
2,3,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,...,128874,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0
3,4,2018-01-04,58,69084,545,Employed,High School,34,37898,96,...,5370,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0
4,5,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,...,17286,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1


# 2. Data Pre-processing

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

# Đọc dữ liệu
def load_data(train_path, test_path):
    train_dt = pd.read_csv(train_path)
    test_dt = pd.read_csv(test_path)
    return train_dt, test_dt

train_dt, test_dt = load_data('cityu10c_train_dataset.csv', 'cityu10c_test_dataset.csv')

# Hiển thị thông tin dữ liệu
print(train_dt.info())
print(train_dt.describe())
print(test_dt.info())
print(test_dt.describe())

# Bước 1: Xác định các cột có giá trị thiếu và loại bỏ cột không cần thiết
def preprocess_missing_values(df):
    df = df.copy()
    df.drop(columns=['ApplicationDate', 'ID'], inplace=True, errors='ignore')
    return df

# Bước 2: Xác định các cột phân loại có ít giá trị duy nhất
def get_categorical_candidates(df, threshold=15):
    unique_counts = df.nunique()
    categorical_candidates = unique_counts[unique_counts < threshold].index.tolist()
    return [col for col in categorical_candidates if col != 'ID']

# Xử lý dữ liệu
train_dt = preprocess_missing_values(train_dt)
train_target = train_dt.pop('LoanApproved')
test_dt = preprocess_missing_values(test_dt)

categorical_candidates = get_categorical_candidates(train_dt)

# Transformer cho dữ liệu phân loại
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Tạo ColumnTransformer để xử lý dữ liệu
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_candidates)
    ], remainder='passthrough'
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          8000 non-null   int64  
 1   ApplicationDate             8000 non-null   object 
 2   Age                         8000 non-null   int64  
 3   AnnualIncome                8000 non-null   int64  
 4   CreditScore                 8000 non-null   int64  
 5   EmploymentStatus            8000 non-null   object 
 6   EducationLevel              8000 non-null   object 
 7   Experience                  8000 non-null   int64  
 8   LoanAmount                  8000 non-null   int64  
 9   LoanDuration                8000 non-null   int64  
 10  MaritalStatus               8000 non-null   object 
 11  NumberOfDependents          8000 non-null   int64  
 12  HomeOwnershipStatus         8000 non-null   object 
 13  MonthlyDebtPayments         8000 

# 3. Model Training

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
import lightgbm as lgb
# Tạo pipeline hoàn chỉnh
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier())
])

# Tách tập huấn luyện và kiểm tra
X_train, X_val, y_train, y_val = train_test_split(train_dt, train_target, test_size=0.2, random_state=42)

# Thiết lập tham số cho GridSearch
param_grid = {
    'classifier__num_leaves': [31],
    'classifier__min_data_in_leaf': [5],
    'classifier__learning_rate': [0.12],
    'classifier__n_estimators': [500],
    'classifier__max_depth': [-1]
}

# Tối ưu mô hình bằng GridSearch
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1, refit=True)
grid_search.fit(X_train, y_train)

# Kiểm tra mô hình tốt nhất
best_model = grid_search.best_estimator_
print("Best parameters found:", grid_search.best_params_)

# Dự đoán trên tập kiểm tra
y_pred = best_model.predict(X_val)

# Đánh giá mô hình
f1 = f1_score(y_val, y_pred)
print(f'Optimized F1 Score: {f1:.4f}')

# Dự đoán trên tập kiểm tra thực tế
test_encoded = best_model.named_steps['preprocessor'].transform(test_dt)
test_predictions = best_model.predict(test_dt)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Number of positive: 1550, number of negative: 4850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4606
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242188 -> initscore=-1.140724
[LightGBM] [Info] Start training from score -1.140724
Best parameters found: {'classifier__learning_rate': 0.12, 'classifier__max_depth': -1, 'classifier__min_data_in_leaf': 5, 'classifier__n_estimators': 500, 'classifier__num_leaves': 31}
Optimized F1 Score: 0.8966




# 4. Model Testing

In [None]:
import joblib
# Kiểm tra xem best_model có phải là pipeline không
if isinstance(best_model, Pipeline):
    print("Model is a valid pipeline, saving...")
    joblib.dump(best_model, 'trained_pipeline.pkl')
    print("Pipeline saved successfully!")
else:
    print("Error! best_model is not a pipeline!")
    print(f"Type of best_model: {type(best_model)}")


✅ Model là pipeline hợp lệ, tiến hành lưu...
🎉 Pipeline đã được lưu thành công!
