# 03 - Create Clean Modeling Dataset

**Objective**: Create a clean modeling dataset without data leakage.

## Processing Steps:

1. Load the data and keep only rows that have a `desc` value
2. Remove all POST-LOAN features (prevent data leakage)
3. Remove features with coverage < 80% (quality control)
4. Remove METADATA features (except `desc` - kept for OCEAN extraction)
5. Create the target variable (Fully Paid vs. Charged Off)
6. Save the clean dataset
7. Generate a quality report

In [None]:
# Notebook-wide imports and display configuration.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress all warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# Display settings: show every column, cap printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries loaded successfully！")

## Step 1: Load Data

In [None]:
# Load the raw loan file.
print("Loading raw data loan.csv...")
df = pd.read_csv('../../data/loan.csv', low_memory=False)
print(f"Raw data: {df.shape[0]:,} rows × {df.shape[1]} columns")

# Keep only rows with a usable description.
# The notna() guard matters: astype(str) turns NaN into the string 'nan',
# which would otherwise pass the length check.
print("\nFilter data with desc...")
df_with_desc = df[
    df['desc'].notna()
    & (df['desc'].astype(str).str.strip().str.len() > 1)
].copy()

print(f"Data with desc: {df_with_desc.shape[0]:,} rows × {df_with_desc.shape[1]} columns")
print(f"Coverage: {len(df_with_desc)/len(df)*100:.2f}%")

# Drop the reference to the raw frame; only the filtered copy is used from here on.
del df

print("\nData loading completed！")

## Step 2: Define Feature Classification

Based on the analysis results from `02_feature_selection_and_leakage_check.ipynb`.

In [None]:
# POST-LOAN features (must be deleted - they cause data leakage).
post_loan_features = [
    # Payment related (generated after the loan is issued)
    'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
    'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low',
    # Hardship and debt settlement (post-loan events)
    'hardship_flag', 'hardship_type', 'hardship_reason',
    'hardship_status', 'hardship_start_date', 'hardship_end_date',
    'hardship_loan_status', 'hardship_dpd', 'hardship_length',
    'hardship_amount', 'hardship_payoff_balance_amount',
    'deferral_term', 'payment_plan_start_date',
    'debt_settlement_flag', 'debt_settlement_flag_date',
    'settlement_status', 'settlement_date', 'settlement_amount',
    'settlement_percentage', 'settlement_term',
    # Other post-loan info
    'pymnt_plan',
    'initial_list_status',  # policy file date is post-approval
    'policy_code',  # internal policy code
]

# METADATA features (no predictive value - deleted, but 'desc' is kept).
metadata_features = [
    'id', 'member_id', 'url',  # ID fields
    'funded_amnt_inv',  # duplicate of funded_amnt for investors
    # Note: 'desc' is deliberately absent; it is kept for OCEAN extraction.
]

# OUTCOME feature (source of the target variable - handled separately).
outcome_features = [
    'loan_status',  # used to create the target variable
]

print(f"POST-LOAN feature count: {len(post_loan_features)}")
print(f"METADATA feature count: {len(metadata_features)}")
print(f"OUTCOME feature count: {len(outcome_features)}")
print(f"\nKept 'desc' field for subsequent OCEAN feature extraction")

## Step 3: Analyze Feature Coverage

In [None]:
# Compute the non-null coverage of every column, classify each feature, and
# record the keep/delete decision in a report saved to CSV.
print("calculatefeatureCoverage...\n")

# Minimum percentage of non-null values a PRE-LOAN feature needs to be kept.
MIN_COVERAGE_PCT = 80

coverage_stats = []

for col in df_with_desc.columns:
    non_null = df_with_desc[col].notna().sum()
    coverage = (non_null / len(df_with_desc)) * 100

    # Classification priority: leakage > metadata > outcome > text > pre-loan.
    if col in post_loan_features:
        feature_type = 'POST-LOAN'
        keep_status = ' DELETE (Leakage)'
    elif col in metadata_features:
        feature_type = 'METADATA'
        keep_status = ' DELETE (No value)'
    elif col in outcome_features:
        feature_type = 'OUTCOME'
        keep_status = 'WARNING: SPECIAL (Target)'
    elif col == 'desc':
        feature_type = 'TEXT'
        keep_status = ' KEEP (For OCEAN)'
    else:
        feature_type = 'PRE-LOAN'
        # Only PRE-LOAN features are subject to the coverage threshold.
        if coverage >= MIN_COVERAGE_PCT:
            keep_status = ' KEEP (Good quality)'
        else:
            keep_status = f' DELETE (Coverage {coverage:.1f}% < {MIN_COVERAGE_PCT}%)'

    coverage_stats.append({
        'Feature': col,
        'Type': feature_type,
        'Coverage%': f"{coverage:.2f}",
        'Non_Null': f"{non_null:,}",
        'Decision': keep_status,
        'dtype': str(df_with_desc[col].dtype),
    })

coverage_df = pd.DataFrame(coverage_stats)
# 'Coverage%' is stored as text for display; add a numeric copy for sorting.
coverage_df['coverage_numeric'] = coverage_df['Coverage%'].astype(float)
coverage_df = coverage_df.sort_values('coverage_numeric', ascending=False)

print("featureCoverageanalysiscompleted！")
print(f"\nTotal features: {len(coverage_df)}")

# Feature counts per type.
# Note: "\n" + "=" * 80 prints one blank line followed by an 80-character rule
# ("\n=" * 80 would instead print 80 separate lines of a single "=").
print("\n" + "=" * 80)
print("Feature Type Statistics")
print("=" * 80)
print(coverage_df['Type'].value_counts())

print("\n" + "=" * 80)
print("Processing Decision Statistics")
print("=" * 80)
keep_count = coverage_df['Decision'].str.contains('KEEP').sum()
delete_count = coverage_df['Decision'].str.contains('DELETE').sum()
special_count = coverage_df['Decision'].str.contains('SPECIAL').sum()

print(f"Keptfeature: {keep_count}")
print(f"DELETEfeature: {delete_count}")
print(f"Special handling: {special_count}")

# Persist the coverage report.
coverage_df.to_csv('../../feature_coverage_report.csv', index=False)
print("\nCoverage save: feature_coverage_report.csv")

## Step 4: Review Features to Be Deleted

In [None]:
# List every feature whose decision contains 'DELETE'.
delete_features = coverage_df[coverage_df['Decision'].str.contains('DELETE')]

print("=" * 80)
print(f"to beDELETE feature (Total {len(delete_features)} )")
print("=" * 80)
print(delete_features[['Feature', 'Type', 'Coverage%', 'Decision']].to_string(index=False))

# Break the deletions down by reason.
# "\n" + "=" * 80 prints a blank line and one 80-character rule.
print("\n" + "=" * 80)
print("DELETE analysis")
print("=" * 80)

leakage_delete = delete_features[delete_features['Type'] == 'POST-LOAN']
metadata_delete = delete_features[delete_features['Type'] == 'METADATA']
# A PRE-LOAN feature is only ever marked DELETE for low coverage, so the type
# filter alone is sufficient. Re-checking the 2-decimal 'Coverage%' string
# against 80 would miss a feature at e.g. 79.996% (displayed as "80.00").
quality_delete = delete_features[delete_features['Type'] == 'PRE-LOAN']

print(f" dataLeakageDELETE (POST-LOAN): {len(leakage_delete)} ")
print(f" DELETE (METADATA): {len(metadata_delete)} ")
print(f" DELETE (Coverage<80%): {len(quality_delete)} ")

## Step 5: Delete Features and Create the Clean Dataset

In [None]:
# Collect the names of all features marked for deletion.
features_to_delete = delete_features['Feature'].tolist()

print(f"startDELETE {len(features_to_delete)} feature...\n")
# Shape before the drop.
print(f"DELETE Data shape: {df_with_desc.shape}")

# Drop the columns; errors='ignore' skips any name not present in the frame.
df_clean = df_with_desc.drop(columns=features_to_delete, errors='ignore')

# Shape after the drop.
print(f"DELETE Data shape: {df_clean.shape}")
print(f"Keptfeature : {df_clean.shape[1]}")
print(f"\nKept feature :")
print(f"- desc (for OCEAN extraction)")
print(f"- loan_status ( createObjectivevariable)")
# Everything except desc and loan_status is a PRE-LOAN feature.
print(f"- {df_clean.shape[1] - 2} PRE-LOAN feature")

## Step 6: Create the Target Variable

In [None]:
# Inspect the raw loan_status distribution before building the target.
print("=" * 80)
print("loan_status distribution")
print("=" * 80)
print(df_clean['loan_status'].value_counts())
print(f"\nTotal: {len(df_clean):,} records")

# Binary target encoding: 1 = Charged Off, 0 = Fully Paid.
print("\ncreateObjectivevariable...")

# Broader status groupings, kept for reference only: the filter below does not
# use them and keeps just the two final outcomes.
fully_paid_statuses = ['Fully Paid', 'Current', 'In Grace Period']
charged_off_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'Late (16-30 days)']

# Keep only loans whose status is exactly 'Fully Paid' or 'Charged Off'.
df_clean_binary = df_clean[
    df_clean['loan_status'].isin(['Fully Paid', 'Charged Off'])
].copy()

print(f"\nData count after filtering: {len(df_clean_binary):,} ")
print("\nloan_status distribution ( ):")
print(df_clean_binary['loan_status'].value_counts())

# Build the target column.
df_clean_binary['target'] = (df_clean_binary['loan_status'] == 'Charged Off').astype(int)

# "\n" + "=" * 80 prints a blank line and one 80-character rule.
print("\n" + "=" * 80)
print("Objectivevariable ")
print("=" * 80)
print(f"target = 0 (Fully Paid): {(df_clean_binary['target']==0).sum():,}")
print(f"target = 1 (Charged Off): {(df_clean_binary['target']==1).sum():,}")
print(f"\nCharged Off : {df_clean_binary['target'].mean()*100:.2f}%")

# loan_status is now encoded in 'target'; drop it so it cannot leak into the features.
df_clean_binary = df_clean_binary.drop(columns=['loan_status'])

print(f"\nFinal data shape: {df_clean_binary.shape}")

## Step 7: Data Type Analysis

In [None]:
# Split the kept features into numeric and categorical groups by dtype.
print("=" * 80)
print("Keptfeature dataType ")
print("=" * 80)

numeric_features = df_clean_binary.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df_clean_binary.select_dtypes(include=['object']).columns.tolist()

# 'target' is the label and 'desc' is free text; neither belongs in the feature lists.
if 'target' in numeric_features:
    numeric_features.remove('target')
if 'desc' in categorical_features:
    categorical_features.remove('desc')

print(f"\nNumeric features: {len(numeric_features)} ")
print(f"Categorical features: {len(categorical_features)} ")
print(f"Text features: 1 (desc)")
print(f"Objectivevariable: 1 (target)")

# "\n" + "=" * 80 prints a blank line and one 80-character rule
# ("\n=" * 80 would print 80 separate lines of a single "=").
print("\n" + "=" * 80)
print("Numeric featurescolumns ")
print("=" * 80)
for i, feat in enumerate(numeric_features, 1):
    print(f"{i:3d}. {feat}")

print("\n" + "=" * 80)
print("Categorical featurescolumns ")
print("=" * 80)
for i, feat in enumerate(categorical_features, 1):
    print(f"{i:2d}. {feat}")

## Step 8: Save Clean Dataset

In [None]:
# Write the clean modeling dataset and the feature-name lists to disk.
import json
import os

output_file = '../../data/loan_clean_for_modeling.csv'

print(f"Save Clean Dataset : {output_file}")
df_clean_binary.to_csv(output_file, index=False)

# Report the size of the written file in megabytes.
file_size = os.path.getsize(output_file) / (1024 * 1024)  # MB
print(f"\nFile size: {file_size:.2f} MB")
print(f"Data shape: {df_clean_binary.shape[0]:,} rows × {df_clean_binary.shape[1]} columns")

# Save the feature lists as JSON.
feature_lists = {
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'text_feature': ['desc'],
    'target': ['target'],
}

# Explicit encoding keeps the output independent of the platform default.
with open('../../feature_lists_clean.json', 'w', encoding='utf-8') as f:
    json.dump(feature_lists, f, indent=2)

print("\nfeaturecolumns save: feature_lists_clean.json")
print("\n Data cleaning completed！")

## Step 9: Data Quality Summary Report

In [None]:
# Print the end-of-notebook summary: size changes, feature deletions,
# target balance, remaining missing values, and the follow-up notebooks.

# Dimensions of the raw loan file. They are hard-coded because the raw frame
# is deleted right after filtering, so they must match the file that was loaded.
RAW_ROW_COUNT = 2260668
RAW_COL_COUNT = 145

final_rows = len(df_clean_binary)
paid_count = (df_clean_binary['target'] == 0).sum()
charged_count = (df_clean_binary['target'] == 1).sum()

print("=" * 80)
print("Data Cleaning Summary Report")
print("=" * 80)

# --- 1. Row/column counts at each stage ---
print("\n1. Dataset Size Changes")
print("-" * 80)
print(f"Raw data : {RAW_ROW_COUNT:,} rows × {RAW_COL_COUNT} columns")
print(f"Data with desc: {len(df_with_desc):,} rows × {len(df_with_desc.columns)} columns")
print(f"Final modeling data: {df_clean_binary.shape[0]:,} rows × {df_clean_binary.shape[1]} columns")
print(f"dataKept : {final_rows/RAW_ROW_COUNT*100:.2f}%")

# --- 2. What was deleted and what remains ---
print("\n2. featureDELETE ")
print("-" * 80)
print(f"Raw feature count: {RAW_COL_COUNT}")
print(f"DELETE feature : {len(features_to_delete)}")
print(f" - POST-LOAN (Prevent leakage): {len(leakage_delete)}")
print(f" - METADATA (No value): {len(metadata_delete)}")
print(f" - Low quality (Coverage<80%): {len(quality_delete)}")
print(f"Kept feature : {df_clean_binary.shape[1]}")
print(f" - Numeric: {len(numeric_features)}")
print(f" - Type: {len(categorical_features)}")
print(" - Text: 1 (desc)")
print(" - Objectivevariable: 1 (target)")

# --- 3. Class balance of the target ---
print("\n3. Objectivevariable ")
print("-" * 80)
print(f"Fully Paid (target=0): {paid_count:,} ({paid_count/final_rows*100:.2f}%)")
print(f"Charged Off (target=1): {charged_count:,} ({charged_count/final_rows*100:.2f}%)")
print(f"Charged Off : {df_clean_binary['target'].mean()*100:.2f}%")

# --- 4. Remaining missing values per feature ---
print("\n4. Data Quality Check")
print("-" * 80)

missing_report = []
for name in df_clean_binary.columns:
    # The label and the free-text column are not part of the quality check.
    if name in ('target', 'desc'):
        continue
    pct_missing = (df_clean_binary[name].isna().sum() / len(df_clean_binary)) * 100
    if pct_missing > 0:
        missing_report.append({'feature': name, 'Missing_%': f"{pct_missing:.2f}"})

if not missing_report:
    print("All features have no missing values ")
else:
    # Sort on a temporary numeric copy of the formatted percentage, then discard it.
    missing_df = (
        pd.DataFrame(missing_report)
        .assign(missing_numeric=lambda frame: frame['Missing_%'].astype(float))
        .sort_values('missing_numeric', ascending=False)
        .drop(columns='missing_numeric')
    )
    print(f"Features with missing values: {len(missing_report)} ")
    print("\n 10feature:")
    print(missing_df.head(10).to_string(index=False))

# --- 5. Follow-up notebooks ---
print("\n5. rows ")
print("-" * 80)
next_step_lines = (
    " Data cleaning completed， rows :",
    "",
    "1. 04_xgboost_baseline.ipynb",
    " - Train XGBoost baseline model with clean data",
    " - Without OCEAN features",
    " - Establish performance baseline",
    "",
    "2. 05_ocean_feature_extraction.ipynb",
    " - Extract OCEAN personality features from desc field",
    " - Note train/test leakage",
    "",
    "3. 06_xgboost_with_ocean.ipynb",
    " - Train complete model with OCEAN features",
    " - Compare with baseline model performance",
    "",
)
for text in next_step_lines:
    print(text)
print("=" * 80)

## Step 10: Data Overview Visualization

In [None]:
# Build a 2x2 overview figure of the cleaning results and save it as a PNG.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Target variable distribution.
ax1 = axes[0, 0]
# value_counts() orders by frequency, not by label. Reindex to [0, 1] so the
# counts always line up with the hard-coded bar labels below, even if
# charged-off loans are the majority or one class is absent.
target_counts = df_clean_binary['target'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#2ecc71', '#e74c3c']
ax1.bar(['Fully Paid\n(target=0)', 'Charged Off\n(target=1)'],
        target_counts.values, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)
for i, v in enumerate(target_counts.values):
    ax1.text(i, v, f'{v:,}\n({v/len(df_clean_binary)*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

# 2. Feature type distribution.
ax2 = axes[0, 1]
feature_type_counts = [len(numeric_features), len(categorical_features), 1]
feature_types = ['Numeric', 'Categorical', 'Text']
colors2 = ['#3498db', '#9b59b6', '#f39c12']
wedges, texts, autotexts = ax2.pie(feature_type_counts, labels=feature_types,
                                   autopct='%1.1f%%', colors=colors2,
                                   startangle=90, textprops={'fontweight': 'bold'})
ax2.set_title('Feature Type Distribution', fontsize=14, fontweight='bold')

# 3. Feature deletion reasons versus kept columns.
ax3 = axes[1, 0]
delete_reasons = ['POST-LOAN\n(Leakage)', 'METADATA\n(No value)',
                  'Low Quality\n(Coverage<80%)', 'Kept']
delete_counts = [len(leakage_delete), len(metadata_delete),
                 len(quality_delete), df_clean_binary.shape[1]]
colors3 = ['#e74c3c', '#95a5a6', '#e67e22', '#2ecc71']
bars = ax3.barh(delete_reasons, delete_counts, color=colors3, alpha=0.7, edgecolor='black')
ax3.set_xlabel('Count', fontsize=12, fontweight='bold')
ax3.set_title('Feature Retention Analysis', fontsize=14, fontweight='bold')
ax3.grid(axis='x', alpha=0.3)
for bar, count in zip(bars, delete_counts):
    ax3.text(count, bar.get_y() + bar.get_height()/2, f' {count}',
             va='center', fontweight='bold')

# 4. Dataset size changes.
ax4 = axes[1, 1]
data_stages = ['Original\nDataset', 'With\ndesc', 'Clean\nBinary']
# The raw row count is hard-coded because the raw frame is deleted after loading.
data_counts = [2260668, len(df_with_desc), len(df_clean_binary)]
colors4 = ['#34495e', '#3498db', '#2ecc71']
bars = ax4.bar(data_stages, data_counts, color=colors4, alpha=0.7, edgecolor='black')
ax4.set_ylabel('Number of Rows', fontsize=12, fontweight='bold')
ax4.set_title('Dataset Size Changes', fontsize=14, fontweight='bold')
ax4.grid(axis='y', alpha=0.3)
for bar, count in zip(bars, data_counts):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height, f'{count:,}',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('../../data_cleaning_summary.png', dpi=300, bbox_inches='tight')
print("\nVisualization saved: data_cleaning_summary.png")
plt.show()