# Data Cleaning: Extract Loans with Description

## Objectives
1. Extract all rows whose `desc` (description) field is populated from loan.csv
2. Analyze the quality of the quantitative variables
3. Save the cleaned data as loan_with_desc.csv
4. Generate a data quality report

---

In [6]:
# Cell 1: Import libraries and settings
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Pandas display configuration: show all columns, cap the number of rows,
# and render floats with two decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot appearance shared by every figure in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirm the environment so the run can be reproduced later.
print(
    " Libraries imported successfully",
    f"Pandas version: {pd.__version__}",
    f"Numpy version: {np.__version__}",
    sep="\n",
)

 Libraries imported successfully
Pandas version: 2.0.1
Numpy version: 1.24.3


In [7]:
# Cell 2: Load data 
# 

In [8]:
# Read the complete raw loan file into memory in one pass.
print(
    "Loading loan.csv...",
    "(This may take 1-2 minutes, dataset is about 1.1GB)\n",
    sep="\n",
)
df = pd.read_csv('../../data/loan.csv', low_memory=False)

# Headline facts about the raw frame: shape and deep memory footprint.
raw_row_count, raw_column_count = df.shape
raw_memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("="*80, "Raw Dataset Basic Information", "="*80, sep="\n")
print(f"Total rows: {raw_row_count:,}")
print(f"Total columns: {raw_column_count}")
print(f"\nMemory usage: {raw_memory_mb:.2f} MB")
print("\nPreview first 5 rows:")
df.head()

Loading loan.csv...
(This may take 1-2 minutes, dataset is about 1.1GB)

Raw Dataset Basic Information
Total rows: 2,260,668
Total columns: 145

Memory usage: 5942.30 MB

Preview first 5 rows:


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_d
ate,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0.0,Apr-2001,1.0,,45.0,9.0,1.0,4341,10.3,34.0,w,2386.02,2386.02,167.02,167.02,113.98,53.04,0.0,0.0,0.0,Feb-2019,84.92,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,16901.0,2.0,2.0,1.0,2.0,2.0,12560.0,69.0,2.0,7.0,2137.0,28.0,42000.0,1.0,11.0,2.0,9.0,1878.0,34360.0,5.9,0.0,0.0,140.0,212.0,1.0,1.0,0.0,1.0,,2.0,,0.0,2.0,5.0,3.0,3.0,16.0,7.0,18.0,5.0,9.0,0.0,0.0,0.0,3.0,100.0,0.0,1.0,0.0,60124.0,16901.0,36500.0,18124.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0.0,Jun-1987,0.0,71.0,75.0,13.0,1.0,12315,24.2,44.0,w,29387.75,29387.75,1507.11,1507.11,612.25,894.86,0.0,0.0,0.0,Feb-2019,777.23,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,1208.0,321915.0,4.0,4.0,2.0,3.0,3.0,87153.0,88.0,4.0,5.0,998.0,57.0,50800.0,2.0,15.0,2.0,10.0,24763.0,13761.0,8.3,0.0,0.0,163.0,378.0,4.0,3.0,3.0,4.0,,4.0,,0.0,2.0,4.0,4.0,9.0,27.0,8.0,14.0,4.0,13.0,0.0,0.0,0.0,6.0,95.0,0.0,1.0,0.0,372872.0,99468.0,15000.0,94072.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0.0,Apr-2011,0.0,,,8.0,0.0,4599,19.1,13.0,w,4787.21,4787.21,353.89,353.89,212.79,141.1,0.0,0.0,0.0,Feb-2019,180.69,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,110299.0,0.0,1.0,0.0,2.0,14.0,7150.0,72.0,0.0,2.0,0.0,35.0,24100.0,1.0,5.0,0.0,4.0,18383.0,13800.0,0.0,0.0,0.0,87.0,92.0,15.0,14.0,2.0,77.0,,14.0,,0.0,0.0,3.0,3.0,3.0,4.0,6.0,7.0,3.0,8.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,136927.0,11749.0,13800.0,10000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,985xx,WA,16.74,0.0,Feb-2006,0.0,,,10.0,0.0,5468,78.1,13.0,w,3831.93,3831.93,286.71,286.71,168.07,118.64,0.0,0.0,0.0,Feb-2019,146.51,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,686.0,305049.0,1.0,5.0,3.0,5.0,5.0,30683.0,68.0,0.0,0.0,3761.0,70.0,7000.0,2.0,4.0,3.0,5.0,30505.0,1239.0,75.2,0.0,0.0,62.0,154.0,64.0,5.0,3.0,64.0,,5.0,,0.0,1.0,2.0,1.0,2.0,7.0,2.0,3.0,2.0,10.0,0.0,0.0,0.0,3.0,100.0,100.0,0.0,0.0,385183.0,36151.0,5000.0,44984.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,212xx,MD,26.35,0.0,Dec-2000,0.0,,,12.0,0.0,829,3.6,26.0,w,29339.02,29339.02,1423.21,1423.21,660.98,762.23,0.0,0.0,0.0,Feb-2019,731.78,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,116007.0,3.0,5.0,3.0,5.0,4.0,28845.0,89.0,2.0,4.0,516.0,54.0,23100.0,1.0,0.0,0.0,9.0,9667.0,8471.0,8.9,0.0,0.0,53.0,216.0,2.0,2.0,2.0,2.0,,13.0,,0.0,2.0,2.0,3.0,8.0,9.0,6.0,15.0,2.0,12.0,0.0,0.0,0.0,5.0,92.3,0.0,0.0,0.0,157548.0,29674.0,9300.0,32332.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [None]:
# Cell 3: Filter data with desc
print("="*80, "Step 1: Filter data with desc", "="*80, sep="\n")

# How many rows carry any description value at all?
desc_is_present = df['desc'].notna()
print(f"\nNon-null count in desc column: {desc_is_present.sum():,}")
print(f"Null count in desc column: {df['desc'].isna().sum():,}")

# Keep rows whose description, once surrounding whitespace is removed,
# is longer than one character. The result is an independent copy.
stripped_desc_length = df['desc'].astype(str).str.strip().str.len()
df_with_desc = df.loc[desc_is_present & (stripped_desc_length > 1)].copy()

kept_rows = len(df_with_desc)
print(f"\n Filtered dataset:")
print(f" - Rows: {kept_rows:,}")
print(f" - Percentage of original dataset: {kept_rows/len(df)*100:.2f}%")
print(f" - Columns: {df_with_desc.shape[1]} (unchanged)")

# Length statistics of the retained descriptions (unstripped text).
desc_lengths = df_with_desc['desc'].astype(str).str.len()
print(f"\ndesc text length statistics:")
print(f" - Average length: {desc_lengths.mean():.0f} characters")
print(f" - Median length: {desc_lengths.median():.0f} characters")
print(f" - Min: {desc_lengths.min()} characters")
print(f" - Max: {desc_lengths.max()} characters")

# Show the first three descriptions, truncated to 200 characters each.
print("\nSample descriptions (first 3):")
print("="*80)
for sample_no, raw_text in enumerate(df_with_desc['desc'].head(3), start=1):
    text = str(raw_text)
    if len(text) > 200:
        preview = text[:200] + "..."
    else:
        preview = text
    print(f"\n[Sample {sample_no}] (Length: {len(text)} characters)")
    print(preview)
print("-"*80)

In [None]:
# Cell 4: Identify and classify all variables
# Splits the columns of df_with_desc into numeric and categorical groups,
# converts percent-formatted text columns to numbers, and trims whitespace
# from the remaining text columns. df_with_desc is modified in place.
print("="*80)
print("Step 2: Variable classification and cleaning")
print("="*80)

# 1. Initial classification by pandas dtype
numeric_cols_raw = df_with_desc.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols_raw = df_with_desc.select_dtypes(include=['object']).columns.tolist()

print(f"\nInitial classification:")
print(f" - Numeric variables: {len(numeric_cols_raw)} ")
print(f" - Categorical variables: {len(categorical_cols_raw)} ")

# 2. Convert percent-formatted text columns (e.g. "13.56%") to numeric.
#    Only columns still stored as object are touched; values that cannot be
#    parsed become NaN because of errors='coerce'.
print(f"\n characters ...")
percent_cols = ['int_rate', 'revol_util', 'sec_app_revol_util']
cleaned_count = 0

for col in percent_cols:
    if col in df_with_desc.columns and df_with_desc[col].dtype == object:
        df_with_desc[col] = pd.to_numeric(
            df_with_desc[col].astype(str).str.strip().str.rstrip('%'),
            errors='coerce'
        )
        cleaned_count += 1
        print(f" {col}: Converted to numeric")

print(f"\nCleaned {cleaned_count} ")

# 3. Trim surrounding whitespace from categorical (object) columns.
#    astype(str) would turn every missing value into the literal string
#    'nan', which makes the column look fully populated to notna(). Only
#    real values are stripped; missing entries are restored as NaN.
print(f"\n Categorical variables ...")
for col in df_with_desc.select_dtypes(include='object').columns:
    column = df_with_desc[col]
    stripped = column.astype(str).str.strip()
    df_with_desc[col] = stripped.where(column.notna())
print(f" Done")

# 4. Reclassify after cleaning (converted percent columns are now numeric)
numeric_features = df_with_desc.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df_with_desc.select_dtypes(include=['object']).columns.tolist()

print(f"\nFinal classification:")
print(f" - Numeric variables: {len(numeric_features)} ")
print(f" - Categorical variables: {len(categorical_features)} ")
print(f" - Total: {len(numeric_features) + len(categorical_features)} ")

In [None]:
# Cell 5: Numeric variablesquality analysis
print("="*80, "Step 3: Numeric variablesquality analysis", "="*80, sep="\n")


def _numeric_quality_row(frame, name):
    """Return one quality-report record for the numeric column ``name`` of ``frame``.

    The record holds the non-null and missing counts, the coverage as a
    percentage of all rows, and mean/std/min/max (NaN for all four when
    the column has no non-null value).
    """
    series = frame[name]
    present = series.notna().sum()
    absent = series.isna().sum()

    if present > 0:
        mean_val, std_val = series.mean(), series.std()
        min_val, max_val = series.min(), series.max()
    else:
        mean_val = std_val = min_val = max_val = np.nan

    return {
        'Variable': name,
        'Non_Null_Count': present,
        'Missing_Count': absent,
        'Coverage_%': (present / len(frame)) * 100,
        'Mean': mean_val,
        'Std_Dev': std_val,
        'Min': min_val,
        'Max': max_val
    }


# One record per numeric column, ordered from best to worst coverage.
quality_stats = [_numeric_quality_row(df_with_desc, col) for col in numeric_features]
quality_df = pd.DataFrame(quality_stats).sort_values('Coverage_%', ascending=False)

# Bucket the columns into four coverage tiers.
coverage_pct = quality_df['Coverage_%']
excellent = quality_df[coverage_pct >= 95]
good = quality_df[(coverage_pct >= 80) & (coverage_pct < 95)]
fair = quality_df[(coverage_pct >= 50) & (coverage_pct < 80)]
poor = quality_df[coverage_pct < 50]

print(f"\nData quality classification:")
print(f" Excellent (coverage >= 95%): {len(excellent)} variable")
print(f" Good (80% <= coverage < 95%): {len(good)} variable")
print(f" Fair (50% <= coverage < 80%): {len(fair)} variable")
print(f" Poor (coverage < 50%): {len(poor)} variable")

# Show the 20 best-covered numeric columns.
print(f"\n 20 Numeric variables:")
print("="*80)
report_columns = ['Variable', 'Non_Null_Count', 'Missing_Count', 'Coverage_%', 'Mean', 'Std_Dev']
display(quality_df[report_columns].head(20))

In [None]:
# Cell 6: check XGBoost keyfeature
# For every feature the XGBoost model is expected to use, record whether the
# column is present in df_with_desc and, if so, how well it is populated.
print("="*80)
print("Step 4: XGBoost model key features check")
print("="*80)

# Define key features used in XGBoost model
xgboost_key_features = [
    # Basic loan information
    "loan_amnt", "int_rate", "installment", "annual_inc", "dti",

    # Credit history
    "delinq_2yrs", "mths_since_last_delinq", "num_accts_ever_120_pd",

    # FICO
    "fico_range_low", "fico_range_high",

    # Inquiry records
    "inq_last_6mths", "mths_since_recent_inq",

    # Accounts
    "open_acc", "total_acc", "mort_acc", "pub_rec",

    # Revolving credit
    "revol_bal", "revol_util", "total_rev_hi_lim",

    # Installment
    "total_bal_il", "il_util", "all_util",

    # Other important features
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op",
    "num_rev_accts", "num_rev_tl_bal_gt_0", "num_actv_rev_tl",
    "pct_tl_nvr_dlq", "percent_bc_gt_75",
    "pub_rec_bankruptcies", "tax_liens"
]

# Check existence and quality of these features in the dataset
feature_check = []

for feature in xgboost_key_features:
    if feature in df_with_desc.columns:
        coverage = (df_with_desc[feature].notna().sum() / len(df_with_desc)) * 100
        status = " Available" if coverage >= 80 else "WARNING: Low quality"
        feature_check.append({
            'Feature': feature,
            # The summary cell selects available features with
            # `Exists == ''`, so present features keep the empty marker.
            'Exists': '',
            'Coverage_%': f"{coverage:.2f}%",
            'Status': status
        })
    else:
        feature_check.append({
            'Feature': feature,
            # Missing features need a marker that differs from the one used
            # for present features; otherwise the two groups cannot be told
            # apart and both counts below equal the full list length.
            'Exists': 'No',
            'Coverage_%': 'N/A',
            'Status': 'Missing from dataset'
        })

feature_check_df = pd.DataFrame(feature_check)

# Every feature is either available or missing, so derive one from the other.
available = int((feature_check_df['Exists'] == '').sum())
missing = len(feature_check_df) - available

print(f"\nXGBoost key features check results:")
print(f" - Availablefeature: {available}/{len(xgboost_key_features)}")
print(f" - Missing features: {missing}/{len(xgboost_key_features)}")

print(f"\nDetailed list:")
display(feature_check_df)

In [None]:
# Cell 7: Categorical variablesanalysis
print("="*80, "Step 5: Categorical variablesanalysis", "="*80, sep="\n")

# Build one summary record per categorical column: population, cardinality
# and the most frequent value with its count.
categorical_stats = []

for col in categorical_features:
    series = df_with_desc[col]
    non_null = series.notna().sum()
    unique_count = series.nunique()
    coverage = (non_null / len(df_with_desc)) * 100

    # Defaults apply when the column has no usable value.
    top_value, top_count = 'N/A', 0
    if non_null > 0 and unique_count > 0:
        frequencies = series.value_counts()
        top_value = frequencies.index[0]
        top_count = frequencies.values[0]

    categorical_stats.append({
        'Variable': col,
        'Non_Null_Count': non_null,
        'Coverage_%': f"{coverage:.2f}%",
        'Unique values': unique_count,
        'Top_Value': str(top_value)[:30],
        'Top_ValueCount': top_count
    })

categorical_df = pd.DataFrame(categorical_stats)

print(f"\nCategorical variables : {len(categorical_features)}")
print(f"\n 20Categorical variables:")
display(categorical_df.head(20))

# Categorical features used by the XGBoost model
xgb_categorical = [
    "term", "grade", "sub_grade", "emp_length", "home_ownership",
    "verification_status", "purpose", "application_type",
]

print(f"\nXGBoost categorical features details:")
print("="*80)
for col in xgb_categorical:
    if col not in df_with_desc.columns:
        continue
    print(f"\n{col}:")
    print(f" Unique values: {df_with_desc[col].nunique()}")
    print(f" Value distribution:")
    # Five most frequent values with their share of all rows.
    for val, count in df_with_desc[col].value_counts().head(5).items():
        pct = (count / len(df_with_desc)) * 100
        print(f" - {val}: {count:,} ({pct:.2f}%)")

In [None]:
# Cell 8: Objectivesvariable analysis
# Prints the distribution of loan_status and, when both terminal outcomes
# are present, the Fully Paid vs Charged Off split and imbalance ratio.
print("="*80)
print("Step 6: Objectivesvariable (loan_status) analysis")
print("="*80)

if 'loan_status' in df_with_desc.columns:
    print(f"\nloan_status distribution:")
    status_counts = df_with_desc['loan_status'].value_counts()

    # Share of each status among all retained rows.
    for status, count in status_counts.items():
        pct = (count / len(df_with_desc)) * 100
        print(f" - {status}: {count:,} ({pct:.2f}%)")

    # Check whether both Fully Paid and Charged Off are present
    if 'Fully Paid' in status_counts.index and 'Charged Off' in status_counts.index:
        fully_paid = status_counts['Fully Paid']
        charged_off = status_counts['Charged Off']
        # Percentages below are relative to these two outcomes only.
        total_relevant = fully_paid + charged_off

        print(f"\n Status:")
        print(f" - Fully Paid: {fully_paid:,} ({fully_paid/total_relevant*100:.2f}%)")
        print(f" - Charged Off: {charged_off:,} ({charged_off/total_relevant*100:.2f}%)")
        print(f" - Total: {total_relevant:,}")
        print(f"\nClass imbalance ratio: {fully_paid/charged_off:.2f}:1 (Fully Paid:Charged Off)")
else:
    # This branch runs when the column is absent, so the message must say
    # so (it previously claimed that loan_status exists).
    print("\nWARNING: loan_status column does not exist")

In [None]:
# Cell 9: Data quality visualization
# Draws a 2x2 summary figure (numeric coverage, coverage tiers, loan_status
# distribution, description lengths), saves it as a PNG and shows it.
print("="*80)
print("Step 7: Data quality visualization")
print("="*80)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Panel 1: coverage of the first 30 rows of quality_df
top_30_numeric = quality_df.head(30)
ax_coverage = axes[0, 0]
ax_coverage.barh(range(len(top_30_numeric)), top_30_numeric['Coverage_%'], color='steelblue')
ax_coverage.set_yticks(range(len(top_30_numeric)))
ax_coverage.set_yticklabels(top_30_numeric['Variable'], fontsize=8)
ax_coverage.set_xlabel('coverage (%)', fontsize=10)
ax_coverage.set_title('Top 30 Numeric variablescoverage', fontsize=12, fontweight='bold')
ax_coverage.axvline(x=80, color='red', linestyle='--', alpha=0.5, label='80% threshold')
ax_coverage.legend()
# Put the first row at the top of the horizontal bar chart.
ax_coverage.invert_yaxis()

# Panel 2: number of numeric variables in each coverage tier
quality_counts = [len(excellent), len(good), len(fair), len(poor)]
quality_labels = [
    f'Excellent (≥95%)\n{len(excellent)}',
    f'Good (80-95%)\n{len(good)}',
    f'Fair (50-80%)\n{len(fair)}',
    f'Poor (<50%)\n{len(poor)}'
]
colors = ['#2ecc71', '#3498db', '#f39c12', '#e74c3c']
ax_tiers = axes[0, 1]
ax_tiers.pie(quality_counts, labels=quality_labels, colors=colors,
             autopct='%1.1f%%', startangle=90)
ax_tiers.set_title('Numeric variables ', fontsize=12, fontweight='bold')

# Panel 3: loan_status distribution (only if the column exists)
ax_status = axes[1, 0]
if 'loan_status' in df_with_desc.columns:
    status_counts = df_with_desc['loan_status'].value_counts().head(10)
    ax_status.bar(range(len(status_counts)), status_counts.values, color='coral')
    ax_status.set_xticks(range(len(status_counts)))
    ax_status.set_xticklabels(status_counts.index, rotation=45, ha='right', fontsize=8)
    ax_status.set_ylabel('Count', fontsize=10)
    ax_status.set_title('Loan Status Distribution', fontsize=12, fontweight='bold')
    ax_status.grid(axis='y', alpha=0.3)
else:
    # Placeholder for a missing column; the text previously claimed that
    # loan_status exists.
    ax_status.text(0.5, 0.5, 'loan_status column not found',
                   ha='center', va='center', fontsize=12)
    ax_status.set_title('Loan Status Distribution', fontsize=12, fontweight='bold')

# Panel 4: distribution of description lengths, with mean and median markers
desc_lengths = df_with_desc['desc'].astype(str).str.len()
ax_lengths = axes[1, 1]
ax_lengths.hist(desc_lengths, bins=50, color='seagreen', edgecolor='black', alpha=0.7)
ax_lengths.axvline(desc_lengths.mean(), color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {desc_lengths.mean():.0f}')
ax_lengths.axvline(desc_lengths.median(), color='blue', linestyle='--', linewidth=2,
                   label=f'Median: {desc_lengths.median():.0f}')
ax_lengths.set_xlabel(' Length（characters ）', fontsize=10)
ax_lengths.set_ylabel('Frequency', fontsize=10)
ax_lengths.set_title('desc Length ', fontsize=12, fontweight='bold')
ax_lengths.legend()
ax_lengths.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../../data_quality_report.png', dpi=300, bbox_inches='tight')
print("\n Chart saved as data_quality_report.png")
plt.show()

In [None]:
# Cell 10: Save cleaned data
# Writes the filtered dataset and both quality reports to CSV. The path and
# size variables defined here are read again by the final summary cell.
print("="*80)
print("Step 8: Save cleaned data")
print("="*80)

output_file = '../../data/loan_with_desc.csv'
print(f"\nSaving data to {output_file}...")
print(f"( to be1-2 )\n")
df_with_desc.to_csv(output_file, index=False)

# Size of the file that was just written, in MB
file_size = os.path.getsize(output_file) / (1024 ** 2)
print(f" Data saved successfully!")
print(f"\nFile information:")
print(f" - Filename: {output_file}")
print(f" - File size: {file_size:.2f} MB")
print(f" - Rows: {len(df_with_desc):,}")
print(f" - Columns: {len(df_with_desc.columns)}")

# Save the numeric-variable quality report
quality_report_file = '../../numeric_features_quality_report.csv'
quality_df.to_csv(quality_report_file, index=False)
print(f"\n Numeric variables save: {quality_report_file}")

# Save the categorical-variable report
categorical_report_file = '../../categorical_features_report.csv'
categorical_df.to_csv(categorical_report_file, index=False)
print(f" Categorical variables save: {categorical_report_file}")

In [None]:
# Cell 11: Generate final data quality summary report
# Prints a text summary built from the results of the earlier cells:
# dataset sizes, variable tiers, XGBoost feature availability, description
# statistics and the files written by the save cell.
print("="*80)
print("Final Data Quality Summary Report")
print("="*80)


def _print_section(title):
    """Print ``title`` framed by 80-character rules, preceded by a blank line."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


_print_section("1. Dataset Overview")
print(f"Raw dataset: {len(df):,} × {len(df.columns)} ")
print(f"Cleaned dataset: {len(df_with_desc):,} × {len(df_with_desc.columns)} ")
print(f"Data retention rate: {len(df_with_desc)/len(df)*100:.2f}%")

_print_section("2. Variable Statistics")
print(f"Numeric variables: {len(numeric_features)} ")
print(f" - Excellent (coverage ≥ 95%): {len(excellent)} ")
print(f" - Good (80% ≤ coverage < 95%): {len(good)} ")
print(f" - Fair (50% ≤ coverage < 80%): {len(fair)} ")
print(f" - Poor (coverage < 50%): {len(poor)} ")
print(f"\nCategorical variables: {len(categorical_features)} ")

_print_section("3. XGBoost Model Readiness")
# Select available features by actual column membership. Filtering on the
# 'Exists' marker is unreliable because the feature-check cell writes the
# same marker for present and missing features, so it matches every row.
available_xgb_features = feature_check_df[
    feature_check_df['Feature'].isin(df_with_desc.columns)
]
print(f"key featureAvailable: {len(available_xgb_features)}/{len(xgboost_key_features)}")
print(f"\nRecommended high-quality features for XGBoost:")
# Numeric columns with at least 90% coverage, in quality_df order.
high_quality_features = quality_df[quality_df['Coverage_%'] >= 90]['Variable'].tolist()
print(f" - : {len(high_quality_features)} ")
print(f" - 20: {', '.join(high_quality_features[:20])}")

_print_section("4. OCEAN Feature Extraction Readiness")
desc_lengths = df_with_desc['desc'].astype(str).str.len()
print(f" desc Available OCEAN analysis")
print(f" - SampleCount: {len(df_with_desc):,}")
print(f" - Length: {desc_lengths.mean():.0f} characters")
print(f" - Median length: {desc_lengths.median():.0f} characters")
print(f" - Suitable for semantic analysis: ")

_print_section("5. Data File Outputs")
# These names are defined by the save cell, which must have run first.
print(f" {output_file} ({file_size:.2f} MB)")
print(f" {quality_report_file}")
print(f" {categorical_report_file}")
print(f" data_quality_report.png")

_print_section("6. Next Steps")
print(" 1. Use loan_with_desc.csv for OCEAN feature extraction")
print(" 2. Merge OCEAN features with existing numeric features")
print(" 3. Train XGBoost model with merged features")
print(" 4. Compare model performance with/without OCEAN features")

_print_section("data Done！")