In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Load datasets

In [None]:
# Read the three raw loan splits (train / validation / test) from CSV.
loans_train, loans_valid, loans_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/raw_data/loans_train.csv",
        "./data/raw_data/loans_valid.csv",
        "./data/raw_data/loans_test.csv",
    )
)

In [None]:
# Check how the column sets of the three splits line up.
cols_train = set(loans_train.columns)  # Training set
cols_valid = set(loans_valid.columns)  # Validation set
cols_test  = set(loans_test.columns)   # Test set

# Columns present in every split, and everything else in the union.
common_cols = cols_train & cols_valid & cols_test
uncommon    = (cols_train | cols_valid | cols_test) - common_cols

# Columns that exist in exactly one split. Subtracting the other two splits
# (rather than just `common_cols`) keeps a column shared by two splits from
# being reported as "only in" both of them.
only_train  = cols_train - cols_valid - cols_test
only_valid  = cols_valid - cols_train - cols_test
only_test   = cols_test  - cols_train - cols_valid

# Columns present in exactly two splits, mapped to the splits that have them,
# so nothing that is "uncommon" disappears from the report.
split_columns = {"train": cols_train, "valid": cols_valid, "test": cols_test}
in_two_splits = {
    col: [name for name, cols in split_columns.items() if col in cols]
    for col in sorted(uncommon - only_train - only_valid - only_test)
}

print("Number of common columns:", len(common_cols))
print("Common columns example:", sorted(common_cols)[:10], "...")
print("\nColumns only in train:", sorted(only_train))
print("Columns only in valid:", sorted(only_valid))
print("Columns only in test:", sorted(only_test))
print("Columns in exactly two splits:", in_two_splits)
print("\nAll uncommon columns:", sorted(uncommon))


### Data type check

In [None]:
# Convert date columns to datetime format in every split.
loan_frames = (loans_train, loans_valid, loans_test)

# FirstPaymentDate and MaturityDate: let pandas infer the format.
for col in ['FirstPaymentDate', 'MaturityDate']:
    for frame in loan_frames:
        # Check each split on its own: a column that exists in train but not
        # in valid/test would otherwise raise KeyError.
        if col in frame.columns:
            frame[col] = pd.to_datetime(frame[col])

# MonthlyReportingPeriod columns (YYYYMM format). Collect the matching names
# from all splits (order preserved, duplicates removed) so a column missing
# from train is still converted where it exists.
monthly_cols = list(dict.fromkeys(
    col
    for frame in loan_frames
    for col in frame.columns
    if 'MonthlyReportingPeriod' in col
))
for col in monthly_cols:
    for frame in loan_frames:
        if col in frame.columns:
            # NOTE(review): if a YYYYMM column contains NaN it is read as float
            # and parsing with '%Y%m' may fail — confirm against the data.
            frame[col] = pd.to_datetime(frame[col], format='%Y%m')

print('converted columns: FirstPaymentDate, MaturityDate, MonthlyReportingPeriod')

### Encode object (categorical) columns

In [None]:
# List every object-dtype column of the training split, least distinct values first.
print("=== All Object Columns (Sorted by Unique Values) ===")
object_cols = loans_train.select_dtypes(include=['object']).columns.tolist()
print(f"Found {len(object_cols)} object columns:")
print()

# (column name, number of distinct non-null values), ordered by the count.
# sorted() is stable, so ties keep their original column order.
col_info = sorted(
    ((name, loans_train[name].nunique()) for name in object_cols),
    key=lambda pair: pair[1],
)

for rank, (col, unique_count) in enumerate(col_info, start=1):
    print(f"{rank}. {col}")
    print(f"   Data type: {loans_train[col].dtype}")
    print(f"   Unique values: {unique_count}")
    print()

In [None]:
# One-hot encode the low-cardinality categorical columns in all three splits.
print("=== One-Hot Encoding for Categorical Columns ===")

# Categorical columns to expand into indicator columns.
one_hot_columns = [
    'PPM_Flag', 'ProductType', 'SuperConformingFlag', 'InterestOnlyFlag',
    'FirstTimeHomebuyerFlag', 'OccupancyStatus', 'Channel', 'LoanPurpose',
    'ProgramIndicator', 'BalloonIndicator', 'PropertyType'
]

print(f"Columns to be one-hot encoded: {len(one_hot_columns)}")
print("Columns:", one_hot_columns)
print()

# Remember the shapes before encoding for the summary at the end.
original_train_shape, original_valid_shape, original_test_shape = (
    loans_train.shape,
    loans_valid.shape,
    loans_test.shape,
)

# The frames are modified in place, always in train -> valid -> test order.
loan_frames = (loans_train, loans_valid, loans_test)

for col in one_hot_columns:
    # Skip columns that are absent from the training split.
    if col not in loans_train.columns:
        continue

    print(f"Processing {col}...")

    # Union of the non-null categories seen in any split, so every split
    # ends up with the same set of indicator columns.
    all_values = sorted(
        set().union(*(frame[col].dropna().unique() for frame in loan_frames))
    )

    print(f"  Unique values: {all_values}")

    # One 0/1 indicator column per category; NaN rows compare unequal and get 0.
    for value in all_values:
        new_col_name = f"{col}_{value}"
        for frame in loan_frames:
            frame[new_col_name] = (frame[col] == value).astype(int)

    # The source column is no longer needed once it has been expanded.
    for frame in loan_frames:
        frame.drop(columns=[col], inplace=True)

    print(f"  Created {len(all_values)} one-hot columns")
    print()

print("=== One-Hot Encoding Complete ===")
print(f"Original shape - Train: {original_train_shape}, Valid: {original_valid_shape}, Test: {original_test_shape}")
print(f"New shape - Train: {loans_train.shape}, Valid: {loans_valid.shape}, Test: {loans_test.shape}")
print(f"New features added: {loans_train.shape[1] - original_train_shape[1]}")

### Missing data

***Drop columns with 100% missing data***

In [None]:
# Missing-data overview for the training split.

# 1. Per-column missing counts and percentages, worst columns first.
missing_summary = loans_train.isnull().sum().sort_values(ascending=False)
missing_pct = (missing_summary / len(loans_train)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_summary,
    'Missing_Percentage': missing_pct
})

print(f"Total features: {len(loans_train.columns)}")
print(f"Features with missing data: {missing_summary.gt(0).sum()}")
print(f"Features with complete data: {missing_summary.eq(0).sum()}")

# The 20 columns with the most missing values.
print("Top 20 features with most missing data:")
print(missing_df.head(20).to_string())

# 2. Split the columns into time-series and static groups.
# Time-series columns carry a "<N>_" prefix with N in 0..13.
month_prefixes = tuple(f"{month}_" for month in range(14))
non_feature_cols = ['index', 'target']

time_series_features = [
    name for name in loans_train.columns if name.startswith(month_prefixes)
]
static_features = [
    name for name in loans_train.columns
    if not name.startswith(month_prefixes) and name not in non_feature_cols
]

print(f"Static features: {len(static_features)}")
print(f"Time series features: {len(time_series_features)}")
print()

# Missing counts per column within each group.
static_missing = loans_train[static_features].isnull().sum()
time_series_missing = loans_train[time_series_features].isnull().sum()

print("Static features missing data summary:")
print(f"  Features with missing data: {static_missing.gt(0).sum()}")
print(f"  Average missing percentage: {static_missing.mean() / len(loans_train) * 100:.2f}%")

print("Time series features missing data summary:")
print(f"  Features with missing data: {time_series_missing.gt(0).sum()}")
print(f"  Average missing percentage: {time_series_missing.mean() / len(loans_train) * 100:.2f}%")

In [None]:
# Drop columns with 100% missing data directly
columns_to_drop = ['ReliefRefinanceIndicator', 'PreHARP_Flag']

# The list is hard-coded, so flag any listed column that still holds real
# values in the training split before it is removed.
not_fully_missing = [
    col for col in columns_to_drop
    if col in loans_train.columns and loans_train[col].notna().any()
]
if not_fully_missing:
    print(f"Warning: dropping columns that are not 100% missing in train: {not_fully_missing}")

# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
loans_train.drop(columns=columns_to_drop, inplace=True, errors='ignore')
loans_valid.drop(columns=columns_to_drop, inplace=True, errors='ignore')
loans_test.drop(columns=columns_to_drop, inplace=True, errors='ignore')

***Process sentinel-coded missing data (999 / 9999)***

CreditScore: Values outside range or missing coded as 9999.

MI_Pct: 999 = not available.

OriginalDTI: Values > 65% or missing coded as 999.

OriginalLTV: Invalid coded as 999.

N_EstimatedLTV: Range 1–998, with 999 = unknown.

In [None]:
# Turn the 999 / 9999 "missing" sentinel codes into NaN in all three splits.

# Static columns that use 999 (CreditScore additionally 9999) as a sentinel.
static_columns_with_999 = [
    'CreditScore',    # Values outside range or missing coded as 9999
    'MI_Pct',         # 999 = not available
    'OriginalDTI',    # Values > 65% or missing coded as 999
    'OriginalLTV',    # Invalid coded as 999
]

# Monthly EstimatedLTV columns (N_EstimatedLTV, N = month index) use 999 too.
time_series_columns_with_999 = [name for name in loans_train.columns if 'EstimatedLTV' in name]

loan_frames = (loans_train, loans_valid, loans_test)

# --- Report: sentinel counts in the training split before any replacement ---
print("1. Static columns with 999/9999 as missing value indicator:")
for col in static_columns_with_999:
    if col not in loans_train.columns:
        print(f"- {col}: Column not found in dataset")
        continue
    count_999 = loans_train[col].eq(999).sum()
    # 9999 is only counted for CreditScore.
    count_9999 = loans_train[col].eq(9999).sum() if col == 'CreditScore' else 0
    total_missing = count_999 + count_9999
    print(f"- {col}: {total_missing} missing values (999: {count_999}, 9999: {count_9999})")
print()

print(f"2. Time series columns with 999 as missing value indicator ({len(time_series_columns_with_999)} found):")
for col in time_series_columns_with_999:
    count_999 = loans_train[col].eq(999).sum()
    print(f"   - {col}: {count_999} missing values (999)")
print()

# --- Replace sentinels in the static columns ---
print("3. Processing static columns:")
for col in static_columns_with_999:
    if col not in loans_train.columns:
        continue
    # CreditScore gets both codes replaced; every other column only 999.
    sentinels = (999, 9999) if col == 'CreditScore' else (999,)
    for frame in loan_frames:
        for sentinel in sentinels:
            frame[col] = frame[col].replace(sentinel, np.nan)

    print(f"   ✓ Processed {col}: Replaced 999/9999 values with NaN")

# --- Replace sentinels in the time series columns ---
print("\n4. Processing time series columns:")
if time_series_columns_with_999:
    for col in time_series_columns_with_999:
        for frame in loan_frames:
            frame[col] = frame[col].replace(999, np.nan)
        print(f" ✓ Processed {col}: Replaced 999 values with NaN")
else:
    print("   - No time series EstimatedLTV columns found in dataset")


### Distribution Analysis

In [None]:
# Distribution Analysis - Non-Object Type Columns
# Prints range/mean/std for every numerical column of the training split and
# draws one histogram per numerical column (three per row).

columns_to_drop = ['index']  # Add columns to exclude from analysis

# Get all columns
all_cols = loans_train.columns.tolist()

# Non-object columns (numerical, datetime, etc.), before and after removing
# the columns excluded from the analysis.
non_object_all = loans_train.select_dtypes(exclude=['object']).columns.tolist()
non_object_cols = [col for col in non_object_all if col not in columns_to_drop]

print(f"- Total columns: {len(all_cols)}")
print(f"- Non-object type columns: {len(non_object_cols)}")
print(f"- Columns dropped from analysis: {columns_to_drop}")
# Count object columns against the unfiltered non-object list; subtracting the
# filtered list would wrongly count the excluded columns as object columns.
print(f"- Object type columns (excluded): {len(all_cols) - len(non_object_all)}")

# Group the analysed columns by dtype name.
data_types = {}
for col in non_object_cols:
    data_types.setdefault(str(loans_train[col].dtype), []).append(col)


# 3. Data range analysis for numerical columns
numerical_cols = loans_train[non_object_cols].select_dtypes(include=[np.number]).columns.tolist()
if len(numerical_cols) > 0:
    print("3. Data Range Analysis for Numerical Columns:")
    # Computed once and reused for the histogram titles and the final table.
    numerical_stats = loans_train[numerical_cols].describe()

    for col in numerical_cols:
        min_val = numerical_stats.loc['min', col]
        max_val = numerical_stats.loc['max', col]
        mean_val = numerical_stats.loc['mean', col]
        std_val = numerical_stats.loc['std', col]

        print(f"- {col}:")
        print(f"  Range: [{min_val:.2f}, {max_val:.2f}]")
        print(f"  Mean: {mean_val:.2f}, Std: {std_val:.2f}")
    print()

# 4. Distribution histograms for numerical columns
if len(numerical_cols) > 0:
    print("4. Distribution Histograms for Numerical Columns:")

    # Calculate subplot layout
    n_cols = len(numerical_cols)
    n_rows = (n_cols + 2) // 3  # 3 columns per row

    # squeeze=False always returns a 2-D axes array, so a single-row grid
    # needs no special-case reshape.
    fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows), squeeze=False)

    for i, col in enumerate(numerical_cols):
        row, col_idx = divmod(i, 3)
        ax = axes[row, col_idx]

        # Histogram of the non-missing values; the title shows the observed range.
        ax.hist(loans_train[col].dropna(), bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(
            f"{col}\nRange: [{numerical_stats.loc['min', col]:.2f}, {numerical_stats.loc['max', col]:.2f}]"
        )
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Hide the unused cells in the last row of the grid.
    for i in range(n_cols, n_rows * 3):
        row, col_idx = divmod(i, 3)
        axes[row, col_idx].set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found.")

# 5. Summary statistics
print("5. Summary Statistics:")
print(f"- Total non-object columns analyzed: {len(non_object_cols)}")
print(f"- Numerical columns: {len(numerical_cols)}")

# Show basic statistics for numerical columns
if len(numerical_cols) > 0:
    print("\nBasic Statistics for Numerical Columns:")
    print(numerical_stats.round(2))
else:
    print("- No numerical columns found for analysis.")