In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Load data

In [54]:
# Read the three raw splits (test, train, validation) from CSV, in the same
# order as the paths are listed below.
loans_test, loans_train, loans_valid = map(
    pd.read_csv,
    (
        "./data/raw_data/loans_test.csv",
        "./data/raw_data/loans_train.csv",
        "./data/raw_data/loans_valid.csv",
    ),
)

In [28]:
# Compare the column sets of the three splits: which columns are shared by
# all of them, and which belong to only some of them.
cols_train, cols_valid, cols_test = (
    set(frame.columns) for frame in (loans_train, loans_valid, loans_test)
)

# Columns present in every split, and what each split has beyond those.
common_cols = set.intersection(cols_train, cols_valid, cols_test)
only_train = cols_train.difference(common_cols)
only_valid = cols_valid.difference(common_cols)
only_test = cols_test.difference(common_cols)

# Every column that is missing from at least one split.
uncommon = set.union(cols_train, cols_valid, cols_test).difference(common_cols)

print("Number of common columns:", len(common_cols))
print("Common columns example:", sorted(common_cols)[:10], "...")
print("\nColumns only in train:", sorted(only_train))
print("Columns only in valid:", sorted(only_valid))
print("Columns only in test:", sorted(only_test))
print("\nAll uncommon columns:", sorted(uncommon))


Number of common columns: 143
Common columns example: ['0_CurrentActualUPB', '0_CurrentInterestRate', '0_CurrentNonInterestBearingUPB', '0_EstimatedLTV', '0_InterestBearingUPB', '0_LoanAge', '0_MonthlyReportingPeriod', '0_RemainingMonthsToLegalMaturity', '10_CurrentActualUPB', '10_CurrentInterestRate'] ...

Columns only in train: ['index', 'target']
Columns only in valid: ['index', 'target']
Columns only in test: ['Id']

All uncommon columns: ['Id', 'index', 'target']


### Data types

In [58]:
# Convert date columns to datetime format in all three splits.


def _parse_loan_date(values):
    """Parse one FirstPaymentDate / MaturityDate column into datetimes.

    Parameters
    ----------
    values : pd.Series
        Raw column as loaded from CSV, or an already converted column.

    Returns
    -------
    pd.Series
        The column as datetime values.

    Raises
    ------
    ValueError
        If a numeric value does not match the YYYYMM layout.
    """
    # Already converted (e.g. the cell was re-run): nothing to do.
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    if pd.api.types.is_numeric_dtype(values):
        # Without an explicit format pd.to_datetime reads numbers as offsets
        # from the Unix epoch, so a value such as 202403 would silently
        # become a timestamp in 1970 while the dtype check still passes.
        # NOTE(review): assumes numeric dates use the same YYYYMM layout as
        # the MonthlyReportingPeriod columns -- confirm against the data
        # dictionary. A different layout raises instead of passing silently.
        return pd.to_datetime(values, format='%Y%m')
    # Text columns keep the original behaviour: let pandas infer the format.
    return pd.to_datetime(values)


# Convert FirstPaymentDate and MaturityDate
for col in ['FirstPaymentDate', 'MaturityDate']:
    if col in loans_train.columns:
        print(f"Converting {col}...")
        for frame in (loans_train, loans_valid, loans_test):
            # Check each split on its own instead of trusting the train
            # columns, so a split lacking the column does not raise KeyError.
            if col in frame.columns:
                frame[col] = _parse_loan_date(frame[col])
        print(f"  ✓ {col} converted to datetime")

# Convert MonthlyReportingPeriod columns (YYYYMM format)
print("\nConverting MonthlyReportingPeriod columns...")
monthly_cols = [col for col in loans_train.columns if 'MonthlyReportingPeriod' in col]
print(f"Found {len(monthly_cols)} MonthlyReportingPeriod columns")

for col in monthly_cols:
    for frame in (loans_train, loans_valid, loans_test):
        if col in frame.columns:
            # Convert YYYYMM format to datetime (first day of the month)
            frame[col] = pd.to_datetime(frame[col], format='%Y%m')

print(f"  ✓ {len(monthly_cols)} MonthlyReportingPeriod columns converted")

# Verify conversions
print("\n=== Verification ===")
print("Date column data types after conversion:")
for col in ['FirstPaymentDate', 'MaturityDate']:
    if col in loans_train.columns:
        print(f"  {col}: {loans_train[col].dtype}")

print("\nMonthlyReportingPeriod sample after conversion:")
print(f"  0_MonthlyReportingPeriod: {loans_train['0_MonthlyReportingPeriod'].head().tolist()}")
print(f"  1_MonthlyReportingPeriod: {loans_train['1_MonthlyReportingPeriod'].head().tolist()}")


Converting FirstPaymentDate...
  ✓ FirstPaymentDate converted to datetime
Converting MaturityDate...
  ✓ MaturityDate converted to datetime

Converting MonthlyReportingPeriod columns...
Found 14 MonthlyReportingPeriod columns
  ✓ 14 MonthlyReportingPeriod columns converted

=== Verification ===
Date column data types after conversion:
  FirstPaymentDate: datetime64[ns]
  MaturityDate: datetime64[ns]

MonthlyReportingPeriod sample after conversion:
  0_MonthlyReportingPeriod: [Timestamp('2024-02-01 00:00:00'), Timestamp('2024-02-01 00:00:00'), Timestamp('2024-02-01 00:00:00'), Timestamp('2024-02-01 00:00:00'), Timestamp('2024-02-01 00:00:00')]
  1_MonthlyReportingPeriod: [Timestamp('2024-03-01 00:00:00'), Timestamp('2024-03-01 00:00:00'), Timestamp('2024-03-01 00:00:00'), Timestamp('2024-03-01 00:00:00'), Timestamp('2024-03-01 00:00:00')]


### Missing values and data distribution

In [56]:
# Missing-value ("vacancy") report for the training split.
print("=== Data Vacancy Analysis ===")
print()

# 1. Null count per column, largest first, and the same figure expressed as
#    a percentage of the number of training rows.
print("--- Overall Missing Data Summary ---")
n_rows = len(loans_train)
missing_summary = loans_train.isna().sum().sort_values(ascending=False)
missing_pct = (missing_summary / n_rows) * 100

missing_df = pd.DataFrame(
    dict(Missing_Count=missing_summary, Missing_Percentage=missing_pct)
)

print(f"Total features: {len(loans_train.columns)}")
print(f"Features with missing data: {missing_summary.gt(0).sum()}")
print(f"Features with complete data: {missing_summary.eq(0).sum()}")
print()

# Only the 20 columns with the most missing values are shown.
print("Top 20 features with most missing data:")
print(missing_df.head(20).to_string())
print()

# 2. Same analysis, split by feature type.
print("--- Missing Data by Feature Type ---")

# Columns whose name starts with "<k>_" for k in 0..13 are treated as time
# series features; every other column except 'index' and 'target' is static.
period_prefixes = tuple(f"{k}_" for k in range(14))
time_series_features = [
    col for col in loans_train.columns if col.startswith(period_prefixes)
]
static_features = [
    col
    for col in loans_train.columns
    if not col.startswith(period_prefixes) and col not in ['index', 'target']
]

print(f"Static features: {len(static_features)}")
print(f"Time series features: {len(time_series_features)}")
print()

# Null counts per column within each feature group.
static_missing = loans_train[static_features].isna().sum()
time_series_missing = loans_train[time_series_features].isna().sum()


def _print_missing_block(heading, counts):
    """Print how many columns in `counts` have nulls and their mean null rate."""
    print(heading)
    print(f"  Features with missing data: {counts.gt(0).sum()}")
    print(f"  Average missing percentage: {counts.mean() / n_rows * 100:.2f}%")


_print_missing_block("Static features missing data summary:", static_missing)
print()

_print_missing_block("Time series features missing data summary:", time_series_missing)



=== Data Vacancy Analysis ===

--- Overall Missing Data Summary ---
Total features: 145
Features with missing data: 4
Features with complete data: 141

Top 20 features with most missing data:
                          Missing_Count  Missing_Percentage
ReliefRefinanceIndicator          30504          100.000000
PreHARP_Flag                      30504          100.000000
SuperConformingFlag               30176           98.924731
MSA                                3422           11.218201
index                                 0            0.000000
FirstTimeHomebuyerFlag                0            0.000000
target                                0            0.000000
FirstPaymentDate                      0            0.000000
CreditScore                           0            0.000000
OccupancyStatus                       0            0.000000
OriginalCLTV                          0            0.000000
OriginalDTI                           0            0.000000
OriginalUPB                 

In [57]:
# Drop the columns that are 100% missing in the training split.
columns_to_drop = ['ReliefRefinanceIndicator', 'PreHARP_Flag']

# The frames are modified in place, so any other reference to them sees the
# change. errors="ignore" keeps the cell safe to re-run: without it a second
# run raises KeyError because the columns are already gone.
for frame in (loans_train, loans_valid, loans_test):
    frame.drop(columns=columns_to_drop, inplace=True, errors="ignore")

print(f"Dropped {len(columns_to_drop)} columns: {columns_to_drop}")
print(f"New shapes - Train: {loans_train.shape}, Valid: {loans_valid.shape}, Test: {loans_test.shape}")


Dropped 2 columns: ['ReliefRefinanceIndicator', 'PreHARP_Flag']
New shapes - Train: (30504, 143), Valid: (5370, 143), Test: (13426, 142)
