# Credit Risk Assessment - Exploratory Data Analysis

This notebook performs comprehensive EDA on the German Credit Risk dataset to understand:
- Data structure and quality
- Feature distributions
- Target variable characteristics
- Relationships between features and creditworthiness
- Missing value patterns

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib's stock style, seaborn's "husl" colour
# cycle, and default figure size / font size applied to every later figure.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Visible confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw file: tab-separated, no header row, so pandas assigns
# positional integer column labels that are replaced below.
data = pd.read_csv('kredit.dat', sep='\t', header=None)

# Column labels in file order. The trailing comment on each entry gives the
# code range for coded columns or marks the column as numerical; the
# "(incomplete)" notes are carried over from the problem description.
feature_names = [
    'checking_account',    # A11-A14
    'duration_months',     # numerical
    'credit_history',      # A30-A34
    'purpose',             # A40-A410 (incomplete)
    'credit_amount',       # numerical
    'savings_account',     # A61-A65
    'employment_since',    # A71-A75 (incomplete)
    'installment_rate',    # numerical
    'personal_status',     # A91-A95
    'other_debtors',       # A101-A103
    'residence_since',     # numerical
    'property',            # A121-A124
    'age',                 # numerical
    'other_installments',  # A141-A143
    'housing',             # A151-A153
    'existing_credits',    # numerical
    'job',                 # A171-A175 (incomplete)
    'num_dependents',      # numerical
    'telephone',           # A191-A192
    'foreign_worker',      # A201-A202 (incomplete)
    'creditworthy',        # target: 1=yes, 2=no
]

# Direct assignment raises if the file's column count differs from the list,
# which surfaces a schema mismatch immediately.
data.columns = feature_names

# Headline dimensions; the feature count excludes the target column.
sample_count, column_count = data.shape
print(f"Dataset shape: {(sample_count, column_count)}")
print(f"Features: {column_count - 1}")
print(f"Samples: {sample_count}")

In [None]:
# Structural overview (dtypes, non-null counts) followed by a peek at both
# ends of the frame.
print("=== Dataset Info ===")
data.info()

for heading, preview in (("First 5 rows", data.head()), ("Last 5 rows", data.tail())):
    print(f"\n=== {heading} ===")
    display(preview)

## 2. Missing Value Analysis

In [None]:
# Identify missing values (marked as '?')
# One vectorised comparison over the whole frame gives the per-column count of
# '?' placeholders. Numeric columns simply compare unequal to the string, so
# they contribute zero.
missing_counts = data.eq('?').sum()

# One row per column. The frame's index is the column name (inherited from
# `data.dtypes`); the other fields are aligned to it positionally.
missing_analysis = pd.DataFrame({
    'Feature': data.columns,
    'Missing_Count': missing_counts.to_numpy(),
    'Missing_Percentage': missing_counts.to_numpy() / len(data) * 100,
    'Data_Type': data.dtypes
})

missing_analysis = missing_analysis.sort_values('Missing_Count', ascending=False)

# Only the columns that actually contain placeholders are reported and plotted.
missing_features = missing_analysis[missing_analysis['Missing_Count'] > 0]

print("=== Missing Value Analysis ===")
if missing_features.empty:
    # Nothing to tabulate or chart; say so instead of rendering empty output.
    print("No missing values ('?') found in any feature.")
else:
    display(missing_features)

    # Visualize missing values: absolute counts on the left, share of rows on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        (ax1, 'Missing_Count', 'Missing Value Counts by Feature', 'Count of Missing Values'),
        (ax2, 'Missing_Percentage', 'Missing Value Percentages by Feature', 'Percentage (%)'),
    )
    for ax, column, title, ylabel in panels:
        ax.bar(missing_features['Feature'], missing_features[column])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 3. Target Variable Analysis

In [None]:
# Class frequencies of the target, as absolute counts and as percentages,
# both ordered by class label (1 then 2).
target = data['creditworthy']
target_counts = target.value_counts().sort_index()
target_percentages = target.value_counts(normalize=True).sort_index() * 100

good_count, bad_count = target_counts[1], target_counts[2]

print("=== Target Variable Analysis ===")
print(f"Class 1 (Creditworthy): {good_count} ({target_percentages[1]:.1f}%)")
print(f"Class 2 (Not Creditworthy): {bad_count} ({target_percentages[2]:.1f}%)")
print(f"Class Balance Ratio: {good_count/bad_count:.2f}:1")

# Two views of the same distribution: bar chart (left) and pie chart (right).
labels = ['Creditworthy', 'Not Creditworthy']
colors = ['lightgreen', 'lightcoral']
class_sizes = target_counts.values

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

ax1.bar(labels, class_sizes, color=colors)
ax1.set_title('Target Variable Distribution')
ax1.set_ylabel('Count')
# Write each count just above its bar.
for position, count in enumerate(class_sizes):
    ax1.text(position, count + 10, str(count), ha='center', va='bottom', fontweight='bold')

ax2.pie(class_sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax2.set_title('Target Variable Proportion')

plt.tight_layout()
plt.show()

# Worst-case cost under the 5:1 weighting: every class-2 row costs 5 units if
# misclassified, every class-1 row costs 1 unit.
bad_as_good_cost = bad_count * 5
good_as_bad_cost = good_count * 1
print("\n=== Cost Implication ===")
print("Given 5:1 cost ratio (FP:FN):")
print(f"- Misclassifying {bad_count} non-creditworthy as creditworthy = {bad_as_good_cost} cost units")
print(f"- Misclassifying {good_count} creditworthy as non-creditworthy = {good_as_bad_cost} cost units")
print(f"- Total potential cost if all wrong: {bad_as_good_cost + good_as_bad_cost} units")

## 4. Numerical Features Analysis

In [None]:
# Columns treated as numeric for the rest of the notebook.
numerical_features = [
    'duration_months', 'credit_amount', 'installment_rate',
    'residence_since', 'age', 'existing_credits', 'num_dependents',
]

# Force a numeric dtype on each of them; anything unparseable becomes NaN.
data[numerical_features] = data[numerical_features].apply(pd.to_numeric, errors='coerce')

# Summary statistics table.
print("=== Numerical Features Statistical Summary ===")
display(data[numerical_features].describe())

# One histogram per feature on a 3x3 grid, NaNs excluded.
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.hist(data[col].dropna(), bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'{col.replace("_", " ").title()} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Drop the grid cells that received no histogram.
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Everything that is neither a numerical feature nor the target is categorical.
excluded_columns = numerical_features + ['creditworthy']
categorical_features = [col for col in data.columns if col not in excluded_columns]

print(f"=== Categorical Features ({len(categorical_features)}) ===")
print(categorical_features)

# Per-feature frequency table, plus a note when the '?' placeholder occurs.
for feature in categorical_features:
    column = data[feature]
    counts = column.value_counts()

    print(f"\n--- {feature.upper()} ---")
    print(f"Unique values: {len(counts)}")
    print(counts)

    missing_count = (column == '?').sum()
    if missing_count > 0:
        print(f"Missing values ('?'): {missing_count} ({missing_count/len(data)*100:.1f}%)")

In [None]:
# Grid of bar charts, three per row, one chart per categorical feature.
n_categorical = len(categorical_features)
n_cols = 3
n_rows = (n_categorical + n_cols - 1) // n_cols  # ceiling division

# squeeze=False always yields a 2-D array of axes, so it can be flattened
# the same way whether the grid has one row or many.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, categorical_features):
    counts = data[feature].value_counts()
    positions = range(len(counts))

    ax.bar(positions, counts.values, color='lightsteelblue', edgecolor='black')
    ax.set_title(f'{feature.replace("_", " ").title()}')
    ax.set_xlabel('Categories')
    ax.set_ylabel('Count')
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

# Discard the grid cells left over after the last feature.
for spare_ax in axes[n_categorical:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

## 6. Feature-Target Relationships

In [None]:
# Numerical features vs target
print("=== Numerical Features vs Target ===")

# Split the frame by class once instead of re-filtering it for every feature.
creditworthy_rows = data[data['creditworthy'] == 1]
not_creditworthy_rows = data[data['creditworthy'] == 2]
class_labels = ['Creditworthy', 'Not Creditworthy']

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

# One pair of box plots (class 1 vs class 2) per numerical feature.
for ax, feature in zip(axes, numerical_features):
    creditworthy_data = creditworthy_rows[feature].dropna()
    not_creditworthy_data = not_creditworthy_rows[feature].dropna()

    ax.boxplot([creditworthy_data, not_creditworthy_data])
    # Label the boxes through the axis rather than boxplot's `labels=` keyword:
    # that keyword was renamed to `tick_labels` in Matplotlib 3.9 and the old
    # spelling is deprecated, so this form works on both sides of the rename.
    ax.set_xticklabels(class_labels)
    ax.set_title(f'{feature.replace("_", " ").title()} by Creditworthiness')
    ax.set_ylabel(feature)
    ax.grid(True, alpha=0.3)

# Remove the grid cells that received no plot.
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Statistical tests for numerical features
# Welch's variant (equal_var=False) is used so the comparison does not rely on
# the two classes having the same variance, which the pooled-variance default
# of ttest_ind assumes.
print("\n=== Statistical Tests (t-test) for Numerical Features ===")
for feature in numerical_features:
    group1 = creditworthy_rows[feature].dropna()
    group2 = not_creditworthy_rows[feature].dropna()

    # Skip features where either class has no usable observations.
    if len(group1) > 0 and len(group2) > 0:
        statistic, p_value = stats.ttest_ind(group1, group2, equal_var=False)
        print(f"{feature}: t-statistic = {statistic:.4f}, p-value = {p_value:.6f}")
        print(f"  Mean Creditworthy: {group1.mean():.2f}")
        print(f"  Mean Not Creditworthy: {group2.mean():.2f}")

In [None]:
# Categorical features vs target
print("=== Categorical Features vs Target ===")

# Grid layout: two charts per row, enough rows for every categorical feature.
n_categorical = len(categorical_features)
n_cols = 2
n_rows = (n_categorical + n_cols - 1) // n_cols  # ceiling division

# squeeze=False guarantees a 2-D axes array, so flattening needs no special
# case for a single-row grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6*n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, categorical_features):
    # Row-normalised cross-tabulation: for each category, the percentage of
    # rows in target class 1 and class 2 (each row sums to 100).
    # The raw-count crosstab previously built here was never used, so it is
    # no longer computed.
    crosstab_norm = pd.crosstab(data[feature], data['creditworthy'], normalize='index') * 100

    # Stacked bars; colours and legend entries follow the crosstab's column
    # order, which is the sorted class labels (1 then 2).
    crosstab_norm.plot(kind='bar', stacked=True, ax=ax,
                       color=['lightgreen', 'lightcoral'])
    ax.set_title(f'{feature.replace("_", " ").title()} vs Creditworthiness')
    ax.set_xlabel(feature)
    ax.set_ylabel('Percentage')
    ax.legend(['Creditworthy', 'Not Creditworthy'])
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Remove empty subplots
for spare_ax in axes[n_categorical:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
# Test of independence between each categorical feature and the target.
print("\n=== Chi-Square Tests for Categorical Features ===")
from scipy.stats import chi2_contingency

chi_square_results = []

for feature in categorical_features:
    # Observed counts: feature categories (rows) by target class (columns).
    contingency_table = pd.crosstab(data[feature], data['creditworthy'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    # Flag the feature at the 5% level.
    significant = 'Yes' if p_value < 0.05 else 'No'

    chi_square_results.append(dict(
        Feature=feature,
        Chi2_Statistic=chi2,
        P_Value=p_value,
        Degrees_of_Freedom=dof,
        Significant=significant,
    ))

    print(f"{feature}: χ² = {chi2:.4f}, p-value = {p_value:.6f}, significant = {significant}")

# Tabular recap, strongest evidence (smallest p-value) first.
chi_square_df = pd.DataFrame(chi_square_results).sort_values('P_Value')
print("\n=== Summary of Chi-Square Tests ===")
display(chi_square_df)

## 7. Correlation Analysis

In [None]:
# Pairwise correlations among the numerical features and the target.
print("=== Correlation Analysis for Numerical Features ===")
numerical_data = data[[*numerical_features, 'creditworthy']].copy()
correlation_matrix = numerical_data.corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and the
# mirrored upper half.
fig, ax = plt.subplots(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
ax.set_title('Correlation Matrix - Numerical Features')
plt.tight_layout()
plt.show()

# Correlation of each feature with the target, largest magnitude first.
correlations_with_target = correlation_matrix['creditworthy'].drop('creditworthy')
target_correlations = correlations_with_target.sort_values(
    key=lambda values: values.abs(), ascending=False
)
print("\n=== Correlations with Target Variable (sorted by absolute value) ===")
for feature, corr in target_correlations.items():
    print(f"{feature}: {corr:.4f}")

## 8. Advanced Visualizations

In [None]:
# Scatter-matrix of three numerical features, coloured by target class.
print("=== Pairplot for Key Numerical Features ===")
key_features = ['duration_months', 'credit_amount', 'age', 'creditworthy']

# Work on a separate frame in which the numeric class codes are replaced by
# readable names for the legend; `data` itself is left untouched.
class_names = {1: 'Creditworthy', 2: 'Not Creditworthy'}
pairplot_data = data[key_features].assign(
    creditworthy=lambda frame: frame['creditworthy'].map(class_names)
)

class_palette = {'Creditworthy': 'lightgreen', 'Not Creditworthy': 'lightcoral'}
sns.pairplot(
    pairplot_data,
    hue='creditworthy',
    diag_kind='hist',
    palette=class_palette,
)
# Title placed slightly above the grid (y > 1).
plt.suptitle('Pairplot of Key Numerical Features by Creditworthiness', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance based on mutual information
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

print("=== Feature Importance Analysis ===")

# Prepare data for mutual information calculation: features and target are
# copied so the encoding below never touches `data`.
X_encoded = data.drop('creditworthy', axis=1).copy()
y = data['creditworthy'].copy()

# Encode categorical variables as integer codes. Casting to str first means
# the '?' placeholder (and any NaN) becomes its own category.
# The fitted encoders are kept so codes can be mapped back to labels.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# mutual_info_classif rejects NaN. The earlier numeric conversion uses
# errors='coerce', so any unparseable value in a numeric column would be NaN
# here; fill such gaps with the column median (a no-op when none exist).
numeric_columns = [col for col in X_encoded.columns if col not in categorical_features]
X_encoded[numeric_columns] = X_encoded[numeric_columns].fillna(
    X_encoded[numeric_columns].median()
)

# Mark the label-encoded columns as discrete. Without this mask every column
# of a dense input is treated as continuous, so the arbitrary integer codes
# would be scored as if their ordering and spacing were meaningful.
discrete_mask = X_encoded.columns.isin(categorical_features)

# Calculate mutual information (seeded for repeatable scores).
mi_scores = mutual_info_classif(
    X_encoded, y, discrete_features=discrete_mask, random_state=42
)

# Create feature importance DataFrame, most informative feature first.
feature_importance = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Mutual_Information': mi_scores
}).sort_values('Mutual_Information', ascending=False)

# Visualize feature importance
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance.head(15), x='Mutual_Information', y='Feature')
plt.title('Top 15 Features by Mutual Information Score')
plt.xlabel('Mutual Information Score')
plt.tight_layout()
plt.show()

print("\n=== Top 10 Features by Mutual Information ===")
display(feature_importance.head(10))

## 9. Summary and Insights

In [None]:
# Text recap of the notebook's findings. Every figure printed here is read
# from objects computed in earlier cells (data, target_counts,
# target_percentages, missing_analysis, feature_importance).
print("=== EDA SUMMARY AND INSIGHTS ===")
print("\n1. DATASET OVERVIEW:")
print(f"   - Total samples: {len(data)}")
print(f"   - Total features: {len(data.columns) - 1}")  # target column excluded
print(f"   - Numerical features: {len(numerical_features)}")
print(f"   - Categorical features: {len(categorical_features)}")

print("\n2. TARGET VARIABLE:")
print(f"   - Creditworthy (1): {target_counts[1]} ({target_percentages[1]:.1f}%)")
print(f"   - Not Creditworthy (2): {target_counts[2]} ({target_percentages[2]:.1f}%)")
print(f"   - Class imbalance ratio: {target_counts[1]/target_counts[2]:.2f}:1")

print("\n3. MISSING VALUES:")
# Columns in which the '?' placeholder was counted at least once.
missing_features_summary = missing_analysis[missing_analysis['Missing_Count'] > 0]
n_missing_features = len(missing_features_summary)
if n_missing_features > 0:
    for _, row in missing_features_summary.iterrows():
        print(f"   - {row['Feature']}: {row['Missing_Count']} ({row['Missing_Percentage']:.1f}%)")
else:
    print("   - No missing values found")

print("\n4. KEY STATISTICAL INSIGHTS:")
print("   Numerical Features:")
for feature in numerical_features:
    group1 = data[data['creditworthy'] == 1][feature].dropna()
    group2 = data[data['creditworthy'] == 2][feature].dropna()
    if len(group1) > 0 and len(group2) > 0:
        # Relative difference of the class-1 mean against the class-2 mean.
        diff_pct = ((group1.mean() - group2.mean()) / group2.mean()) * 100
        print(f"   - {feature}: Creditworthy avg = {group1.mean():.2f}, Not creditworthy avg = {group2.mean():.2f} (diff: {diff_pct:+.1f}%)")

print("\n5. MOST IMPORTANT FEATURES (by Mutual Information):")
for i, (_, row) in enumerate(feature_importance.head(5).iterrows()):
    print(f"   {i+1}. {row['Feature']}: {row['Mutual_Information']:.4f}")

print("\n6. PREPROCESSING RECOMMENDATIONS:")
# The number of affected features comes from the missing-value table instead
# of being hard-coded, so the recommendation always matches section 3 above.
# It is omitted when no feature has missing values.
if n_missing_features > 0:
    plural = "" if n_missing_features == 1 else "s"
    print(f"   - Handle missing values in {n_missing_features} feature{plural} (create 'unknown' category or impute)")
print("   - Consider log transformation for credit_amount (likely skewed)")
print("   - One-hot encode categorical features")
print("   - Apply cost-sensitive learning (5:1 FP:FN cost ratio)")
print("   - Consider feature scaling for distance-based algorithms")
print("   - Address class imbalance if necessary")

print("\n7. MODELING CONSIDERATIONS:")
print("   - Use stratified sampling for train-test split")
print("   - Implement cost-sensitive evaluation metrics")
print("   - Consider ensemble methods for handling feature interactions")
print("   - Validate model performance using cost-weighted metrics")