# Exploratory Data Analysis - Nepal Earthquake Building Damage Data

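This notebook performs an exploratory data analysis (EDA) of the Nepal earthquake building damage dataset. The eventual modeling objective is to predict `damage_grade` from building characteristics, so the analysis covers data types, missing values, the distribution of the target, and how the features relate to it.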


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options: show every column, print at most 100 rows, no fixed
# output width, and truncate cell contents longer than 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Set plotting style. The seaborn style is registered under different names
# depending on the installed matplotlib, so try each candidate in order.
# plt.style.use raises OSError for an unknown style name; catching only that
# (instead of a bare `except:`) lets unrelated errors and KeyboardInterrupt
# propagate rather than being silently swallowed.
for style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(style_name)
        break
    except OSError:
        continue
else:
    # Neither seaborn style name is available: use matplotlib's default style.
    plt.style.use('default')
sns.set_palette("husl")


## 1. Load the Data


In [None]:
# Read 'nepal_dat.csv' (path is relative to the working directory) into `df`,
# the DataFrame used by every later cell.
df = pd.read_csv('nepal_dat.csv')

# Report the dimensions with a single print call (one item per output line).
print(
    f"Dataset shape: {df.shape}",
    f"Number of rows: {df.shape[0]:,}",
    f"Number of columns: {df.shape[1]}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview.
df.head()


## 2. Data Types and Variable Classification


In [None]:
# Headline view: how many columns there are of each dtype
print("Data Types Summary:")
print("=" * 60)
print(df.dtypes.value_counts())
print("\n" + "=" * 60)

# One row per column: its dtype plus flags for numeric and string/object dtypes.
# The frame is indexed by column name because df.dtypes is.
dtype_checks = pd.api.types
data_types_summary = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes,
    'Is_Numeric': df.dtypes.map(dtype_checks.is_numeric_dtype),
    'Is_String': df.dtypes.map(
        lambda dtype: dtype_checks.is_string_dtype(dtype) or dtype_checks.is_object_dtype(dtype)
    ),
})

# Every column that is not numeric is labelled as string/categorical
data_types_summary['Variable_Type'] = np.where(
    data_types_summary['Is_Numeric'], 'Numeric', 'String/Categorical'
)

print("\nDetailed Data Types by Column:")
print("=" * 80)
print(data_types_summary.to_string(index=False))


In [None]:
# Boolean flags computed in the dtype summary, one entry per column
numeric_mask = data_types_summary['Is_Numeric']
string_mask = data_types_summary['Is_String']

# Headline counts for each variable type
print("\n" + "=" * 60)
print("Variable Type Summary:")
print("=" * 60)
print(f"Total Numeric Variables: {numeric_mask.sum()}")
print(f"Total String/Categorical Variables: {string_mask.sum()}")

# Column-name lists; the later analysis cells iterate over these
numeric_cols = data_types_summary.loc[numeric_mask, 'Column'].tolist()
print("\nNumeric Variables:")
print(numeric_cols)

string_cols = data_types_summary.loc[string_mask, 'Column'].tolist()
print("\nString/Categorical Variables:")
print(string_cols)


## 3. Missing Values (NA Count) per Column


In [None]:
# Count missing values per column once, then derive the other columns from it.
# Resulting layout: Column, NA_Count, NA_Percentage (indexed by column name).
missing_data = df.isna().sum().to_frame('NA_Count')
missing_data.insert(0, 'Column', df.columns)
missing_data['NA_Percentage'] = (missing_data['NA_Count'] / len(df)) * 100

# Columns with the most missing values come first
missing_data = missing_data.sort_values('NA_Count', ascending=False)

print("Missing Values Summary:")
print("=" * 80)
print(missing_data.to_string(index=False))

# Dataset-wide totals: missing cells versus all cells (rows x columns)
total_missing = missing_data['NA_Count'].sum()
total_cells = df.shape[0] * df.shape[1]

print("\n" + "=" * 60)
print("Missing Values Summary Statistics:")
print("=" * 60)
print(f"Total columns with missing values: {(missing_data['NA_Count'] > 0).sum()}")
print(f"Total missing values in dataset: {total_missing:,}")
print(f"Percentage of dataset that is missing: {(total_missing / total_cells) * 100:.2f}%")


In [None]:
# Visualize missing values.
# Keep only the columns that actually contain missing values. Checking this
# subset for emptiness is sufficient on its own: NA counts are non-negative,
# so "total > 0" and "at least one column > 0" are the same condition, and
# the former nested check had an unreachable duplicate of the message below.
missing_with_na = missing_data[missing_data['NA_Count'] > 0]

if missing_with_na.empty:
    print("No missing values found in the dataset!")
else:
    # Figure height grows with the number of bars (0.3 per column, minimum 6)
    plt.figure(figsize=(12, max(6, len(missing_with_na) * 0.3)))
    plt.barh(missing_with_na['Column'], missing_with_na['NA_Count'])
    plt.xlabel('Number of Missing Values')
    plt.title('Missing Values by Column')
    plt.tight_layout()
    plt.show()


## 4. Target Variable Analysis: damage_grade

Since `damage_grade` is the prediction objective, let's examine it in detail.


In [None]:
# Profile the target column: dtype, missing values, distinct values,
# frequency table (counts and percentages) and describe() statistics.
if 'damage_grade' in df.columns:
    target = df['damage_grade']

    print("=" * 60)
    print("DAMAGE_GRADE ANALYSIS")
    print("=" * 60)

    print(f"\nData type: {target.dtype}")
    print(f"Missing values: {target.isna().sum()}")
    print(f"Unique values: {target.nunique()}")
    # unique() keeps NaN, which breaks sorted(): comparing NaN with strings
    # raises TypeError, and with numbers it gives an unreliable order.
    # Missing values are reported on their own line above, so drop them here;
    # this also keeps the list consistent with nunique(), which ignores NaN.
    print(f"Unique value list: {sorted(target.dropna().unique())}")

    print("\nValue Counts:")
    print(target.value_counts().sort_index())

    print("\nValue Counts (Percentages):")
    print((target.value_counts(normalize=True).sort_index() * 100).round(2))

    print("\nBasic Statistics:")
    print(target.describe())
else:
    print("WARNING: 'damage_grade' column not found in the dataset!")


In [None]:
# Plot the target distribution twice, side by side: counts as a bar chart
# (left) and percentage shares as a pie chart (right).
if 'damage_grade' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    count_ax, share_ax = axes

    # Left panel: frequency of each grade, ordered by grade
    damage_counts = df['damage_grade'].value_counts().sort_index()
    count_ax.bar(damage_counts.index, damage_counts.values, color='steelblue', alpha=0.7)
    count_ax.set_xlabel('Damage Grade')
    count_ax.set_ylabel('Frequency')
    count_ax.set_title('Distribution of Damage Grade (Count)')
    count_ax.grid(axis='y', alpha=0.3)

    # Write the thousands-separated count just above each bar
    for grade, count in zip(damage_counts.index, damage_counts.values):
        count_ax.text(grade, count, f'{count:,}',
                      ha='center', va='bottom', fontsize=9)

    # Right panel: share of each grade in percent
    damage_pct = df['damage_grade'].value_counts(normalize=True).sort_index() * 100
    slice_labels = [f'Grade {grade}' for grade in damage_pct.index]
    share_ax.pie(damage_pct.values, labels=slice_labels,
                 autopct='%1.1f%%', startangle=90)
    share_ax.set_title('Distribution of Damage Grade (Percentage)')

    plt.tight_layout()
    plt.show()


## 5. Numeric Variables Summary Statistics


In [None]:
# Summary statistics for the numeric columns: describe() first, then a
# per-column table with a few extra measures (median, skewness, cardinality).
if not numeric_cols:
    print("No numeric variables found!")
else:
    print("Summary Statistics for Numeric Variables:")
    print("=" * 100)
    numeric_summary = df[numeric_cols].describe()
    print(numeric_summary)

    print("\n" + "=" * 100)
    print("Additional Statistics:")
    print("=" * 100)

    # Output column label -> name of the Series method that computes it.
    # Dict order fixes the column order of the resulting table.
    stat_methods = {
        'Min': 'min',
        'Max': 'max',
        'Mean': 'mean',
        'Median': 'median',
        'Std': 'std',
        'Skewness': 'skew',
        'Unique_Values': 'nunique',
    }
    stat_columns = {'Column': numeric_cols}
    for label, method_name in stat_methods.items():
        stat_columns[label] = [getattr(df[col], method_name)() for col in numeric_cols]

    additional_stats = pd.DataFrame(stat_columns)
    print(additional_stats.to_string(index=False))


## 6. Categorical/String Variables Analysis


In [None]:
# Analyze categorical/string variables: for each one print its dtype, the
# number of distinct values, the sorted distinct values and a frequency table.
if string_cols:
    print("Categorical/String Variables Analysis:")
    print("=" * 100)

    for col in string_cols:
        column = df[col]
        print(f"\n{col}:")
        print("-" * 80)
        print(f"  Data type: {column.dtype}")
        print(f"  Unique values: {column.nunique()}")
        # unique() keeps NaN (a float); sorting it together with strings
        # raises TypeError, so a column with missing values used to abort the
        # whole cell. Drop NaN first, which matches nunique() and
        # value_counts() since both ignore missing values by default.
        # key=str leaves the order of string values untouched and keeps
        # object columns with mixed value types sortable.
        print(f"  Unique value list: {sorted(column.dropna().unique(), key=str)}")
        print("  Value counts:")
        value_counts = column.value_counts()
        for val, count in value_counts.items():
            # Share of all rows, so shares need not sum to 100% if NaN exist
            pct = (count / len(df)) * 100
            print(f"    '{val}': {count:,} ({pct:.2f}%)")
else:
    print("No categorical/string variables found!")


## 7. Relationship Between Features and Target Variable

Let's examine how different features relate to `damage_grade`.


In [None]:
# Correlation between numeric features and damage_grade.
# The target must itself be numeric: checking only that the column exists is
# not enough, because a non-numeric damage_grade cannot be correlated and the
# corr()['damage_grade'] lookup below would fail.
target_is_numeric = 'damage_grade' in df.columns and 'damage_grade' in numeric_cols

# Exclude damage_grade from numeric_cols so it is not correlated with itself
feature_numeric_cols = [col for col in numeric_cols if col != 'damage_grade']

if target_is_numeric and feature_numeric_cols:
    # Take the target's column of the correlation matrix, strongest positive
    # correlation first, and remove the target's self-correlation entry.
    correlations = df[feature_numeric_cols + ['damage_grade']].corr()['damage_grade'].sort_values(ascending=False)
    correlations = correlations.drop('damage_grade')

    print("Correlation with damage_grade (Numeric Features):")
    print("=" * 80)
    corr_df = pd.DataFrame({
        'Feature': correlations.index,
        'Correlation': correlations.values
    })
    print(corr_df.to_string(index=False))

    # Visualize correlations; figure height scales with the number of features
    plt.figure(figsize=(10, max(6, len(correlations) * 0.4)))
    plt.barh(corr_df['Feature'], corr_df['Correlation'], color='coral', alpha=0.7)
    plt.xlabel('Correlation with damage_grade')
    plt.title('Feature Correlations with Target Variable (damage_grade)')
    # Dashed reference line at zero separates positive from negative correlation
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()
else:
    # Say why nothing was produced instead of leaving the cell output empty
    print("Skipping correlation analysis: 'damage_grade' must be a numeric "
          "column and at least one other numeric feature is required.")


In [None]:
# Cross-tabulate categorical variables against damage_grade: raw counts with
# margins, then row-normalised percentages.
if 'damage_grade' in df.columns and string_cols:
    print("\nCategorical Variables vs damage_grade:")
    print("=" * 100)

    target = df['damage_grade']

    # Only the first 10 categorical columns, to keep the output manageable
    for col in string_cols[:10]:
        feature = df[col]

        print(f"\n{col} vs damage_grade:")
        print("-" * 80)
        crosstab = pd.crosstab(feature, target, margins=True)
        print(crosstab)

        # Each row sums to 100: distribution of grades within a category
        print(f"\n{col} vs damage_grade (Percentages):")
        crosstab_pct = pd.crosstab(feature, target, normalize='index') * 100
        print(crosstab_pct.round(2))


## 8. Key Insights Summary


In [None]:
# Create a comprehensive summary of the findings from the cells above:
# dataset size, variable types, missing values, target distribution and
# overall data quality.
print("=" * 100)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("=" * 100)

print("\n1. Dataset Overview:")
print(f"   - Total rows: {df.shape[0]:,}")
print(f"   - Total columns: {df.shape[1]}")
print(f"   - Numeric variables: {data_types_summary['Is_Numeric'].sum()}")
print(f"   - String/Categorical variables: {data_types_summary['Is_String'].sum()}")

print("\n2. Missing Values:")
print(f"   - Columns with missing values: {(missing_data['NA_Count'] > 0).sum()}")
print(f"   - Total missing values: {missing_data['NA_Count'].sum():,}")

if 'damage_grade' in df.columns:
    target = df['damage_grade']
    print("\n3. Target Variable (damage_grade):")
    print(f"   - Data type: {target.dtype}")
    # Drop NaN before sorting: unique() keeps it, and sorting NaN together
    # with other values either raises TypeError (strings) or gives an
    # unreliable order (numbers). Missing values are reported on the next line.
    print(f"   - Unique values: {sorted(target.dropna().unique())}")
    print(f"   - Missing values: {target.isna().sum()}")
    print("   - Distribution:")
    for grade, count in target.value_counts().sort_index().items():
        pct = (count / len(df)) * 100
        print(f"     Grade {grade}: {count:,} ({pct:.2f}%)")

# Rows without any missing value. Computed once (dropna copies the frame) and
# guarded against an empty dataset, where the plain integer division would
# raise ZeroDivisionError; an empty dataset is reported as 0.00%.
total_rows = df.shape[0]
complete_rows = df.dropna().shape[0]
complete_pct = complete_rows / total_rows * 100 if total_rows else 0.0

print("\n4. Data Quality:")
print(f"   - No missing values: {(missing_data['NA_Count'] == 0).sum()} columns")
print(f"   - Complete cases: {complete_rows:,} rows ({complete_pct:.2f}%)")

print("\n" + "=" * 100)
