In [1]:
import os

# Resolve the directory this code lives in.
# When the notebook has been exported and is run as a plain script, `__file__`
# is defined and points at that script. Inside a Jupyter kernel `__file__` is
# normally not defined (NameError), so fall back to the kernel's current
# working directory.
# The earlier version passed the *string* "__file__" to abspath, which always
# resolved to <cwd>/__file__ and therefore made the chdir below a no-op.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()

# Change working directory so relative paths resolve against notebook_dir
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [2]:
# -*- coding: utf-8 -*-
"""
Full preprocessing pipeline for train, test, and eval datasets
"""

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# -----------------------------
# Read datasets
# -----------------------------
# Load the three raw CSV files, in this order: train, test, evaluation.
train, test, evaluation = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/raw/train.csv',
        'data/raw/test.csv',
        'data/raw/nepal_evaluation.csv',
    )
)

# -----------------------------
# Separate features and target
# -----------------------------
# Features: every column except 'damage_grade' (drop returns a new frame, the
# raw frame keeps the column). Target: 'damage_grade' shifted down by one.
X_train, y_train = train.drop(columns=['damage_grade']), train['damage_grade'] - 1
X_test, y_test = test.drop(columns=['damage_grade']), test['damage_grade'] - 1

# -----------------------------
# Define feature groups
# -----------------------------
# Geographic id columns handled as their own group.
geo_features = [
    'geo_level_2_id',
    'geo_level_3_id',
]

# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
    'geo_level_1_id',
]

# Columns forwarded to the output unchanged. Order matters: it determines the
# column order of the transformed output.
numeric_features = [
    # count / age / percentage columns
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    # has_superstructure_* columns
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'count_families',
    # has_secondary_use and has_secondary_use_* columns
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    # NOTE(review): the doubled "use_use" is kept exactly as written here;
    # presumably it matches the CSV header - confirm before renaming.
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

# -----------------------------
# Convert geo_level to string
# -----------------------------
# Cast the geo_features columns to str on each frame; the converted columns are
# written back onto the same frame object, so all three frames are updated.
for df in (X_train, X_test, evaluation):
    df[geo_features] = df.loc[:, geo_features].astype(str)

# -----------------------------
# Preprocessor pipeline
# -----------------------------
# Column-wise preprocessing:
#   'cat' - one-hot encode categorical_features; with handle_unknown='ignore'
#           a category not seen during fit does not raise at transform time.
#   'num' - numeric_features forwarded unchanged.
#   'geo' - geo_features forwarded unchanged.
# Columns not named in any transformer are not forwarded (no `remainder` is
# set, so ColumnTransformer's default applies).
#
# sparse_threshold=0 forces a dense array as output. OneHotEncoder produces a
# sparse matrix, and with the default threshold ColumnTransformer may return a
# SciPy sparse matrix for the whole result depending on the data's density;
# the code below wraps the result with pd.DataFrame(..., columns=...), which
# needs a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
        ('geo', 'passthrough', geo_features),
    ],
    sparse_threshold=0,
)

# Single-step pipeline wrapping the preprocessor; the step name 'preprocessor'
# is used later to look the transformer up via named_steps.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# -----------------------------
# Fit on train, then transform test and evaluation
# -----------------------------
# Fit the preprocessing on the training features only, then apply the fitted
# transformation to the test and evaluation features.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)
X_eval_transformed = pipeline.transform(evaluation)

# -----------------------------
# Convert to DataFrames
# -----------------------------
# Output column names as reported by the fitted ColumnTransformer.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

X_train_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)
X_eval_df = pd.DataFrame(X_eval_transformed, columns=feature_names)

# Add building_id back to evaluation
X_eval_df['building_id'] = evaluation['building_id'].values

# Add target back to train and test.
# The new frames carry a fresh 0..n-1 index, so attach the targets by position
# (.values) rather than as a Series: assigning a Series aligns on index labels
# and would silently produce NaN / shifted rows if the source index ever
# differed. This also matches how building_id is attached above.
X_train_df['damage_grade'] = y_train.values
X_test_df['damage_grade'] = y_test.values

# -----------------------------
# Save ready CSVs
# -----------------------------
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no error if it is already there).
os.makedirs('data/cleaned', exist_ok=True)

# Write the three prepared frames without the row index.
X_train_df.to_csv('data/cleaned/train_cleaned.csv', index=False)
X_test_df.to_csv('data/cleaned/test_cleaned.csv', index=False)
X_eval_df.to_csv('data/cleaned/evaluation_cleaned.csv', index=False)