# 2. Data Preprocessing

In this section, I process the raw data to make it suitable for machine learning. This involves converting text categories into numbers (encoding) and scaling the features.

## 2.1 Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import os

# Read the raw training and test CSV files into DataFrames.
train_df = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/raw/test.csv')

# Quick sanity check: report (rows, columns) for both frames.
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

Training data shape: (100000, 19)
Test data shape: (100000, 18)


## 2.2 Boolean Encoding

Convert Yes/No strings to 1/0 for the model.

In [2]:
# Columns to be converted from 'Yes'/'No' strings to 1/0.
bool_cols = [
    'gas',
    'hot_water',
    'central_heating',
]

def encode_boolean(df, columns):
    """Convert Yes/No strings to binary 1/0 values.

    Returns a new DataFrame; the input frame is not modified. Values that
    are already 0 or 1 pass through unchanged, so calling this on an
    already-encoded frame (e.g. when the notebook cell is re-run) is a no-op
    instead of silently turning the whole column into NaN. Missing values
    stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns encoded.

    Raises
    ------
    ValueError
        If a listed column holds a non-missing value other than
        'Yes', 'No', 1 or 0.
    """
    # 1 and 0 are mapped to themselves so that encoding twice is harmless.
    mapping = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
    df = df.copy()
    for col in columns:
        encoded = df[col].map(mapping)
        # Series.map yields NaN for values absent from the mapping; tell those
        # apart from values that were already missing in the input.
        unmapped = encoded.isna() & df[col].notna()
        if unmapped.any():
            unexpected = df.loc[unmapped, col].unique().tolist()
            raise ValueError(
                f"Column '{col}' contains values that are not Yes/No: {unexpected}"
            )
        df[col] = encoded
    return df

# Run the Yes/No -> 1/0 conversion on the training and test frames alike.
train_df = encode_boolean(df=train_df, columns=bool_cols)
test_df = encode_boolean(df=test_df, columns=bool_cols)

# Show the first rows of the converted training columns as a spot check.
print("Boolean encoding applied.", "Sample values after encoding:", sep="\n")
print(train_df.head()[bool_cols])

Boolean encoding applied.
Sample values after encoding:
   gas  hot_water  central_heating
0    0          0                1
1    1          0                1
2    1          1                1
3    0          1                1
4    0          1                1


## 2.3 Label Encoding

Convert categorical text features to numbers.

In [3]:
cat_cols = ['district_name', 'extra_area_type_name']
label_encoders = {}

# Guard against re-running this cell on frames that are already encoded.
# Fitting a LabelEncoder on the integer codes would succeed silently, but its
# classes_ would then hold the codes instead of the original category names,
# so the encoders stored in label_encoders could no longer translate raw text.
already_encoded = [
    col for col in cat_cols
    if pd.api.types.is_numeric_dtype(train_df[col])
    or pd.api.types.is_numeric_dtype(test_df[col])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; reload the raw data "
        "before running the label-encoding cell again."
    )

for col in cat_cols:
    le = LabelEncoder()
    # Fit on train and test together so both frames share one vocabulary and
    # a category that appears only in the test file still gets a code.
    combined = pd.concat([train_df[col], test_df[col]])
    le.fit(combined)

    # Replace the text column with its integer codes in both frames.
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    # Keep the fitted encoder, keyed by column name.
    label_encoders[col] = le

print("Label encoding applied.")
print("\nCategories per feature:")
for col, le in label_encoders.items():
    print(f"  {col}: {list(le.classes_)}")

Label encoding applied.

Categories per feature:
  district_name: ['Centralnyj', 'Kirovskij', 'Krasnoselskij', 'Moskovskij', 'Nevskij', 'Petrogradskij', 'Vyborgskij']
  extra_area_type_name: ['balcony', 'loggia']


## 2.4 Separate Features and Target

In [4]:
# Define feature columns (exclude index and price)
feature_cols = [col for col in train_df.columns if col not in ['index', 'price']]

print(f"Number of features: {len(feature_cols)}")
print(f"\nFeatures: {feature_cols}")

# Separate features and target
X = train_df[feature_cols]
y = train_df['price']

# Prepare test data
X_test_final = test_df[feature_cols]
test_index = test_df['index']

print(f"\nTraining features shape: {X.shape}")
print(f"Test features shape: {X_test_final.shape}")

Number of features: 17

Features: ['kitchen_area', 'bath_area', 'other_area', 'gas', 'hot_water', 'central_heating', 'extra_area', 'extra_area_count', 'year', 'ceil_height', 'floor_max', 'floor', 'total_area', 'bath_count', 'extra_area_type_name', 'district_name', 'rooms_count']

Training features shape: (100000, 17)
Test features shape: (100000, 17)


## 2.5 Train/Validation Split

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print(
    f"Training set:   {len(X_train):,} samples",
    f"Validation set: {len(X_val):,} samples",
    sep="\n",
)

Training set:   80,000 samples
Validation set: 20,000 samples


## 2.6 Feature Scaling

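StandardScaler standardizes each feature to zero mean and unit variance. The scaler is fitted on the training split only and then applied to the validation and test sets, so no information from those sets leaks into the scaling statistics. This matters for neural networks, whose gradient-based training is sensitive to the scale of the input features.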

In [6]:
# The scaler learns its statistics from the training split alone; the
# validation and test sets are transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    scaler.transform(X_val),
    scaler.transform(X_test_final),
)

# Overall mean/std across all scaled values, as a quick check of the result.
print(
    "Scaling applied.",
    f"\nTraining data - Mean: {np.mean(X_train_scaled):.6f}, Std: {np.std(X_train_scaled):.6f}",
    f"Validation data - Mean: {np.mean(X_val_scaled):.6f}, Std: {np.std(X_val_scaled):.6f}",
    sep="\n",
)

Scaling applied.

Training data - Mean: -0.000000, Std: 1.000000
Validation data - Mean: -0.000146, Std: 0.998999


## 2.7 Save Preprocessed Data

In [7]:
# Make sure both output folders exist before writing anything.
for output_dir in ('../data/processed', '../models'):
    os.makedirs(output_dir, exist_ok=True)

# Destination path -> object to persist, listed in the order they are written.
artifacts = {
    # Unscaled train/validation splits
    '../data/processed/X_train.pkl': X_train,
    '../data/processed/X_val.pkl': X_val,
    '../data/processed/y_train.pkl': y_train,
    '../data/processed/y_val.pkl': y_val,
    # Scaled splits (for neural network)
    '../data/processed/X_train_scaled.pkl': X_train_scaled,
    '../data/processed/X_val_scaled.pkl': X_val_scaled,
    # Test data, raw and scaled, plus its 'index' column
    '../data/processed/X_test.pkl': X_test_final,
    '../data/processed/X_test_scaled.pkl': X_test_scaled,
    '../data/processed/test_index.pkl': test_index,
    # Feature column names
    '../data/processed/feature_cols.pkl': feature_cols,
    # Fitted transformers
    '../models/scaler.pkl': scaler,
    '../models/label_encoders.pkl': label_encoders,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print(
    "All data and transformers saved successfully.",
    "\nSaved files:",
    "  Data:        ../data/processed/*.pkl",
    "  Transformers: ../models/scaler.pkl, label_encoders.pkl",
    sep="\n",
)

All data and transformers saved successfully.

Saved files:
  Data:        ../data/processed/*.pkl
  Transformers: ../models/scaler.pkl, label_encoders.pkl


## 2.8 Conclusion

The data is now fully numeric and ready for training. I saved the processed datasets together with the fitted scaler and label encoders so the API can apply the same transformations later.

**Final Status:**
- Training Set: 80,000 samples
- Validation Set: 20,000 samples
- Features: 17 columns (Scaled for Neural Network, Unscaled for XGBoost)