In [None]:
import os
import re # Import regular expressions for parsing
import time

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

## Load Data

In [None]:

# --- 1. Load All Data ---
# Read both training files and the unlabeled test file. Any failure is
# reported together with the expected file names and then re-raised.
try:
    df_train_small_full = pd.read_csv("small_train_data.csv")  # 100k rows
    df_train_large_full = pd.read_csv("large_train_data.csv")  # 500k rows
    df_test_full = pd.read_csv("test_without_response_variable.csv")  # 60k rows
except Exception as e:
    for message in (
        "Error loading files. Make sure all 3 files are uploaded:",
        "small_train_data.csv, large_train_data.csv, test_without_response_variable.csv",
        e,
    ):
        print(message)
    raise

# --- 2. Combine and Sample Training Data ---
# Stack the two training files into one frame with a fresh 0..n-1 index.
df_train_combined = pd.concat(
    [df_train_small_full, df_train_large_full],
    ignore_index=True,
)
print(f"Total combined training rows: {len(df_train_combined)}")

# Work on a reproducible 40% random sample of the combined rows.
SAMPLE_FRACTION = 0.4
df_train = (
    df_train_combined
    .sample(frac=SAMPLE_FRACTION, random_state=42)
    .reset_index(drop=True)
)

# --- 3. Create & Impute Target Variable (log_price) ---
TARGET_VARIABLE = 'log_price'
# log(0) yields -inf and log(negative) yields NaN; the NumPy warnings are
# silenced here because those entries are overwritten just below.
with np.errstate(divide='ignore', invalid='ignore'):
    df_train[TARGET_VARIABLE] = np.log(df_train['price'])

# Rows whose log_price is a finite number (not NaN / inf / -inf).
finite_mask = np.isfinite(df_train[TARGET_VARIABLE])
valid_log_prices = df_train.loc[finite_mask, TARGET_VARIABLE]

# Mean over the valid rows only.
log_price_mean = valid_log_prices.mean()

# Keep finite targets as they are; every other entry becomes the mean.
df_train[TARGET_VARIABLE] = df_train[TARGET_VARIABLE].where(finite_mask, log_price_mean)


# --- 4. Data Setup ---
y_train = df_train[TARGET_VARIABLE]
X_train = df_train.drop(columns=['price', TARGET_VARIABLE], errors='ignore')

# Use a reproducible 1/10th sample of the test set in this part of the
# notebook. The sampled frame is kept as df_test_small so results can be
# checked against it later. (To use the full test set here instead, assign
# X_test = df_test_full.)
df_test_small = (
    df_test_full
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)
X_test = df_test_small


print(f"Using {len(X_train)} sampled training observations.")
print(f"Using {len(X_test)} test observations.")

Total combined training rows: 600000
Using 240000 sampled training observations.
Using 6000 test observations.


## Preprocessing

In [None]:
# --- 1. Helper Function to Parse Strings ---
def parse_string_to_num(series):
    """Extract the first decimal number found in each value of ``series``.

    Every value is converted to ``str`` and searched for the first token
    shaped like ``123``, ``123.45`` or ``.45``. Text before or after the
    number (units, labels) is ignored.

    Parameters
    ----------
    series : pandas.Series
        Values to parse, e.g. ``"35.1 in"`` or ``"305 hp @ 5,800 RPM"``.

    Returns
    -------
    pandas.Series
        Numeric series aligned with ``series``; entries with no digits
        (including missing values) are NaN.

    Notes
    -----
    The previous pattern ``[\\d\\.]+`` also matched a lone period, so a value
    such as ``"approx. 35 in"`` extracted ``"."`` and ended up as NaN. The
    pattern below requires at least one digit in the match.
    Thousands separators are not interpreted: ``"5,800"`` parses as ``5``.
    """
    return pd.to_numeric(
        series.astype(str).str.extract(r'(\d+(?:\.\d+)?|\.\d+)', expand=False),
        errors='coerce'
    )

# --- 2. Main Advanced Preprocessing Function ---
def advanced_preprocess_features(df_in, current_year=2024):
    """Return a copy of ``df_in`` with engineered numeric feature columns.

    Added columns:

    * ``<col>_num`` for ``power``, ``torque``, ``back_legroom``,
      ``front_legroom``, ``fuel_tank_volume``, ``height``, ``length``,
      ``width`` and ``wheelbase``: first number found in the text, via
      ``parse_string_to_num``.
    * ``engine_cylinders_num`` / ``maximum_seating_num``: the digits of the
      text value with every non-digit character removed.
    * ``major_options_count``: number of single quotes in ``major_options``
      floor-divided by two (0 where the value is missing).
    * ``age``: ``current_year - year``.
    * ``mileage_per_year``: ``mileage / (age + 1)``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing the source columns named above plus ``year`` and
        ``mileage``. It is not modified.
    current_year : int, default 2024
        Reference year used to compute ``age``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` with the new columns appended.
    """
    df = df_in.copy()

    # Free-text measurements such as "35.1 in": keep the leading number.
    for col in (
        'power', 'torque', 'back_legroom', 'front_legroom',
        'fuel_tank_volume', 'height', 'length', 'width', 'wheelbase',
    ):
        df[f'{col}_num'] = parse_string_to_num(df[col])

    # Strip everything that is not a digit. regex=True must be explicit:
    # in pandas >= 2.0 str.replace treats the pattern as a literal string by
    # default, so r'\D' would match nothing and every value would coerce to
    # NaN.
    for col in ('engine_cylinders', 'maximum_seating'):
        df[f'{col}_num'] = pd.to_numeric(
            df[col].astype(str).str.replace(r'\D', '', regex=True),
            errors='coerce'
        )

    # Each quoted option contributes two quote characters.
    df['major_options_count'] = df['major_options'].str.count("'").fillna(0) // 2

    df['age'] = current_year - df['year']
    # Clip the denominator at 1 so a model year at or beyond current_year + 1
    # cannot produce a division by zero (inf) or a negative rate.
    df['mileage_per_year'] = df['mileage'] / (df['age'] + 1).clip(lower=1)
    return df

# --- 3. Apply Advanced Preprocessing ---
# advanced_preprocess_features returns a new frame, so the inputs are not
# mutated by these calls.
print("Starting advanced preprocessing...")
X_train = advanced_preprocess_features(X_train)
X_test = advanced_preprocess_features(X_test)
print("Advanced preprocessing finished.")

# --- 4. Define ALL Features for the Model ---
# Numeric columns that already exist in the raw data.
NUMERICAL_FEATURES_ORIG = [
    'year', 'mileage', 'horsepower', 'city_fuel_economy',
    'highway_fuel_economy', 'engine_displacement'
]
# Numeric columns created by advanced_preprocess_features.
NUMERICAL_FEATURES_NEW = [
    'power_num', 'torque_num', 'back_legroom_num', 'front_legroom_num',
    'fuel_tank_volume_num', 'height_num', 'length_num', 'width_num',
    'wheelbase_num', 'engine_cylinders_num', 'maximum_seating_num',
    'major_options_count', 'age', 'mileage_per_year'
]
# Columns that are one-hot encoded.
CATEGORICAL_FEATURES = [
    'make_name', 'body_type', 'fuel_type', 'transmission_display'
]
all_numerical_features = NUMERICAL_FEATURES_ORIG + NUMERICAL_FEATURES_NEW

# =======================================================================
# --- 5. Imputation (Fill NaNs) ---
# Numeric NaNs are filled with the TRAINING mean in both train and test so
# no test-set statistic is used. A column that is entirely NaN in training
# has no mean and falls back to 0.
print("Starting robust imputation...")
for col in all_numerical_features:
    mean_val = X_train[col].mean()

    if pd.isna(mean_val):
        mean_val = 0

    X_train[col] = X_train[col].fillna(mean_val)
    X_test[col] = X_test[col].fillna(mean_val)

# Missing categories become their own explicit level.
for col in CATEGORICAL_FEATURES:
    X_train[col] = X_train[col].fillna('Missing')
    X_test[col] = X_test[col].fillna('Missing')
print("Imputation finished.")
# =======================================================================

# --- 6. Preprocessing Pipeline (Scaling & OHE) ---
# Standardise numeric columns, one-hot encode categoricals (categories unseen
# during fit encode as all zeros), and drop every other column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), all_numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
    ],
    remainder='drop'
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ColumnTransformer returns a SciPy sparse matrix only when the combined
# output is sparse enough (its sparse_threshold); otherwise it already returns
# a dense ndarray, which has no .toarray(). Densify only when needed.
if hasattr(X_train_processed, "toarray"):
    X_train_processed = X_train_processed.toarray()
if hasattr(X_test_processed, "toarray"):
    X_test_processed = X_test_processed.toarray()
INPUT_DIM = X_train_processed.shape[1]

print("\n--- Preprocessing Complete ---")
print(f"Total input features for the NN (after parsing and OHE): {INPUT_DIM}")

Starting advanced preprocessing...
Advanced preprocessing finished.
Starting robust imputation...
Imputation finished.

--- Preprocessing Complete ---
Total input features for the NN (after parsing and OHE): 122


## NN Hyperparameter Tuning Loop

In [None]:
# --- 1. Define Model & Training Parameters ---
nodes_to_test = [64, 128, 256, 512] # The list of layer sizes to try
LEARNING_RATE = 0.001
BATCH_SIZE = 32
EPOCHS = 100 # Upper bound; EarlyStopping normally ends training sooner
tuning_results = {}

# --- 2. Split Data (Once) ---
# One fixed 80/20 split is shared by every candidate so their validation
# scores are comparable.
X_train_fit, X_val_fit, y_train_fit, y_val_fit = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=42
)

# --- 3. Tuning Loop ---
print(f"\n--- Starting NN Hyperparameter Tuning Loop ---")
print(f"Training on {len(X_train_fit)} samples, Validating on {len(X_val_fit)} samples.")

for nodes in nodes_to_test:
    print(f"\n--- Testing HIDDEN_NODES = {nodes} ---")
    start_time = time.time()

    # --- Build the Model ---
    # An explicit Input layer replaces Dense(..., input_shape=...), which
    # current Keras versions warn about inside Sequential models.
    model = Sequential([
        Input(shape=(INPUT_DIM,)),
        Dense(nodes, activation='relu'),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='mse')

    # --- Train the Model ---
    # Stop after 10 epochs without val_loss improvement and roll back to the
    # best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train_fit, y_train_fit,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val_fit, y_val_fit),
        callbacks=[early_stop],
        verbose=0 # Set to 0 to keep the log clean
    )

    end_time = time.time()

    # --- Get Validation MSE and R-squared ---
    # Both metrics come from the same predictions of the weights the model
    # holds now, so they always describe the same model. min(val_loss) can
    # refer to an epoch whose weights are no longer loaded.
    y_pred_validation = model.predict(X_val_fit, verbose=0).flatten()
    best_mse = float(np.mean((np.asarray(y_val_fit) - y_pred_validation) ** 2))
    r2_val = r2_score(y_val_fit, y_pred_validation)

    # Store results
    tuning_results[nodes] = {'mse': best_mse, 'r2': r2_val, 'time': end_time - start_time}
    print(f"Finished in {end_time - start_time:.2f}s. Best Validation MSE: {best_mse:.6f}, Validation R2: {r2_val:.6f}")

# --- 4. Report Final Results ---
print("\n--- Tuning Complete: Results ---")
print("Nodes | Best Val MSE (Lower is Better) | Val R-Squared (Higher is Better)")
print("------|--------------------------------|---------------------------------")

# Track the candidate with the lowest validation MSE while printing the table.
best_nodes = 0
best_mse = float('inf')

for nodes, metrics in tuning_results.items():
    print(f"{nodes:<5} | {metrics['mse']:<30.6f} | {metrics['r2']:<30.6f}")
    if metrics['mse'] < best_mse:
        best_mse = metrics['mse']
        best_nodes = nodes

print("\n--- Recommendation ---")
print(f"The best performing model used {best_nodes} nodes in the hidden layer.")
print(f"It achieved a Validation MSE of: {best_mse:.6f}")


--- Starting NN Hyperparameter Tuning Loop ---
Training on 144000 samples, Validating on 36000 samples.

--- Testing HIDDEN_NODES = 64 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Finished in 473.37s. Best Validation MSE: 0.011442, Validation R2: 0.925758

--- Testing HIDDEN_NODES = 128 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Finished in 377.54s. Best Validation MSE: 0.011143, Validation R2: 0.927696

--- Testing HIDDEN_NODES = 256 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Finished in 525.05s. Best Validation MSE: 0.010832, Validation R2: 0.929716

--- Testing HIDDEN_NODES = 512 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Finished in 332.62s. Best Validation MSE: 0.011939, Validation R2: 0.922530

--- Tuning Complete: Results ---
Nodes | Best Val MSE (Lower is Better) | Val R-Squared (Higher is Better)
------|--------------------------------|---------------------------------
64    | 0.011442                       | 0.925758                      
128   | 0.011143                       | 0.927696                      
256   | 0.010832                       | 0.929716                      
512   | 0.011939                       | 0.922530                      

--- Recommendation ---
The best performing model used 256 nodes in the hidden layer.
It achieved a Validation MSE of: 0.010832


## Train the Final Winning Model

In [None]:
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau

# --- 1. Define Winning Hyperparameters ---
BEST_NODES = 256               # The winner from tuning
LEARNING_RATE = 0.001
BATCH_SIZE = 32
EPOCHS = 150                   # Upper bound; EarlyStopping usually stops earlier
L2_DECAY = 0.0001              # L2 penalty factor on the hidden layer's kernel
DROPOUT_RATE = 0.1

# --- 2. Split Data (for training the final model) ---
# Same arguments as the tuning split, so the validation rows are identical.
X_train_fit, X_val_fit, y_train_fit, y_val_fit = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=42
)

# --- 3. Build the Final Model ---
print(f"\n--- Building the Final Winning Model (Nodes = {BEST_NODES}) ---")
# An explicit Input layer replaces Dense(..., input_shape=...), which current
# Keras versions warn about inside Sequential models.
final_model = Sequential([
    Input(shape=(INPUT_DIM,)),
    Dense(
        BEST_NODES,
        activation='relu',
        kernel_regularizer=l2(L2_DECAY)
    ),
    BatchNormalization(),
    Dropout(DROPOUT_RATE),
    Dense(1, activation='linear')
])

# --- 4. Compile the Final Model ---
final_model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='mse')

# --- 5. Define Callbacks ---
# Multiply the learning rate by 0.2 after 5 stagnant epochs (floor 1e-5);
# stop after 15 stagnant epochs and restore the best weights.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# --- 6. Train the Final Model ---
print(f"Training final model on {len(X_train_fit)} samples...")
start_time = time.time()
history = final_model.fit(
    X_train_fit, y_train_fit,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_fit, y_val_fit),
    callbacks=[early_stop, lr_scheduler],
    verbose=1 # Set to 1 to watch it train one last time
)
end_time = time.time()

# val_loss is the MSE plus the L2 regularisation penalty, so it overstates the
# error. Compute the plain MSE from the predictions of the weights the model
# holds now.
val_predictions = final_model.predict(X_val_fit, verbose=0).flatten()
best_mse = float(np.mean((np.asarray(y_val_fit) - val_predictions) ** 2))
print(f"\n--- Final Model Trained in {end_time - start_time:.2f}s ---")
print(f"Final Model Best Validation MSE: {best_mse:.6f}")
print("The 'final_model' variable is now ready for prediction.")


--- Building the Final Winning Model (Nodes = 256) ---
Training final model on 192000 samples...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - loss: 8.4899 - val_loss: 0.0375 - learning_rate: 0.0010
Epoch 2/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.1392 - val_loss: 0.0348 - learning_rate: 0.0010
Epoch 3/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - loss: 0.0783 - val_loss: 0.0202 - learning_rate: 0.0010
Epoch 4/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.0645 - val_loss: 0.0203 - learning_rate: 0.0010
Epoch 5/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - loss: 0.0598 - val_loss: 0.0220 - learning_rate: 0.0010
Epoch 6/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.0600 - val_loss: 0.0223 - learning_rate: 0.0010
Epoch 7/150
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - loss: 0.0558 - val_l

## Create the Final Submission CSV

In [None]:
# --- 1. Load the FULL Test Set ---
X_test_full = df_test_full

# --- 2. Apply Advanced Preprocessing to FULL Test Set ---
# Same feature engineering as the training data. The function works on a
# copy, so df_test_full itself is left unchanged.
print("Applying advanced preprocessing to full test set...")
X_test_full = advanced_preprocess_features(X_test_full)

# --- 3. Apply Imputation to FULL Test Set ---
# Fill numeric gaps with the TRAINING column mean (0 if that mean is NaN).
print("Applying imputation to full test set...")
for col in all_numerical_features:
    mean_val = X_train[col].mean() # Use mean from the TRAINING set
    if pd.isna(mean_val):
        mean_val = 0
    X_test_full[col] = X_test_full[col].fillna(mean_val)

for col in CATEGORICAL_FEATURES:
    X_test_full[col] = X_test_full[col].fillna('Missing')

# --- 4. Apply Preprocessor to FULL Test Set ---
# Transform only: the preprocessor was fitted on the training data.
print("Applying fitted preprocessor to full test set...")
X_test_full_processed = preprocessor.transform(X_test_full)
# The transformer may return either a sparse matrix or a dense ndarray;
# densify only when a .toarray() method exists.
if hasattr(X_test_full_processed, "toarray"):
    X_test_full_processed = X_test_full_processed.toarray()

print(f"Test set shape after processing: {X_test_full_processed.shape}")

# --- 5. Generate Final Predictions ---
print("Generating final predictions on full test set...")
final_log_price_predictions = final_model.predict(X_test_full_processed).flatten()
print(f"Generated {len(final_log_price_predictions)} predictions.")

# --- 6. Create the Final Submission CSV ---
# File layout: name, student id, hidden-node count, then one prediction per line.
YOUR_ANONYMIZED_NAME = "my_anonymized_name"
YOUR_STUDENT_ID = "my_student_id"
# ---------------------------------------------
nodes_line = str(BEST_NODES) # This is 256

submission_file = "submission_q1.csv"
try:
    with open(submission_file, 'w', encoding='utf-8') as f:
        f.write(f"{YOUR_ANONYMIZED_NAME}\n")
        f.write(f"{YOUR_STUDENT_ID}\n")
        f.write(f"{nodes_line}\n")
        f.writelines(f"{pred}\n" for pred in final_log_price_predictions)

    print(f"Successfully created '{submission_file}'.")

    # Print a summary to double-check
    print("\n--- Submission File Check ---")
    print(f"Line 1 (Name): {YOUR_ANONYMIZED_NAME}")
    print(f"Line 2 (ID): {YOUR_STUDENT_ID}")
    print(f"Line 3 (Nodes): {nodes_line}")
    print(f"Line 4 (First Pred): {final_log_price_predictions[0]}")
    print(f"Total Predictions: {len(final_log_price_predictions)}")

except Exception as e:
    print(f"\nAn error occurred while writing the file: {e}")
    # Re-raise so a missing or partial submission file cannot go unnoticed
    # (the data-loading cell handles errors the same way).
    raise

Applying advanced preprocessing to full test set...
Applying imputation to full test set...
Applying fitted preprocessor to full test set...
Test set shape after processing: (60000, 122)
Generating final predictions on full test set...
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Generated 60000 predictions.
Successfully created 'submission.csv'.

--- Submission File Check ---
Line 1 (Name): my_anonymized_name
Line 2 (ID): my_student_id
Line 3 (Nodes): 256
Line 4 (First Pred): 9.701264381408691
Total Predictions: 60000


## Validation set's $R^2$ and MSE

In [None]:

print("\n--- Final Model Performance on Validation Set ---")

# 1. Get predictions for the validation set using the final_model
y_pred_validation = final_model.predict(X_val_fit).flatten()

# 2. Get the true values (the log_price)
y_true_validation = y_val_fit

# 3. Calculate R-squared
r2_val = r2_score(y_true_validation, y_pred_validation)

# 4. Calculate the MSE from the same predictions used for R-squared.
# history.history['val_loss'][-1] is not used: it is the loss of the LAST
# epoch (not the restored best weights) and, for this model, it includes the
# L2 regularisation penalty in addition to the MSE.
final_mse = float(np.mean((np.asarray(y_true_validation) - y_pred_validation) ** 2))

print(f"Final Model Validation MSE: {final_mse:.6f}")
print(f"Final Model Validation R-Squared (R2): {r2_val:.6f}")


--- Final Model Performance on Validation Set ---
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Final Model Validation MSE: 0.011930
Final Model Validation R-Squared (R2): 0.927570
