# Project

### Import & load Data

In [0]:
pip install kaggle

In [0]:
import os

# Folder containing kaggle.json
os.environ['KAGGLE_CONFIG_DIR'] = "/Workspace/Users/20250355@novaims.unl.pt"

# Optional: test
!echo $KAGGLE_CONFIG_DIR

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.feature_selection import VarianceThreshold, RFE, chi2
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer


from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import Ridge, Lasso, LassoCV, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.svm import SVR

from data_cleaning import clean_car_dataframe

# Read the training and test sets; both CSV paths are relative, so the files
# must be in the notebook's working directory.
df_cars_train = pd.read_csv(filepath_or_buffer="train.csv")
df_cars_test = pd.read_csv(filepath_or_buffer="test.csv")

### Exploratory Data Analysis (EDA)

TASK I (3 Points): Descriptive Statistics, Inconsistency Check, Visual Data Exploration, Extraction of Relevant Insights, Multivariate Relationships  => Explain

In [0]:
# Summary statistics over all columns (numeric and non-numeric), transposed so
# that every original column becomes one row of the summary table.
df_cars_train.describe(include="all").transpose()

# Findings (read off the summary above):
# - carID: no duplicates
# - year: 1970 to 2024
# - mileage: -58.000 to 323.000 (negative values are implausible)
# - tax: -91 to 580 (negative values are implausible)
# - mpg: -43 to 470 (negative values are implausible)
# - engineSize: -0.1 to 6.6 (negative values are implausible)
# - paintQuality%: mostly 70-100, a few outliers such as 1.6 or 125
# - previousOwners: -2.3 to 6.2 (negative / fractional values are implausible)
# - hasDamage: only 0 or NaN; unclear whether NaN means "damaged" (convert to Int)
 


### Data Cleaning, Feature Engineering, Split & Preprocessing

Task II (5 Points): Clean and preprocess the dataset. 
- Missing Value handling, Outlier preprocessing + justify decisions -> in data_cleaning.py
- Review current features and create extra features if needed + explain
- Deal with categorical variables -> One-Hot-Encoding 
- Perform data scaling, explain reasoning

In [0]:
# Outlier preprocessing is delegated to clean_car_dataframe (data_cleaning.py).
# NOTE: both names are rebound to the cleaned frames, so re-running this cell
# feeds already-cleaned data through the cleaning function again.
df_cars_train = clean_car_dataframe(df_cars_train)
df_cars_test = clean_car_dataframe(df_cars_test)


# Safety check: dump the distinct values of every column, first for the
# training frame and then for the test frame, to spot values the cleaning missed.
for frame_no, cleaned_frame in enumerate((df_cars_train, df_cars_test)):
    if frame_no > 0:
        # visual separator between the two dumps
        print("X" * 150)
    for column_name in cleaned_frame.columns:
        print(column_name, cleaned_frame[column_name].unique())

In [0]:
# Feature Engineering and Explanation
#
# Every statistic used for an encoding below (model frequency, median prices)
# is computed on the training frame only and then mapped onto BOTH frames, so
# that train and test share one encoding and the same set of feature columns.

# Reference year used to turn the registration year into an age.
REFERENCE_YEAR = 2025

for cars_df in (df_cars_train, df_cars_test):
    # age: models can interpret a linear numerical feature more easily than the raw year
    cars_df['age'] = REFERENCE_YEAR - cars_df['year']

    # miles per year: normalizes the total mileage by how old the car is.
    # An age of 0 would divide by zero, so it is treated as missing first and
    # the plain mileage is used as the fallback for those rows.
    cars_df['miles_per_year'] = (
        cars_df['mileage'] / cars_df['age'].replace({0: np.nan})
    ).fillna(cars_df['mileage'])

# model frequency: some models are more common, which means they can be cheaper
# (supply) or retain their value better (demand). freq shows their popularity.
model_freq = df_cars_train['model'].value_counts(normalize=True).to_dict()
df_cars_train['model_freq'] = df_cars_train['model'].map(model_freq)

# Use the TRAIN frequencies for the test frame as well; frequencies computed on
# the test set itself would give the same model a different value in each frame.
# A model name that never occurs in train gets frequency 0; a missing model
# name stays NaN, exactly as in the training frame.
test_model_freq = df_cars_test['model'].map(model_freq)
unseen_model = df_cars_test['model'].notna() & test_model_freq.isna()
df_cars_test['model_freq'] = test_model_freq.mask(unseen_model, 0.0)

# NOTE(review): the two median-price encodings below are derived from the target
# of the full training frame, i.e. before the train/validation split. Validation
# rows therefore contribute to their own encoding (target leakage), which makes
# validation scores of models using these columns optimistic.

# brand median price (medians from train only): shows brand positioning (e.g. BMW > KIA)
brand_median_price = df_cars_train.groupby('Brand')['price'].median()
df_cars_train['brand_med_price'] = df_cars_train['Brand'].map(brand_median_price)
# brands unseen in train map to NaN
df_cars_test['brand_med_price'] = df_cars_test['Brand'].map(brand_median_price)

# model median price (medians from train only): shows model positioning (e.g. 3er > 1er)
model_med_price = df_cars_train.groupby('model')['price'].median()
df_cars_train['model_med_price'] = df_cars_train['model'].map(model_med_price)
# models unseen in train map to NaN
df_cars_test['model_med_price'] = df_cars_test['model'].map(model_med_price)

In [0]:
# Separate the target (price) from the predictor columns.
y = df_cars_train['price']
X = df_cars_train.drop(columns=['price'])

# Shuffled 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    # stratify=y,  # if y, class proportions get preserved between train and test sets
)

In [0]:
# Column groups. mileage is kept out of numeric_features because it is skewed
# and gets a log transform instead.
# NOTE(review): "paintQuality" is listed here while the EDA notes mention a
# column "paintQuality%" — presumably renamed by clean_car_dataframe; confirm.
numeric_features = ["age", "tax", "mpg", "engineSize", "paintQuality", "previousOwners"]
log_features = ["mileage", "miles_per_year", "model_freq", "brand_med_price", "model_med_price"] # could try to test previousOwners, tax, age here
categorical_features = ["Brand", "model", "transmission", "fuelType"]

# Deliberately left out: year (age is better), hasDamage (unsure what the two values 0 and NaN mean)


# Skewed numeric columns: mean imputation -> log1p -> standardization
log_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("log", FunctionTransformer(np.log1p, validate=False)),
    ("scaler", StandardScaler()),
])

# Remaining numeric columns: mean imputation -> standardization
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: missing values become the category "Unknown", then
# one-hot encoding; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its pipeline. Only "mileage" is sent through the
# log pipeline here; the other names in log_features are not selected by any
# transformer and are therefore discarded by remainder="drop", as is every
# other column not listed.
preprocessor = ColumnTransformer(
    transformers=[
        ("mileage", log_transformer, ["mileage"]),  # log because mileage is skewed
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)


# Fit on the training split only, so no statistics from the validation split
# enter the imputers, scalers or encoder (avoids data leakage).
preprocessor.fit(X_train)

### Feature Selection

Task III (3 Points): Define and Implement a clear and unambiguous strategy for feature selection. Use the methods discussed in the course. Present and justify your final selection 

Model independent Filter Methods:
- Remove constant numerical variables with VarianceThreshold (manual)
- Check highly correlated numerical variables and keep one with Spearman (manual)
- Remove categorical variables that are independent of the target with Chi2

Model dependent Wrapper Methods:
- RFE LR / RFE SVR for linear Models: Ridge, Lasso, ElasticNet, SVM
- Feature Importance for tree Models: DecisionTrees, RandomForest, GradientBoosting => trees are insensitive to irrelevant features, but computing feature importances and removing weak features can still reduce dimensionality
- L1 Regularization for Neural Networks: MLP


In [0]:
X_train_proc = preprocessor.transform(X_train)

# Rebuild the output column names transformer by transformer. Sub-pipelines
# that cannot report output names (e.g. one containing a FunctionTransformer
# without feature_names_out) fall back to their input column names, which is
# correct for them because they keep a 1:1 column mapping.
feature_names_all = []
for name, trans, cols in preprocessor.transformers_:
    if name == 'remainder':
        continue
    try:
        # expands to one name per category for the one-hot encoded columns
        names_out = trans.get_feature_names_out(cols)
    except Exception:
        # covers a missing method as well as a pipeline step that cannot
        # provide names; unlike a bare `except:` this no longer swallows
        # KeyboardInterrupt / SystemExit
        names_out = cols
    feature_names_all.extend(names_out)

X_df = pd.DataFrame(X_train_proc, columns=feature_names_all)


# Variance Threshold: flag columns that are constant on the training split
vt = VarianceThreshold(threshold=0.0)
vt.fit(X_df)
vt_deselect = [f for f, keep in zip(feature_names_all, vt.get_support()) if not keep]
print("Features to deselect according to VarianceThreshold:", vt_deselect)


# Spearman correlation with the target (numeric + log only). Only names that
# are actually present in the transformed frame are tested; a feature is
# flagged when its absolute rank correlation with price is at most 0.05.
numeric_log = numeric_features + log_features
spearman_deselect = []
for f in numeric_log:
    if f in X_df.columns:
        corr, _ = spearmanr(X_df[f], y_train)
        if abs(corr) <= 0.05:
            spearman_deselect.append(f)
print("Features to deselect according to Spearman correlation:", spearman_deselect)


# Chi2 (categorical only, must be non-negative): applied to every column that
# is not a numeric/log feature, i.e. the one-hot columns.
# NOTE(review): sklearn's chi2 is meant for class labels; y_train is a
# continuous price here, so every distinct price is treated as its own class.
# Confirm this is the intended test.
cat_cols = [c for c in X_df.columns if c not in numeric_log]
X_cat = X_df[cat_cols].astype(float)
chi2_vals, _ = chi2(X_cat, y_train)
chi2_deselect = [f for f, val in zip(cat_cols, chi2_vals) if val <= 0]
print("Features to deselect according to Chi²:", chi2_deselect)


In [0]:
# Numeric and log features kept for the linear models: everything that the
# Spearman filter did not flag for removal.
linear_numeric_features = [
    feature
    for feature in (numeric_features + log_features)
    if feature not in spearman_deselect
]

# Note that all kept features, including those from log_features, go through
# numeric_transformer here (imputation + scaling, no log transform).
preprocessor_linear = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, linear_numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

# => preprocessor_linear is used for the linear model setup; the tree models keep the full preprocessor since they are indifferent to irrelevant features

### Models Setup and Baselining (with SKLEARN)

TASK IV (4 Points): Build a simple model and assess the performance
- Identify the type of problem and select the relevant algorithms
- Select one Model Assessment Strategy to use throughout your work. Which metrics are you using to evaluate your model and why?


In [0]:
# The task is a regression problem (predicting the continuous car price), so
# the following metrics are used for model evaluation:
#
#   MAE: Average absolute deviation between predicted and true car prices, easy to interpret, kaggle competition uses same metric
#   RMSE: Root mean squared error, in the same unit as the price; penalizes large errors more strongly and is therefore sensitive to outliers
#   R2: Proportion of variance explained by the model, 1 = perfect, 0 = same as predicting mean, < 0 = worse than mean baseline

def print_metrics(y_true, y_pred):
    """Print MAE, RMSE and R2 of a set of predictions on a single line.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # mean_squared_error returns the MSE; take the square root so the value
    # printed under the "RMSE" label really is the root mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R2: {r2:.4f}")
    return

In [0]:
### LINEAR MODEL
# ElasticNet on top of the preprocessing restricted to the selected features.
elastic_pipe = Pipeline(steps=[
    ("preprocess", preprocessor_linear),
    (
        "model",
        ElasticNet(
            random_state=42,
            selection="random",
            max_iter=20000,
            warm_start=False,  # set True only if iteratively tuning manually
        ),
    ),
])


### TREE MODELS
# HistGradientBoostingRegressor: modern and very fast gradient boosting that can
# handle missing values natively. Note that the shared preprocessor in this
# pipeline still imputes before the model sees the data.
# Early stopping holds out 10% of the fitting data and stops after 20
# iterations without improvement.
hgb_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    (
        "model",
        HistGradientBoostingRegressor(
            l2_regularization=0.5,  # regularize slightly to prevent overfit, try 0.1, 0.5, 1.0
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=20,
            random_state=42,
        ),
    ),
])

In [0]:
# Naive baselines: predict one constant (training mean / training median) for
# every validation row. Any real model has to beat these numbers.
mean_pred = y_train.mean()
median_pred = y_train.median()
n_val_rows = len(y_val)

print("baseline mean predictor: ")
print_metrics(y_val, np.full(n_val_rows, mean_pred))
# Recorded output: MAE: 6976.3626 | RMSE: 92839550.2849 | R2: -0.0000
# (the recorded "RMSE" figure is on the squared, i.e. MSE, scale; its root is about 9635)

print("-" * 150)

print("baseline median predictor: ")
print_metrics(y_val, np.full(n_val_rows, median_pred))
# Recorded output: MAE: 6751.1604 | RMSE: 97557866.6363 | R2: -0.0508
# (the recorded "RMSE" figure is on the squared, i.e. MSE, scale; its root is about 9877)

### Hyperparameter Tuning

In [0]:
# Search space: 5 regularization strengths x 5 L1/L2 mixing ratios. The
# "model__" prefix addresses the ElasticNet step inside elastic_pipe.
elastic_param_grid = dict(
    model__alpha=[0.001, 0.01, 0.05, 0.1, 0.5],
    model__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

# 5-fold cross-validated grid search that optimizes MAE (sklearn maximizes
# scores, hence the negated metric), using all available cores.
elastic_grid = GridSearchCV(
    estimator=elastic_pipe,
    param_grid=elastic_param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
elastic_grid.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out validation split.
elastic_best = elastic_grid.best_estimator_
elastic_val_pred = elastic_best.predict(X_val)

print("ElasticNet Results: ")
print_metrics(y_val, elastic_val_pred)
print("Best ElasticNet params:", elastic_grid.best_params_)

In [0]:
# Single-point grid holding the currently chosen configuration; the values
# listed as "also tried" were explored in earlier runs.
hgb_param_grid = {
    "model__learning_rate": [0.07], # also tried: 0.02, 0.04, 0.06, 0.1
    "model__max_leaf_nodes": [60], # also tried: 15, 25, 31, 50
    "model__min_samples_leaf": [8], # also tried: 5, 10, 15, 20
    "model__max_iter": [1000] # also tried: 500, 800
}

# Shuffled 5-fold CV with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

hgb_grid = GridSearchCV(
    estimator=hgb_pipe,
    param_grid=hgb_param_grid,
    cv=cv,
    scoring="neg_mean_absolute_error",  # optimize MAE
    n_jobs=-1,
    verbose=2
)

hgb_grid.fit(X_train, y_train)
hgb_best_1 = hgb_grid.best_estimator_

# Evaluate the refitted best pipeline on the held-out validation split and
# report the selected parameters, mirroring the ElasticNet cell.
hgb_val_pred = hgb_best_1.predict(X_val)
print_metrics(y_val, hgb_val_pred)
print("Best HGB params:", hgb_grid.best_params_)

# Recorded output. The parameters below do not match the single-point grid
# above; they appear among the "also tried" values, so this record presumably
# stems from an earlier, wider search.
# Best Parameters: {'model__learning_rate': 0.06, 'model__max_iter': 800, 'model__max_leaf_nodes': 50, 'model__min_samples_leaf': 5}
# MAE: 1304.7611 | RMSE: 4503446.5247 | R2: 0.9515
# (the recorded "RMSE" figure is on the squared, i.e. MSE, scale; its root is about 2122)

# Save model for later use
joblib.dump(hgb_best_1, "hgb_best_1.pkl")

### Model Evaluation

Extra Task (1 Point): Be in the Top 5 Groups on Kaggle

In [0]:
# load hgb_best_1 from joblib
# Restores the fitted pipeline that the tuning section wrote with joblib.dump,
# so prediction can run without repeating the grid search.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load "hgb_best_1.pkl" when it is a file you produced yourself.
hgb_best_1 = joblib.load("hgb_best_1.pkl")

In [0]:
# Pick best model and predict on test:
df_cars_test['price'] = hgb_best_1.predict(df_cars_test)

# Key the submission by carID. If carID is still a regular column, writing the
# price series with its default index would label the rows 0..n-1 instead of
# the car ids; if the frame is already indexed by carID, the existing index is
# used unchanged.
# NOTE(review): confirm the expected header against the competition's sample submission.
submission = df_cars_test['price']
if 'carID' in df_cars_test.columns:
    submission = df_cars_test.set_index('carID')['price']

submission.to_csv('submission.csv', index=True)


In [0]:
# Upload submission.csv to the cars4you competition with the kaggle CLI; the text after -m is the submission message (currently a placeholder).
!kaggle competitions submit -c cars4you -f submission.csv -m "Message"

In [0]:
# Kaggle Score Check: list the submissions made to the cars4you competition
!kaggle competitions submissions -c cars4you