In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSVs; low_memory=False makes pandas parse each
# file in a single pass instead of inferring dtypes chunk by chunk.
train_df, test_df = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("Train.csv", "Test.csv")
)

# Report the dimensions of both frames, then preview the first training rows
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)
train_df.head()


Train shape: (401125, 53)
Test shape: (12457, 52)


Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,11/16/2006 0:00,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,3/26/2004 0:00,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2/26/2004 0:00,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,5/19/2011 0:00,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,7/23/2009 0:00,...,,,,,,,,,,


In [2]:
# Parse the 'saledate' column of both frames into datetime values
for frame in (train_df, test_df):
    frame['saledate'] = pd.to_datetime(frame['saledate'])

# Extract date features
def add_date_features(df):
    """Add calendar features derived from the 'saledate' column.

    The columns 'saleYear', 'saleMonth', 'saleDay', 'saleDayOfWeek' and
    'saleDayOfYear' are written onto ``df`` itself, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed 'saledate' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the new columns.
    """
    stamps = df['saledate'].dt
    derived = {
        'saleYear': stamps.year,
        'saleMonth': stamps.month,
        'saleDay': stamps.day,
        'saleDayOfWeek': stamps.dayofweek,
        'saleDayOfYear': stamps.dayofyear,
    }
    for feature_name, values in derived.items():
        df[feature_name] = values
    return df

# Derive the calendar features on both frames
train_df, test_df = (add_date_features(frame) for frame in (train_df, test_df))

# Remove the columns that are not passed on to the model
# (the date parts extracted from 'saledate' are kept)
cols_to_drop = ['MachineID', 'ModelID', 'saledate']
for frame in (train_df, test_df):
    frame.drop(columns=cols_to_drop, inplace=True)


In [4]:
# Impute missing values and label-encode the text columns.
#
# Which columns are categorical, and which median fills a numeric column, is
# decided once from the training frame and then applied to both frames. This
# way a column cannot be treated as text in one frame and as a number in the
# other (e.g. a text column that is entirely empty in Test.csv is parsed as
# float there).


def _is_text(series):
    """Return True for object- or string-typed columns.

    Checking the dtype through pandas.api.types (instead of comparing it with
    'object') also recognises pandas' dedicated string dtype.
    """
    return (pd.api.types.is_object_dtype(series.dtype)
            or pd.api.types.is_string_dtype(series.dtype))


def _fill_unknown(series):
    """Return `series` with missing entries replaced by the label "Unknown"."""
    if not _is_text(series):
        # Column is categorical in training but was parsed as numeric here;
        # switch to object so it can hold the string label.
        series = series.astype(object)
    return series.fillna("Unknown")


# Categorical columns as seen in the training data
categorical_cols = train_df.columns[[_is_text(train_df[col]) for col in train_df.columns]]
categorical_set = set(categorical_cols)

# Medians of the non-categorical training columns, taken before any filling
train_medians = {
    col: train_df[col].median()
    for col in train_df.columns
    if col not in categorical_set
}

# Fill missing values in the training frame
for col in train_df.columns:
    if col in categorical_set:
        train_df[col] = train_df[col].fillna("Unknown")
    else:
        train_df[col] = train_df[col].fillna(train_medians[col])

# Fill missing values in the test frame, following the training decisions
for col in test_df.columns:
    if col in categorical_set or _is_text(test_df[col]):
        test_df[col] = _fill_unknown(test_df[col])
    elif col in train_medians:
        # Use training set's median for consistency
        test_df[col] = test_df[col].fillna(train_medians[col])
    else:
        # Column unknown to the training frame: fall back to its own median
        test_df[col] = test_df[col].fillna(test_df[col].median())

# Label encode categorical features (same mapping for both train and test).
# The encoder is fitted on the values of both frames so that transform()
# never meets a label it has not seen.
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]]).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))


In [5]:
# Target column and feature matrix taken from the prepared training frame
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the gradient-boosted regressor on the training portion
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 8,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the hold-out rows; predictions are floored at zero so none are negative
val_preds = np.maximum(model.predict(X_val), 0)

# Root mean squared logarithmic error on the validation split
rmsle = np.sqrt(mean_squared_log_error(y_true=y_val, y_pred=val_preds))
print("Validation RMSLE:", rmsle)


Validation RMSLE: 0.2414168509837724


In [7]:
# Score the test set and write the submission file.

# Select the training feature columns, in training order. A column that is
# missing from test_df raises KeyError here instead of being silently skipped.
X_train_cols = X.columns  # This was used in training
X_test = test_df[X_train_cols].copy()

# Make predictions
test_preds = model.predict(X_test)
test_preds = np.maximum(0, test_preds)  # ensure no negative prices

# SalesID is one of the feature columns, so it is read from X_test directly:
# the IDs come from exactly the rows that were scored, in the same order, and
# Test.csv does not have to be read from disk a second time.
submission = pd.DataFrame({
    "SalesID": X_test["SalesID"].to_numpy(),
    "SalePrice": test_preds
})

# Save the prediction file
submission.to_csv("test_predictions.csv", index=False)
print("✅ Submission file created: test_predictions.csv")


✅ Submission file created: test_predictions.csv
