<a href="https://www.kaggle.com/code/karishmabattina/backpack-xgboost?scriptVersionId=224703730" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Load train and test data
train_data = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')

In [3]:
#check train data shape. 11 columns, 300000 rows
train_data.shape

(300000, 11)

In [4]:
#check test data shape. 10 columns(no price column), 200000 rows
test_data.shape

(200000, 10)

In [5]:
train_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [6]:
test_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [7]:
#check for null values in train data
train_data.isnull().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [8]:
#check for null values in test data
test_data.isnull().sum()

id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

In [9]:
#handle missing values in train data. Replace missing values with mode
train_data['Brand']=train_data['Brand'].fillna(train_data['Brand'].mode()[0])
train_data['Material']=train_data['Material'].fillna(train_data['Material'].mode()[0])
train_data['Size']=train_data['Size'].fillna(train_data['Size'].mode()[0])
train_data['Laptop Compartment']=train_data['Laptop Compartment'].fillna(train_data['Laptop Compartment'].mode()[0])
train_data['Waterproof']=train_data['Waterproof'].fillna(train_data['Waterproof'].mode()[0])
train_data['Style']=train_data['Style'].fillna(train_data['Style'].mode()[0])
train_data['Color']=train_data['Color'].fillna(train_data['Color'].mode()[0])
train_data['Weight Capacity (kg)']=train_data['Weight Capacity (kg)'].fillna(train_data['Weight Capacity (kg)'].mode()[0])

In [10]:
#handle missing values in test data. Replace missing values with mode
test_data['Brand']=test_data['Brand'].fillna(test_data['Brand'].mode()[0])
test_data['Material']=test_data['Material'].fillna(test_data['Material'].mode()[0])
test_data['Size']=test_data['Size'].fillna(test_data['Size'].mode()[0])
test_data['Laptop Compartment']=test_data['Laptop Compartment'].fillna(test_data['Laptop Compartment'].mode()[0])
test_data['Waterproof']=test_data['Waterproof'].fillna(test_data['Waterproof'].mode()[0])
test_data['Style']=test_data['Style'].fillna(test_data['Style'].mode()[0])
test_data['Color']=test_data['Color'].fillna(test_data['Color'].mode()[0])
test_data['Weight Capacity (kg)']=test_data['Weight Capacity (kg)'].fillna(test_data['Weight Capacity (kg)'].mode()[0])

In [11]:
# drop 'id' column in test data
test_data = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv").drop("id",axis=1)

from sklearn.model_selection import train_test_split

# Select subset of predictors to use for prediction
cols_to_use = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)']

# Convert the columns to type category
train_data[cols_to_use] = train_data[cols_to_use].astype("category")
test_data[cols_to_use] = test_data[cols_to_use].astype("category")

X = train_data[cols_to_use]

# Select target
y = train_data.Price

# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Using xgboost for prediction
from xgboost import XGBRegressor

my_model = XGBRegressor(enable_categorical = True,
                        device="cuda",
                        n_estimators=3000,
                        max_depth=9,
                        learning_rate=0.013,
                        n_jobs=4,
                        random_state=42,
                        subsample=0.9,
                        colsample_bytree=0.5,
                        min_child_weight=58,
                        reg_alpha=0.1,
                        reg_lambda=0.5
                        )

my_model.fit(X_train, y_train, 
             early_stopping_rounds=50, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)




In [13]:
from sklearn.metrics import mean_absolute_error

#calculate mean absolute error
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

Mean Absolute Error: 33.663353248225384


In [14]:
sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e2/sample_submission.csv")

In [15]:
sample_submission["Price"] =  my_model.predict(test_data)
sample_submission.to_csv("submission.csv",index=False)
sample_submission

Unnamed: 0,id,Price
0,300000,81.985565
1,300001,83.085953
2,300002,81.882866
3,300003,81.498390
4,300004,81.103722
...,...,...
199995,499995,80.357185
199996,499996,81.104454
199997,499997,82.063637
199998,499998,82.066910
