<a href="https://www.kaggle.com/code/datascientistsohail/backpack-ml-season-5-ep-2?scriptVersionId=220625371" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Backpack Prediction Challenge
This notebook trains an `XGBRegressor` to predict backpack prices. Preprocessing imputes missing values with `SimpleImputer` and encodes categorical features with `OneHotEncoder`. The model is evaluated with 5-fold cross-validation.  

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

### Data 

In [2]:
# Single place to repoint the notebook when the competition data lives elsewhere.
DATA_DIR = '../input/playground-series-s5e2'

# Train and test are indexed by `id`; the sample submission keeps `id` as a regular column.
df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [3]:
# Preview the first rows of the training frame (features plus the `Price` target).
df.head()

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
# (rows, columns) of the raw train and test frames.
df.shape, df_test.shape

((300000, 10), (200000, 9))

In [5]:
# Total number of missing cells across all columns of each raw frame.
train_missing = df.isna().sum().sum()
test_missing = df_test.isna().sum().sum()

print('Null Values in train set: ', train_missing)
print('Null Values in test set: ', test_missing)

Null Values in train set:  57199
Null Values in test set:  38009


In [6]:
# Number of missing values in the target column.
df['Price'].isna().sum()

0

In [7]:
# Columns with fewer than 10 distinct non-null values are treated as categorical.
distinct_counts = df.nunique()
cat_cols = distinct_counts[distinct_counts < 10].index.tolist()

In [8]:
# Every column that was not classified as categorical is treated as numeric.
categorical_names = set(cat_cols)
num_cols = [name for name in df.columns if name not in categorical_names]
num_cols

['Compartments', 'Weight Capacity (kg)', 'Price']

In [9]:
# Exclude the target from the numeric feature list.
# Rebuilding the list (instead of calling list.remove) keeps this cell idempotent:
# running it a second time no longer raises ValueError once 'Price' is already gone.
num_cols = [name for name in num_cols if name != 'Price']
num_cols

['Compartments', 'Weight Capacity (kg)']

In [10]:
# Number of distinct levels for each categorical column, keyed by column name.
sets = [df[name].nunique() for name in cat_cols]
sets_categorical = dict(zip(cat_cols, sets))

print(sets_categorical)

{'Brand': 5, 'Material': 4, 'Size': 3, 'Laptop Compartment': 2, 'Waterproof': 2, 'Style': 3, 'Color': 6}


### Preprocessing 

### Handling Null Values

In [11]:
# Two imputers: most frequent value for categorical columns, mean for numeric columns.
# Only the numeric one is applied in this cell.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

# Fit on the training frame only, then reuse the fitted imputer on the test frame.
train_numeric_filled = num_imputer.fit_transform(df[num_cols])
test_numeric_filled = num_imputer.transform(df_test[num_cols])

# Rebuild DataFrames that carry the original `id` index of their source frame.
num_imputed_df = pd.DataFrame(train_numeric_filled, columns=num_cols, index=df.index)
num_imputed_df_test = pd.DataFrame(test_numeric_filled, columns=num_cols, index=df_test.index)

In [12]:
# Fill missing categorical values with the most frequent level learned from the training frame.
# The source frame's `id` index is carried over so these frames stay row-aligned with
# `df` / `df_test`; without it they would get a fresh 0..n-1 RangeIndex, which does not
# match the test ids.
cat_imputed_df = pd.DataFrame(
    cat_imputer.fit_transform(df[cat_cols]), columns=cat_cols, index=df.index
)
cat_imputed_df_test = pd.DataFrame(
    cat_imputer.transform(df_test[cat_cols]), columns=cat_cols, index=df_test.index
)

In [13]:
# Confirm the numeric imputation left no missing cells in either frame.
remaining_train = num_imputed_df.isna().sum().sum()
remaining_test = num_imputed_df_test.isna().sum().sum()

print('Null Values: ', remaining_train)
print('Null Values in test set: ', remaining_test)

Null Values:  0
Null Values in test set:  0


### Handling Categorical Values

In [14]:
# One-hot encode the imputed categorical columns as a dense array.
# Levels not seen during fit are ignored at transform time (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit on train, then apply the same fitted encoder to test.
train_one_hot = encoder.fit_transform(cat_imputed_df[cat_cols])
encoded_df = pd.DataFrame(
    train_one_hot, columns=encoder.get_feature_names_out(), index=df.index
)

test_one_hot = encoder.transform(cat_imputed_df_test[cat_cols])
encoded_df_test = pd.DataFrame(
    test_one_hot, columns=encoder.get_feature_names_out(), index=df_test.index
)

In [15]:
# Confirm the encoded frames contain no missing cells.
encoded_train_missing = encoded_df.isna().sum().sum()
encoded_test_missing = encoded_df_test.isna().sum().sum()

print('Null Values: ', encoded_train_missing)
print('Null Values in test set: ', encoded_test_missing)

Null Values:  0
Null Values in test set:  0


In [16]:
# Assemble the model inputs: imputed numeric columns first, then the one-hot columns.
train_blocks = [num_imputed_df, encoded_df]
test_blocks = [num_imputed_df_test, encoded_df_test]

X = pd.concat(train_blocks, axis='columns')
X_test = pd.concat(test_blocks, axis='columns')

# Target as a plain NumPy array so it can be indexed positionally with the fold indices.
y = df['Price'].to_numpy()

In [17]:
# Column dtypes, non-null counts and memory footprint of the training feature matrix.
X.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 299999
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Compartments            300000 non-null  float64
 1   Weight Capacity (kg)    300000 non-null  float64
 2   Brand_Adidas            300000 non-null  float64
 3   Brand_Jansport          300000 non-null  float64
 4   Brand_Nike              300000 non-null  float64
 5   Brand_Puma              300000 non-null  float64
 6   Brand_Under Armour      300000 non-null  float64
 7   Material_Canvas         300000 non-null  float64
 8   Material_Leather        300000 non-null  float64
 9   Material_Nylon          300000 non-null  float64
 10  Material_Polyester      300000 non-null  float64
 11  Size_Large              300000 non-null  float64
 12  Size_Medium             300000 non-null  float64
 13  Size_Small              300000 non-null  float64
 14  Laptop Compartment_No   3

In [18]:
# Shapes before and after preprocessing, for train and test.
df.shape, X.shape, df_test.shape, X_test.shape

((300000, 10), (300000, 27), (200000, 9), (200000, 27))

### Machine Learning Algo: XGBRegressor and Cross Validation 

In [19]:
# K-fold cross-validation of an XGBRegressor.
# Each fold trains a fresh model, scores it on the held-out part with RMSE, and adds
# its test-set prediction to a running average across folds.
n_splits = 5
y_preds = np.zeros(len(X_test))  # averaged test predictions, filled fold by fold
scores = []                      # validation RMSE of each fold

# Seed the shuffle so fold membership, and therefore the reported scores and the
# submission, are reproducible on Restart & Run All.
folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for fold, (trn_id, val_id) in enumerate(folds.split(X, y)):
    X_train, X_valid = X.iloc[trn_id], X.iloc[val_id]
    y_train, y_valid = y[trn_id], y[val_id]

    xgb_model = XGBRegressor(n_estimators=100,
                             max_depth=6, learning_rate=0.01,
                             subsample=0.8, colsample_bytree=0.8,
                             objective='reg:squarederror',
                             random_state=42)

    xgb_model.fit(X_train, y_train)
    preds = xgb_model.predict(X_valid)

    # RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
    # is deprecated and no longer accepted by newer scikit-learn releases.
    score = np.sqrt(mean_squared_error(y_valid, preds))

    print(f'Fold #: {fold}, Score: {score}')
    scores.append(score)

    # Each fold contributes an equal share to the final test prediction.
    y_preds += xgb_model.predict(X_test) / n_splits

print(f'Mean Score: {np.mean(scores)}')

Fold #: 0, Score: 39.13922175502479
Fold #: 1, Score: 39.05622089710218
Fold #: 2, Score: 38.97106395413983
Fold #: 3, Score: 38.97733060372766
Fold #: 4, Score: 38.95736171315927
Mean Score: 39.020239784630746


### Submission

In [20]:
# Preview the sample submission template (an `id` column and a `Price` column).
submission.head()

Unnamed: 0,id,Price
0,300000,81.411
1,300001,81.411
2,300002,81.411
3,300003,81.411
4,300004,81.411


In [21]:
# The number of predictions should equal the number of submission rows.
submission.shape, y_preds.shape

((200000, 2), (200000,))

In [22]:
# Attach predictions to submission rows by `id` rather than by position, so the result
# does not silently depend on the sample submission being ordered like the test set.
preds_by_id = pd.Series(y_preds, index=X_test.index)
submission['Price'] = submission['id'].map(preds_by_id)

# Fail loudly instead of writing a file with empty prices if any id has no prediction.
assert submission['Price'].notna().all(), 'some submission ids have no matching prediction'

submission.to_csv('submission.csv', index=False)