In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


**Import All Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

**Import train and test datasets**

In [3]:
# Read the competition's training and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e2/train.csv",
        "/kaggle/input/playground-series-s5e2/test.csv",
    )
)


Check for missing values

In [4]:
# Per-column count of missing values, training frame first and test frame second.
for frame in (train, test):
    missing_per_column = frame.isnull().sum()
    print(missing_per_column)

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64
id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64


In [5]:
# Summary statistics of the training frame: the default (numeric-only) view,
# then include='all', which also reports count/unique/top/freq for the
# non-numeric columns.
for include in (None, 'all'):
    print(train.describe(include=include))


                  id   Compartments  Weight Capacity (kg)          Price
count  300000.000000  300000.000000         299862.000000  300000.000000
mean   149999.500000       5.443590             18.029994      81.411107
std     86602.684716       2.890766              6.966914      39.039340
min         0.000000       1.000000              5.000000      15.000000
25%     74999.750000       3.000000             12.097867      47.384620
50%    149999.500000       5.000000             18.068614      80.956120
75%    224999.250000       8.000000             24.002375     115.018160
max    299999.000000      10.000000             30.000000     150.000000
                   id   Brand   Material    Size   Compartments  \
count   300000.000000  290295     291653  293405  300000.000000   
unique            NaN       5          4       3            NaN   
top               NaN  Adidas  Polyester  Medium            NaN   
freq              NaN   60077      79630  101906            NaN   
mean    

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Handling missing values and converting categorical columns into numerical columns so the model can use them

In [6]:
# Impute the two Yes/No columns with the most frequent value of that column,
# computed separately for each frame (train uses train's mode, test uses test's).
binary_cols = ['Laptop Compartment', 'Waterproof']
for frame in (train, test):
    for col in binary_cols:
        most_frequent = frame[col].mode()[0]
        frame[col] = frame[col].fillna(most_frequent)


In [7]:
def clean_and_transform(df, weight_capacity_fill=None):
    """Fill missing values and encode the Yes/No columns of ``df``.

    The frame is modified in place and the same object is returned, so the
    usual ``df = clean_and_transform(df)`` call keeps working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'Brand', 'Material', 'Size', 'Style', 'Color',
        'Weight Capacity (kg)', 'Laptop Compartment' and 'Waterproof'.
    weight_capacity_fill : float, optional
        Value used for missing 'Weight Capacity (kg)' entries. When omitted,
        the median of ``df``'s own column is used. Pass the training median
        when cleaning the test frame to impute both with the same statistic.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with gaps filled and 'Laptop Compartment' /
        'Waterproof' encoded as 1 ('Yes') and 0 ('No').
    """
    if weight_capacity_fill is None:
        weight_capacity_fill = df['Weight Capacity (kg)'].median()

    # Missing categorical values become their own 'Unknown' category;
    # the numeric column gets the chosen fill value.
    df.fillna({
        'Brand': 'Unknown',
        'Material': 'Unknown',
        'Size': 'Unknown',
        'Style': 'Unknown',
        'Color': 'Unknown',
        'Weight Capacity (kg)': weight_capacity_fill,
    }, inplace=True)

    # Encode binary categorical columns. Values that are not 'Yes'/'No' are
    # passed through unchanged, so calling this function again on an already
    # encoded frame (e.g. re-running the notebook cell) keeps the 0/1 codes
    # instead of silently replacing them with NaN.
    binary_codes = {'Yes': 1, 'No': 0}
    for col in ('Laptop Compartment', 'Waterproof'):
        df[col] = df[col].map(lambda value: binary_codes.get(value, value))

    return df

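Applying the cleaning function to both datasets (the binary features were already filled with their mode above) and label-encoding the remaining categorical features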

In [8]:
# Clean train and test datasets (fills remaining gaps, encodes the Yes/No columns)
train = clean_and_transform(train)
test = clean_and_transform(test)

categorical_cols = ['Brand', 'Material', 'Size', 'Style', 'Color']
# Column name -> LabelEncoder fitted on the training column, kept so the
# integer codes can be mapped back to labels later via inverse_transform.
le_dict = {}

for col in categorical_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    le_dict[col] = le

    # Handle unseen categories in test data: anything the encoder did not see
    # during fitting is folded into 'Unknown' (vectorised membership check).
    # NOTE(review): this relies on 'Unknown' being among the training labels;
    # otherwise le.transform raises ValueError for the replaced values.
    seen_in_train = test[col].isin(le.classes_)
    test[col] = test[col].where(seen_in_train, 'Unknown')
    test[col] = le.transform(test[col])

# Check results. DataFrame.info() prints its report itself and returns None,
# so it is called directly rather than wrapped in print().
train.info()
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 300000 non-null  int64  
 2   Material              300000 non-null  int64  
 3   Size                  300000 non-null  int64  
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    300000 non-null  int64  
 6   Waterproof            300000 non-null  int64  
 7   Style                 300000 non-null  int64  
 8   Color                 300000 non-null  int64  
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(8)
memory usage: 25.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dt

Prepare Training and Testing data

In [9]:
# Summary statistics after cleaning/encoding, training frame first, test frame second.
for frame in (train, test):
    summary = frame.describe()
    print(summary)

                  id          Brand       Material           Size  \
count  300000.000000  300000.000000  300000.000000  300000.000000   
mean   149999.500000       2.098943       1.623000       1.024677   
std     86602.684716       1.502860       1.177303       0.851855   
min         0.000000       0.000000       0.000000       0.000000   
25%     74999.750000       1.000000       1.000000       0.000000   
50%    149999.500000       2.000000       2.000000       1.000000   
75%    224999.250000       3.000000       3.000000       2.000000   
max    299999.000000       5.000000       4.000000       3.000000   

        Compartments  Laptop Compartment     Waterproof          Style  \
count  300000.000000       300000.000000  300000.000000  300000.000000   
mean        5.443590            0.519287       0.517090       1.062723   
std         2.890766            0.499629       0.499709       0.861587   
min         1.000000            0.000000       0.000000       0.000000   
25%     

In [10]:
# Target variable: the 'Price' column of the training frame.
y_train = train['Price']

# Training features: every column except the row identifier and the target.
X_train = train.drop(['id', 'Price'], axis=1)

# Test features: the test frame carries no 'Price', so only 'id' is removed.
X_test = test.drop('id', axis=1)


In [11]:
# Random forest baseline with 100 trees. random_state pins the seed so the
# fitted forest is reproducible; n_jobs=-1 builds the trees on all available
# CPU cores instead of one, which only affects run time.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
predictions = model.predict(X_test)

In [12]:
# Build the submission table: one row per test id with its predicted price.
submission = pd.DataFrame({'id': test['id']})
submission['Price'] = predictions

# Write it to the working directory without the DataFrame index column.
submission.to_csv('submission.csv', index=False)