In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split

In [5]:
# Read both training CSVs and stack them row-wise. ignore_index=True discards
# the per-file row labels so the combined frame gets one continuous index.
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("training_extra.csv")
df = pd.concat((df1, df2), ignore_index=True)
print(df.head(10))

   id         Brand   Material    Size  Compartments Laptop Compartment  \
0   0      Jansport    Leather  Medium           7.0                Yes   
1   1      Jansport     Canvas   Small          10.0                Yes   
2   2  Under Armour    Leather   Small           2.0                Yes   
3   3          Nike      Nylon   Small           8.0                Yes   
4   4        Adidas     Canvas  Medium           1.0                Yes   
5   5          Nike     Canvas  Medium          10.0                 No   
6   6          Nike        NaN   Large           3.0                 No   
7   7          Puma     Canvas   Small           1.0                Yes   
8   8  Under Armour  Polyester  Medium           8.0                Yes   
9   9  Under Armour      Nylon  Medium           2.0                Yes   

  Waterproof      Style  Color  Weight Capacity (kg)      Price  
0         No       Tote  Black             11.611723  112.15875  
1        Yes  Messenger  Green            

In [6]:
# Print the combined frame's index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB


In [7]:
# Count missing values per column before any imputation.
df.isnull().sum()

Unnamed: 0,0
id,0
Brand,126758
Material,110962
Size,87785
Compartments,0
Laptop Compartment,98533
Waterproof,94324
Style,104180
Color,133617
Weight Capacity (kg),1808


In [8]:
# (rows, columns) of the combined training frame.
df.shape

(3994318, 11)

In [9]:
# Impute missing values in place; no rows are dropped.
# The columns below are filled with their most frequent value. "Compartments"
# is numeric but is handled the same way as the string columns.
mode_filled_cols = ["Brand", "Material", "Size", "Compartments", "Laptop Compartment",
                    "Waterproof", "Style", "Color"]
for column in mode_filled_cols:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

# The continuous weight column is filled with its mean instead.
weight_col = "Weight Capacity (kg)"
df[weight_col] = df[weight_col].fillna(df[weight_col].mean())

In [10]:
# Re-count missing values per column after the imputation step.
df.isnull().sum()


Unnamed: 0,0
id,0
Brand,0
Material,0
Size,0
Compartments,0
Laptop Compartment,0
Waterproof,0
Style,0
Color,0
Weight Capacity (kg),0


In [11]:
df.head(10)  # show the first ten rows after imputation to check the filled values

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,5,Nike,Canvas,Medium,10.0,No,Yes,Messenger,Black,7.241812,20.01553
6,6,Nike,Polyester,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [19]:
# Feature matrix: every column except the row id and the target.
# .copy() makes X an independent frame, so the in-place label encoding that is
# applied to X afterwards does not trigger pandas' SettingWithCopyWarning.
X = df[['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
        'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)']].copy()

# Regression target.
y = df['Price']

In [20]:
# Columns that get label-encoded before modelling. "Compartments" holds
# numeric values in the raw data but is listed here too, so it is encoded
# together with the string columns.
categorical_cols = [
    "Brand",
    "Material",
    "Size",
    "Compartments",
    "Laptop Compartment",
    "Waterproof",
    "Style",
    "Color",
]

In [34]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column of X.
# One fitted encoder is kept per column in `label_encoders`, so each column's
# category -> code mapping (its classes_) stays available for
# inverse_transform or for encoding other data the same way. Refitting a
# single shared encoder would keep only the last column's mapping.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    X.loc[:, col] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder
# `encoder` stays bound to the encoder of the last column in categorical_cols.

X.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,1,1,1,6.0,1,0,2,0,11.611723
1,1,0,2,9.0,1,1,1,3,27.078537
2,4,1,2,1.0,1,0,1,5,16.64376
3,2,2,2,7.0,1,0,1,3,12.93722
4,0,0,1,0.0,1,1,1,3,17.749338


In [159]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [160]:
from collections import Counter

from sklearn.feature_selection import mutual_info_regression

# Score each feature's mutual information with the target on several small
# random subsamples of the training split (subsampling keeps the computation
# fast), then keep the features that clear the threshold in enough runs.
num_runs = 10                           # number of subsamples scored
sample_size = min(5000, len(X_train))   # never request more rows than exist
mi_threshold = 0.01                     # minimum MI for a feature to count in a run
min_runs = 4                            # runs in which a feature must clear the threshold
base_seed = 42

# Label-encoded columns carry arbitrary integer codes, so they are flagged as
# discrete; with the default setting a dense X is scored as all-continuous.
discrete_mask = np.array([name in categorical_cols for name in X_train.columns], dtype=bool)

feature_sets = []
for run in range(num_runs):
    # A distinct seed per run draws a different subsample each time (one fixed
    # seed would redraw the same rows in every run) while the overall
    # procedure stays reproducible.
    run_seed = base_seed + run
    X_sample = X_train.sample(n=sample_size, random_state=run_seed)
    y_sample = y_train.loc[X_sample.index]

    importance = mutual_info_regression(
        X_sample, y_sample, discrete_features=discrete_mask, random_state=run_seed
    )
    feature_sets.append(set(X_sample.columns[importance > mi_threshold]))

# Count in how many runs each feature cleared the threshold.
feature_counts = Counter(feature for features in feature_sets for feature in features)

# Keep features that cleared the threshold in at least `min_runs` runs.
final_features = {feature for feature, count in feature_counts.items() if count >= min_runs}

# Follow X_train's column order so the result does not depend on set
# iteration order.
selected_features = [name for name in X_train.columns if name in final_features]

print(selected_features)


['Brand']


In [140]:
# Rebind X_train / y_train to ALL labelled rows (replacing the earlier split),
# restricted to the selected feature columns.
X_train, y_train = X[selected_features], y


In [124]:
# Build a decision-tree regressor with a fixed random_state and fit it on the
# current X_train / y_train.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X=X_train, y=y_train)


In [125]:
# Load the test set (same columns as the training data, minus Price) and preview it.
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [126]:
# Print the test frame's column dtypes and non-null counts.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  195619 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB


In [127]:
# Apply the same imputation to the test set as to the training data.
# Fill values are taken from the training frame `df`, not from the test set,
# so both sets are preprocessed with identical values. mode() and mean() skip
# NaN, so they give the training statistics whether or not df was imputed.
for col in ["Brand", "Material", "Size", "Compartments", "Laptop Compartment",
            "Waterproof", "Style", "Color"]:
    df_test[col] = df_test[col].fillna(df[col].mode()[0])

# Fill the numerical weight column with the training mean, matching the
# training-side imputation.
df_test["Weight Capacity (kg)"] = df_test["Weight Capacity (kg)"].fillna(df["Weight Capacity (kg)"].mean())

In [128]:
# Preview the test frame after imputation.
df_test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,Adidas,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [129]:

# Encode the test columns with the category -> code mapping learned from the
# training frame `df`. Fitting a new mapping on the test data would produce
# codes that differ from the ones the model was trained on whenever the two
# sets do not contain exactly the same categories.
# transform() raises ValueError for a test value that never occurs in training,
# which also stops an accidental second run from re-encoding the codes.
for col in categorical_cols:
    train_encoder = LabelEncoder().fit(df[col])
    df_test.loc[:, col] = train_encoder.transform(df_test[col])

In [130]:
# Preview the test frame after label encoding.
df_test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,3,1,2,1.0,0,0,2,3,20.671147
1,300001,2,0,1,6.0,0,1,0,3,13.564105
2,300002,0,0,0,8.0,0,1,1,1,11.809799
3,300003,0,2,0,0.0,1,0,1,3,18.477036
4,300004,0,2,0,1.0,1,1,2,0,9.907953


In [131]:
# Keep only the selected feature columns of the test frame.
# This rebinds X_test, replacing the hold-out part of the earlier split.
X_test = df_test[selected_features]

In [132]:
# Predict a price for every test row with the fitted tree.
y_pred = regressor.predict(X_test)


In [133]:
# Pair each test id with its predicted price and preview the first ten rows.
submission_df = df_test[['id']].assign(Price=y_pred)
submission_df.head(10)


Unnamed: 0,id,Price
0,300000,81.27346
1,300001,81.211183
2,300002,80.538414
3,300003,80.517768
4,300004,80.517768
5,300005,81.940799
6,300006,81.211183
7,300007,81.846215
8,300008,81.353299
9,300009,81.27346


In [134]:
# Write the id/Price pairs to submission.csv without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)