In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
# NOTE(review): this is an absolute, machine-specific path. Point DATA_PATH at
# your local copy of the CSV (ideally relative to the project) before running.
DATA_PATH = 'C:/Users/Musti Tanvir/PycharmProjects/AdidasPrediction/dataset/Shoe prices.csv'
shoe_prices_data = pd.read_csv(DATA_PATH)

# Convert 'Price (USD)' to numeric by stripping '$' and ',' before the cast.
# The pattern is a raw string so that '\$' reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
shoe_prices_data['Price (USD)'] = (
    shoe_prices_data['Price (USD)']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

# One-hot encode categorical variables into a dense array.
CATEGORICAL_COLUMNS = ['Brand', 'Type', 'Gender', 'Material']
try:
    # Current scikit-learn spells the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the legacy `sparse` keyword.
    encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(shoe_prices_data[CATEGORICAL_COLUMNS])

# Reuse the source frame's index so the concat below aligns row-for-row
# even if the frame is ever filtered or re-indexed before this point.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(),
    index=shoe_prices_data.index,
)

# Combine encoded features with price (the target stays as the last column)
final_data = pd.concat([encoded_df, shoe_prices_data['Price (USD)']], axis=1)




In [22]:
# Separate the target column from the one-hot encoded predictors
target_column = 'Price (USD)'
y = final_data[target_column]
X = final_data.drop(columns=[target_column])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [23]:
# Initialize and train the model (fixed seed so the forest is reproducible)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Calculate RMSE as the square root of the MSE.
# The `squared=False` shortcut of mean_squared_error is deprecated and absent
# from newer scikit-learn releases; taking the root explicitly works on all
# versions and yields the same metric.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {rmse}')


RMSE: 19.675373859799166


In [24]:
# Pair each encoded column with the importance the fitted forest assigned to it
feature_names = X.columns
importances = model.feature_importances_
feature_importances = pd.Series(data=importances, index=feature_names)

# Show the features from most to least important
ranked_importances = feature_importances.sort_values(ascending=False)
print(ranked_importances)


Type_Running                     0.290456
Material_Primeknit               0.227638
Brand_Skechers                   0.054500
Type_Casual                      0.051050
Brand_Reebok                     0.049615
                                   ...   
Material_Flexweave/Knit          0.000000
Material_Flexweave/Cushioning    0.000000
Material_Flexweave               0.000000
Type_Racing                      0.000000
Material_Textile/Leather         0.000000
Length: 64, dtype: float64


In [25]:
import pandas as pd

# Convert 'Price (USD)' to numeric by stripping '$' and ',' before the cast.
# Re-running this on an already-numeric column leaves the values unchanged.
# The pattern is a raw string so that '\$' is not parsed as an (invalid)
# Python string escape.
shoe_prices_data['Price (USD)'] = (
    shoe_prices_data['Price (USD)']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)


def most_common(values):
    """Return the value with the highest count in ``values`` (a pandas Series)."""
    return values.value_counts().idxmax()


# Aggregate data by Type: mean price plus the most frequent brand and material.
# NOTE(review): the captured output lists 'CrossFit' and 'Crossfit' as separate
# groups, so the 'Type' labels are not case-consistent; consider normalising
# them where the data is loaded so these rows are grouped together.
type_aggregated = shoe_prices_data.groupby('Type').agg({
    'Price (USD)': 'mean',
    'Brand': most_common,
    'Material': most_common,
}).reset_index()

# Display the results
print(type_aggregated)


              Type  Price (USD)        Brand           Material
0       Basketball   112.692308         Puma            Leather
1           Casual    79.938230     Converse             Canvas
2   Cross-training   130.000000       Reebok     Mesh/Synthetic
3         CrossFit   130.000000       Reebok               Mesh
4         Crossfit   130.000000       Reebok               Mesh
5          Fashion    83.895349         Fila  Leather/Synthetic
6           Hiking    77.500000         Fila  Leather/Synthetic
7        Lifestyle   122.828947       Adidas          Primeknit
8           Racing   110.000000  New Balance               Mesh
9            Retro    90.000000       Reebok         Suede/Mesh
10         Running   129.081325        Asics               Mesh
11           Skate    62.200000         Vans             Canvas
12          Slides    31.666667       Adidas          Synthetic
13           Trail    99.166667        Asics               Mesh
14   Trail Running   113.333333  New Bal