In [None]:
#Import libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np



# Features and target
# `features` is reused when labelling the fitted model's feature importances,
# so its order must stay aligned with the column order of X.
features = ['price_num','discount_pct','ratings','reviews_num','price_to_mrp_ratio','log_reviews']

# Keep only rows whose target was actually observed. Filling a missing
# popularity_score with 0 would invent labels, which distorts both the fitted
# model and the metrics computed on the held-out rows.
labelled_rows = df['popularity_score'].notna()

# Missing feature values are still imputed with 0.
# NOTE(review): 0 may be indistinguishable from a genuine value in some of
# these columns — confirm that this imputation is intended.
X = df.loc[labelled_rows, features].fillna(0)
y = df.loc[labelled_rows, 'popularity_score']

# Train-test split: 20% held out, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Number of rows per brand (value_counts sorts descending), top 15 only
top_brands_count = df['brand_name'].value_counts().head(15)

# Draw on an explicit Axes instead of relying on pyplot's current-figure state
fig, ax = plt.subplots(figsize=(12, 5))
top_brands_count.plot.bar(ax=ax, color='skyblue')
ax.set_title("Top 15 Brands by Number of Products")
ax.set_xlabel("Brand")
ax.set_ylabel("Number of Products")
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Per-brand summary table: one row per brand_name.
brand_summary = df.groupby('brand_name').agg(
    # 'count' tallies non-null pants_description values, not raw rows
    n_products=('pants_description','count'),
    avg_rating=('ratings','mean'),
    avg_discount=('discount_pct','mean'),
    avg_price=('price_num','mean'),
    avg_popularity=('popularity_score','mean'),
    # NOTE: despite the column name this is the standard deviation of ratings
    rating_variance=('ratings','std')
).reset_index()

# 15 brands with the highest mean popularity score
top_brands_popularity = brand_summary.sort_values('avg_popularity', ascending=False).head(15)

# Preview of the 10 largest brands. This has to be the last expression in the
# cell: a bare expression followed by further statements is evaluated and then
# silently discarded, so the table would never be rendered.
brand_summary.sort_values('n_products', ascending=False).head(10)

In [None]:
plt.figure(figsize=(10,5))
# seaborn 0.13 deprecates passing `palette` without `hue` (slated for removal
# in 0.14), so the x variable is also mapped to hue. dodge=False keeps a single
# full-width bar per brand, giving the same picture as the hue-less call.
ax = sns.barplot(
    data=top_brands_popularity,
    x='brand_name',
    y='avg_popularity',
    hue='brand_name',
    dodge=False,
    palette="viridis",
)
# A hue mapping may produce a legend that merely repeats the x tick labels;
# drop it when present so the chart looks the same across seaborn versions.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Top 15 Brands by Average Popularity")
plt.xticks(rotation=90)
plt.ylabel("Average Popularity Score")
plt.show()


In [None]:
# One point per brand: mean discount against mean rating, with marker size
# scaled by the brand's product count (size legend suppressed).
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=brand_summary,
    x='avg_discount',
    y='avg_rating',
    size='n_products',
    alpha=0.6,
    legend=False,
    ax=ax,
)
ax.set_title("Brand-Level Discount vs Rating")
ax.set_xlabel("Average Discount %")
ax.set_ylabel("Average Rating")
plt.show()

In [None]:
# The ten brands with the smallest spread in ratings. The 'rating_variance'
# column is produced with the 'std' aggregation in brand_summary, and
# sort_values places NaN spreads last by default.
# NOTE(review): brands with very few products can reach a near-zero spread
# trivially — consider whether a minimum n_products filter is wanted.
consistent_brands = brand_summary.sort_values(by='rating_variance', ascending=True).iloc[:10]
consistent_brands.loc[:, ['brand_name', 'avg_rating', 'rating_variance', 'n_products']]

In [None]:
# Fit a 200-tree random forest on the training split; the fixed random_state
# keeps the fitted model reproducible. fit() returns the estimator itself.
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predictions on the held-out split
y_pred = rf.predict(X_test)

# Metrics: RMSE is the square root of the mean squared error
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("RMSE:", np.sqrt(test_mse))
print("R² Score:", test_r2)

In [None]:
# Pair each importance with its feature name and sort ascending; barh draws
# the first row at the bottom, so the most important feature ends up on top.
importances = pd.Series(data=rf.feature_importances_, index=features).sort_values()

fig, ax = plt.subplots(figsize=(7, 5))
importances.plot.barh(ax=ax, color='teal')
ax.set_title("Feature Importance for Popularity Score")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()
