### Image display

In [None]:
# Show the local file 'download.png' through IPython's rich display; as the
# last expression of the cell, the Image object is what the cell outputs.
from IPython.display import Image
Image('download.png')

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load dataset

In [None]:
# Read the BigBasket products CSV into `df`, then echo the frame as the
# cell output. NOTE(review): the path is absolute under /content — presumably
# a Colab upload location; confirm before running elsewhere.
csv_path = '/content/BigBasket Products.csv'
df = pd.read_csv(csv_path)
df

### Top 12 products

In [None]:
# The 12 product names that occur most often in the 'product' column
# (value_counts sorts by descending frequency).
df['product'].value_counts().iloc[:12]

### Data description

In [None]:
# Descriptive statistics of df as produced by DataFrame.describe().
df.describe()

### Data info

In [None]:
# Print the DataFrame.info() summary of df.
df.info()

### Top 5 most frequently listed products

In [None]:
# The five product names with the highest occurrence counts.
df['product'].value_counts().iloc[:5]

### 5 least frequently listed products

In [None]:
# The last five entries of the frequency-sorted counts, i.e. the product
# names with the lowest occurrence counts.
df['product'].value_counts().iloc[-5:]

### Calculate discount

In [None]:
# Discount as a percentage of the market price:
#   (market_price - sale_price) / market_price * 100, rounded to 2 decimals.
df['Discount_percentage'] = (
    df['market_price']
    .sub(df['sale_price'])
    .div(df['market_price'])
    .mul(100)
    .round(2)
)
df

### Missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Missing product values

In [None]:
# Rows whose 'product' value is missing.
df.loc[df['product'].isna()]

### Missing brand values

In [None]:
# Rows whose 'brand' value is missing.
df.loc[df['brand'].isna()]

### Missing sale_price values

In [None]:
# Rows whose 'sale_price' value is missing.
df.loc[df['sale_price'].isna()]

### Missing rating values

In [None]:
# Rows whose 'rating' value is missing.
df.loc[df['rating'].isna()]

### Missing description values

In [None]:
# Rows whose 'description' value is missing.
df.loc[df['description'].isna()]

### Missing Discount_percentage values

In [None]:
# Rows whose 'Discount_percentage' value is missing.
df.loc[df['Discount_percentage'].isna()]

### Clean missing values

In [None]:
# Columns filled with their most frequent value (first mode).
for column in ('product', 'brand', 'description'):
    df[column] = df[column].fillna(df[column].mode()[0])

# Columns filled with their median.
for column in ('sale_price', 'rating', 'Discount_percentage'):
    df[column] = df[column].fillna(df[column].median())

# Re-check the per-column missing counts after filling.
df.isnull().sum()

### Select numeric columns

In [None]:
# Keep only the columns of df whose dtype matches 'int' or 'float', then
# echo the resulting frame as the cell output.
numeric = df.select_dtypes(include=['int', 'float'])
numeric

### Function to count outliers

In [None]:
def select_outliers(data):
    """Count the values of ``data`` that lie outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data``. Values strictly below the
    lower fence or strictly above the upper fence are counted.

    Args:
        data: Object supporting ``quantile``, comparison operators and boolean
            indexing; the notebook passes single DataFrame columns.

    Returns:
        The ``count()`` of the selected out-of-fence values.
    """
    first_quartile = data.quantile(.25)
    third_quartile = data.quantile(.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    too_low = data < low_fence
    too_high = data > high_fence
    return data[too_low | too_high].count()

### Count outliers

In [None]:
# Columns checked for IQR outliers.
numeric_colm = [
    'sale_price',
    'market_price',
    'rating',
    'Discount_percentage',
]
# Map each column name to the number of its values outside the IQR fences.
outliers_count = {
    name: select_outliers(df[name])
    for name in numeric_colm
}
outliers_count

### Function to replace outliers

In [None]:
def replace_outliers(df, colm):
    """Overwrite the IQR outliers of one column with that column's mean.

    Values strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR`` are replaced. The mean is taken over the column as it
    is before replacement, so the outliers themselves contribute to it.

    Args:
        df: DataFrame to modify; it is changed in place.
        colm: Label of the column to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    values = df[colm]
    first_quartile = values.quantile(.25)
    third_quartile = values.quantile(.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    is_outlier = (values < low_fence) | (values > high_fence)
    replacement = values.mean()
    df.loc[is_outlier, colm] = replacement
    return df

### Apply outlier replacement

In [None]:
# Replace the IQR outliers of each numeric column with that column's mean,
# one column at a time, in the same order as before.
for column in ('sale_price', 'market_price', 'rating', 'Discount_percentage'):
    df = replace_outliers(df, column)

### Boxplots after cleaning

In [None]:
# One horizontal boxplot per numeric column, each shown as its own figure.
for feature in ('sale_price', 'market_price', 'rating', 'Discount_percentage'):
    sns.boxplot(data=df, x=feature)
    plt.show()

### Category counts

In [None]:
# Number of rows per value of the 'category' column, most frequent first.
df['category'].value_counts()

### Bar plot per category

In [None]:
# Bar chart of how many rows fall into each category.
data = df['category'].value_counts()
sns.barplot(x=data.index, y=data.values, color='skyblue')

# Label the axes of the chart that was just drawn.
plt.gca().set(
    xlabel='Category',
    ylabel='No. of Products',
    title='No. of Products per Category',
)
# Rotate the category names so they do not overlap.
plt.xticks(rotation=75)
plt.tight_layout()
plt.show()

### Histograms

In [None]:
# Histograms of the four numeric columns, 15 bins each.
df.hist(
    column=['sale_price', 'market_price', 'rating', 'Discount_percentage'],
    bins=15,
    figsize=(10, 6),
    grid=False,
    rwidth=0.8,
    color='purple',
)
plt.show()

### Top 10 selling products

In [None]:
# Occurrence count of every product name, most frequent first.
counts = df['product'].value_counts()
# Two-column frame (name, count) limited to the ten most frequent names.
counts_p = pd.DataFrame({'product': counts.index, 'Counts': counts.values}).head(10)

bx = sns.barplot(data=counts_p, x='product', y='Counts', color='brown')
bx.set(
    xlabel='Products',
    ylabel='No. of Products',
    title='Top 10 selling products by Volume',
)
plt.xticks(rotation=75, ha='right')

# Write each bar's count just above its top edge.
for bar in bx.patches:
    height = bar.get_height()
    centre = bar.get_x() + bar.get_width() / 2.
    bx.annotate(
        f'{int(height)}',
        (centre, height),
        ha='center',
        va='baseline',
        fontsize=10,
        color='black',
        xytext=(0, 4),
        textcoords='offset points',
    )

plt.tight_layout()
plt.show()

### Top 5 selling brands

In [None]:
# Occurrence count of every brand, most frequent first.
counts = df['brand'].value_counts()
# Two-column frame (brand, count) limited to the five most frequent brands.
counts_b = pd.DataFrame({'brand': counts.index, 'Counts': counts.values}).head(5)

cx = sns.barplot(data=counts_b, x='brand', y='Counts', color='lightblue')
cx.set(xlabel='Brand', title='Top 5 selling brands by Volume')

# Write each bar's count just above its top edge.
for bar in cx.patches:
    height = bar.get_height()
    centre = bar.get_x() + bar.get_width() / 2.
    cx.annotate(
        f'{int(height)}',
        (centre, height),
        ha='center',
        va='baseline',
        fontsize=10,
        color='black',
        xytext=(0, 4),
        textcoords='offset points',
    )

plt.tight_layout()
plt.show()

### Top 5 rated products

In [None]:
# Pick the five distinct products with the highest rating.
# The chart is titled "Top 5", so exactly five products are selected (the
# previous code took 7 rows). Rows are sorted by rating, highest first, and
# only the first row per product name is kept: plt.bar places repeated names
# on the same x position, so duplicate names would otherwise leave fewer
# visible bars than rows. kind='stable' keeps tie-breaking reproducible.
top_5_products = (
    df[['product', 'rating']]
    .sort_values('rating', ascending=False, kind='stable')
    .drop_duplicates(subset='product')
    .head(5)
)

plt.bar(top_5_products['product'], top_5_products['rating'], color='#2B7A78')
plt.xlabel('Product')
plt.ylabel('Rating')
plt.title('Top 5 Products by Rating')
# Product names are long; rotate and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Least 5 rated products

In [None]:
# Pick the five distinct products with the lowest rating.
# The chart is titled "Least 5", so exactly five products are selected (the
# previous code took 7 rows). Rows are sorted by rating, lowest first, and
# only the first row per product name is kept: plt.bar places repeated names
# on the same x position, so duplicate names would otherwise leave fewer
# visible bars than rows. kind='stable' keeps tie-breaking reproducible.
# The variable keeps its original name even though it now holds the
# lowest-rated products.
top_5_products = (
    df[['product', 'rating']]
    .sort_values('rating', ascending=True, kind='stable')
    .drop_duplicates(subset='product')
    .head(5)
)

plt.bar(top_5_products['product'], top_5_products['rating'], color='lightgreen')
plt.xlabel('Product')
plt.ylabel('Rating')
plt.title('Least 5 Products by Rating')
# Product names are long; rotate and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
# Same layout pass as the top-rated chart so rotated labels are not clipped.
plt.tight_layout()
plt.show()