### Step 0: Imports and Reading Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mtick
import matplotlib.lines as lines
import matplotlib.image as mpimg
import matplotlib.colors as mcolors
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the Amazon sales dataset through kagglehub and show where the files
# were placed on disk.
import kagglehub
path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset")
print("Path to dataset files:", path)

In [None]:
import os

# Read the CSV from the directory returned by kagglehub.dataset_download
# (`path`, set in the download cell) rather than from a machine-specific
# absolute path, so the notebook runs on any machine.
# NOTE(review): assumes the downloaded dataset contains "amazon.csv" — confirm.
dataFrame = pd.read_csv(os.path.join(path, "amazon.csv"))

### Step 0.1: Design the Charts

In [None]:
def chart_styling(ax, fig, title, bars, logo_path='logo.png'):
    """Apply the notebook's shared look to a bar-style chart.

    Args:
        ax: Matplotlib axes that hold the plot.
        fig: Matplotlib figure that owns ``ax``.
        title: Text drawn in bold serif above the axes.
        bars: Iterable of bar artists (for example the container returned by
            ``ax.bar`` or ``ax.patches``); each one is recoloured.
        logo_path: Image shown in the top-right corner. When it names a file
            that does not exist, or is ``None``, the logo is skipped.

    Returns:
        None. ``fig`` and ``ax`` are modified in place.
    """
    import os

    background = '#D3D3D3'
    fig.patch.set_facecolor(background)
    ax.set_facecolor(background)

    fig.text(0.09, 1.05, title, fontsize=18, fontweight='bold', fontfamily='serif')

    # The logo is decoration only, so a missing file must not abort the chart.
    # Non-path inputs (e.g. open file objects) are passed straight to imread.
    logo_missing = (
        isinstance(logo_path, (str, os.PathLike))
        and not os.path.isfile(logo_path)
    )
    if logo_path is not None and not logo_missing:
        logo = mpimg.imread(logo_path)
        logo_ax = fig.add_axes([0.85, 0.85, 0.1, 0.1])
        logo_ax.imshow(logo)
        logo_ax.axis('off')

    ax.grid(axis='y', linestyle='-', alpha=0.2)
    ax.set_axisbelow(True)

    # Only the bottom spine stays visible.
    for spine in ('top', 'right', 'left'):
        ax.spines[spine].set_visible(False)

    ax.tick_params(axis='both', which='major', labelsize=12)
    # Rotate through tick_params rather than set_xticklabels(get_xticklabels()):
    # the latter freezes the label text without fixing tick positions, which
    # mislabels numeric axes once the ticks move.
    ax.tick_params(axis='x', labelrotation=90)

    # Thin vertical rule along the right edge of the figure.
    l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
    fig.lines.extend([l1])

    for bar in bars:
        # set_color paints face and edge; the edge is then overridden.
        bar.set_color('#008080')
        bar.set_edgecolor('#000000')
        bar.set_linewidth(0.5)

#### Pie Charts Design

In [None]:
def pie_chart_styling(values, labels, title, figsize=(18, 10)):
    """Draw and show a styled pie chart with a percentage legend.

    Args:
        values: Wedge sizes; must sum to a non-zero total.
        labels: One label per value, used on the wedges and in the legend.
        title: Text centred above the chart.
        figsize: Figure size in inches passed to ``plt.subplots``.

    Raises:
        ValueError: If ``values`` is empty or sums to zero, because no
            percentages can be computed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    values = list(values)
    labels = list(labels)

    # Validate before creating a figure so a bad input leaves no empty canvas.
    total = sum(values)
    if total == 0:
        raise ValueError("pie_chart_styling needs values with a non-zero sum")
    percentages = [v / total * 100 for v in values]

    # Give every wedge its own colour. The first four entries are the
    # original palette; beyond the palette size fall back to Matplotlib's
    # default colour cycle instead of silently repeating colours.
    palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99',
               '#c2c2f0', '#ffb3e6', '#c4e17f', '#76d7c4']
    colors = palette[:len(values)] if len(values) <= len(palette) else None

    fig, ax = plt.subplots(figsize=figsize)

    wedges, _, autotexts = ax.pie(values,
                                  labels=labels,
                                  autopct='%1.1f%%',
                                  startangle=140,
                                  colors=colors)

    fig.patch.set_facecolor('#f0f0f0')
    ax.set_facecolor('#f0f0f0')

    fig.text(0.5, 0.95, title, ha='center', fontsize=18, fontweight='bold', fontfamily='serif')

    # Make the in-wedge percentage labels readable.
    for autotext in autotexts:
        autotext.set_color('black')
        autotext.set_fontsize(12)
        autotext.set_fontweight('bold')

    # Legend sits to the right of the pie and repeats each share.
    legend_labels = [f'{l} ({p:.1f}%)' for l, p in zip(labels, percentages)]
    ax.legend(wedges,
              legend_labels,
              loc='center left',
              bbox_to_anchor=(1, 0.5),
              fontsize=12)

    plt.show()


### Step 1: Data Understanding


In [None]:
# Report the dataset dimensions: number of rows, then number of columns.
rowCount, columnCount = dataFrame.shape
print(rowCount)
print(columnCount)

In [None]:
# Number of rows that repeat an earlier row across every column.
dataFrame.duplicated().sum()

In [None]:
# Missing-value count for each column.
dataFrame.isnull().sum()


In [None]:
# Data type pandas assigned to each column when the CSV was read.
dataFrame.dtypes

In [None]:
# Summary statistics for the raw frame, before any cleaning.
dataFrame.describe()

In [None]:
# Missing entries per column, largest first, drawn as a styled bar chart.
missingData = dataFrame.isna().sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(18, 10))
bars = ax.bar(x=missingData.index, height=missingData.values)
chart_styling(ax=ax, fig=fig, title="Missing Values", bars=bars)
plt.show()

In [None]:
# Print how many distinct values each column holds; a missing value counts
# as one distinct value (dropna=False).
for column, distinctCount in dataFrame.nunique(dropna=False).items():
    print(f"{column} : {distinctCount}")

### Step 2: Data Preparation


In [None]:
# Keep only the columns used in the analysis. The explicit .copy() makes
# `df` an independent frame, so the column assignments in the following
# cells modify `df` itself and never act on a slice of `dataFrame`.
df = dataFrame[['product_id',
               'product_name',
               'category',
               'discounted_price',
               'actual_price',
               'discount_percentage',
               'rating',
               'rating_count',
               ]].copy()

#### Fill the NULL Values With 0

In [None]:
# rating_count is still text at this point, so fill gaps with the string '0'
# rather than the integer 0: this keeps the column uniformly textual for the
# later .str clean-up and numeric parse, which turns '0' into 0.
df['rating_count'] = df['rating_count'].fillna('0')

#### Convert Object Values To The Numeric Values

In [None]:
# Both price columns: strip the rupee sign and comma separators, parse,
# turn unparseable entries into 0 and store as integers.
for priceColumn in ('discounted_price', 'actual_price'):
    priceText = df[priceColumn].str.replace('₹', '').str.replace(',', '')
    df[priceColumn] = pd.to_numeric(priceText, errors='coerce').fillna(0).astype(int)

# Discount: drop the percent sign and keep as float.
percentText = df['discount_percentage'].str.replace('%', '')
df['discount_percentage'] = pd.to_numeric(percentText, errors='coerce').fillna(0).astype(float)

# Rating: parse directly; anything non-numeric becomes 0.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0).astype(float)

# Rating count: remove comma separators and store as integers.
ratingCountText = df['rating_count'].str.replace(',', '')
df['rating_count'] = pd.to_numeric(ratingCountText, errors='coerce').fillna(0).astype(int)

#### Separate the Categories for Later Use

In [None]:
# Break each pipe-delimited category string into its parts and collect the
# distinct part names.
categoryParts = dataFrame['category'].str.split('|').explode()
categories = categoryParts.unique()
print(categories)

## Step 3: Features


#### Count of Orders By Category

In [None]:
# One row per category part, then the ten most frequent parts as a bar chart.
categoryLists = df['category'].str.split('|')
dfExploded = df.assign(category=categoryLists).explode('category')
topCategories = dfExploded['category'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(18, 10))
# sns.barplot draws on `ax` and returns it, so its patches are ax.patches.
bars = sns.barplot(x=topCategories.index, y=topCategories.values, ax=ax)
chart_styling(ax=ax, fig=fig, title="Count of Orders By Category", bars=ax.patches)
plt.show()


#### Distribution of Ratings

In [None]:
# Bucket the products by rating band. Ratings equal to 0 fall outside every
# band because the first band starts strictly above 0.
ratingQueries = (
    '0 < rating < 1',
    '1 <= rating < 2',
    '2 <= rating < 3',
    '3 <= rating < 4',
    '4 <= rating <= 5',
)
ratingBuckets = [df.query(expr).index for expr in ratingQueries]
(ratingZeroToOne, ratingOneToTwo, ratingTwoToThree,
 ratingThreeToFour, ratingFourToFive) = ratingBuckets

values = [len(bucket) for bucket in ratingBuckets]
labels = ['0-1', '1-2', '2-3', '3-4', '4-5']

pie_chart_styling(values=values, labels=labels, title="Distribution of Ratings")



#### Distribution Of Discount Percentages

In [None]:
# Bucket the products by discount band. A discount of exactly 0 falls
# outside every band because the first band starts strictly above 0.
discountQueries = (
    '0 < discount_percentage < 25',
    '25 <= discount_percentage < 50',
    '50 <= discount_percentage < 75',
    '75 <= discount_percentage <= 100',
)
discountBuckets = [df.query(expr) for expr in discountQueries]
(discountZeroToTwentyFive, discountTwentyFiveToFifty,
 discountFiftyToSeventyFive, discountSeventyFiveToOneHundred) = discountBuckets

values = [len(bucket) for bucket in discountBuckets]
labels = ['0-25%', '25-50%', '50-75%', '75-100%']

pie_chart_styling(values=values, labels=labels, title="Distribution of Discount Percentages")


#### Distribution Of Actual Prices

In [None]:
# Histogram of actual prices in 30 bins with a KDE curve overlaid.
m = df['actual_price']

fig, ax = plt.subplots(figsize=(18, 10))
# sns.histplot draws on `ax` and returns it, so its patches are ax.patches.
bars = sns.histplot(x=m, bins=30, kde=True, ax=ax)
chart_styling(ax=ax, fig=fig, title="Distribution Of Actual Prices", bars=ax.patches)
plt.show()

In [None]:
# NOTE(review): `numericColumns` is not assigned anywhere in the visible part
# of this notebook — confirm it is defined in another cell, otherwise this
# cell raises NameError.
numericColumns