### Step 0: Imports and Reading Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mtick
import matplotlib.lines as lines
import matplotlib.image as mpimg
import matplotlib.colors as mcolors
import seaborn as sns

In [None]:
import kagglehub

# Download the Amazon sales dataset and report where kagglehub stored it.
dataset_handle = "karkavelrajaj/amazon-sales-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

In [None]:
from pathlib import Path

# Read the CSV from the directory kagglehub downloaded in the previous cell
# instead of a machine-specific absolute path, so the notebook runs anywhere.
# NOTE(review): assumes the dataset ships its data as "amazon.csv" — confirm.
dataFrame = pd.read_csv(Path(path) / "amazon.csv")

### Step 0.1: Design the Charts

In [None]:
def chart_styling(ax, fig, title, bars, logo_path='logo.png'):
    """Apply the notebook's shared look to a bar chart.

    Args:
        ax: Matplotlib axes that holds the bars.
        fig: Figure that owns ``ax``.
        title: Text drawn as the chart title.
        bars: Iterable of bar artists (e.g. the return value of ``ax.bar``).
        logo_path: Path of the image placed in the top-right corner.
            Pass ``None`` to draw the chart without a logo.
    """
    background = '#D3D3D3'
    fig.patch.set_facecolor(background)
    ax.set_facecolor(background)

    # The title is figure text rather than an axes title; y=1.05 puts it
    # just above the top edge of the figure.
    fig.text(0.09, 1.05, title, fontsize=18, fontweight='bold', fontfamily='serif')

    if logo_path is not None:
        logo = mpimg.imread(logo_path)
        # Small dedicated axes so the logo does not disturb the main plot.
        logo_ax = fig.add_axes([0.85, 0.85, 0.1, 0.1])
        logo_ax.imshow(logo)
        logo_ax.axis('off')

    ax.grid(axis='y', linestyle='-', alpha=0.2)
    ax.set_axisbelow(True)

    for spine in ('top', 'right', 'left'):
        ax.spines[spine].set_visible(False)

    ax.tick_params(axis='both', which='major', labelsize=12)
    # Rotate the x labels through tick_params: re-assigning the labels with
    # set_xticklabels() would pin a fixed formatter just to change an angle.
    ax.tick_params(axis='x', labelrotation=90)

    # Thin vertical rule along the right edge of the figure.
    border = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
    fig.lines.append(border)

    for bar in bars:
        # set_color() also sets the edge colour, so the edge is set afterwards.
        bar.set_color('#008080')
        bar.set_edgecolor('#000000')
        bar.set_linewidth(0.5)

### Step 1: Data Understanding


In [None]:
# Row count first, then column count.
n_rows, n_columns = dataFrame.shape
print(n_rows)
print(n_columns)

In [None]:
# Flag rows that repeat an earlier row, then count the flags.
duplicate_mask = dataFrame.duplicated()
duplicate_mask.sum()

In [None]:
# Number of missing values in each column.
null_mask = dataFrame.isnull()
null_mask.sum()


In [None]:
# Show the dtype pandas assigned to every column of the raw data.
dataFrame.dtypes

In [None]:
# Display summary statistics of the raw data.
dataFrame.describe()

In [None]:
# Count the nulls per column, largest first, and plot them as a bar chart.
null_counts = dataFrame.isnull().sum()
missingData = null_counts.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(18, 10))
bars = ax.bar(missingData.index, missingData.values)
chart_styling(ax=ax, fig=fig, title="Missing Values", bars=bars)

plt.show()

In [None]:
# Print how many distinct values each column holds.
for column, values in dataFrame.items():
    distinct = values.unique()
    print(f"{column} : {len(distinct)}")

### Step 2: Data Preparation


In [None]:
# Keep only the columns used in the analysis. The explicit copy makes df
# independent of dataFrame, so the column assignments in the following cells
# do not trigger pandas' SettingWithCopyWarning.
selected_columns = [
    'product_id',
    'product_name',
    'category',
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
    'user_id',
    'review_id',
]
df = dataFrame[selected_columns].copy()

#### Fill the missing (NULL) values with 0

In [None]:
# rating_count is still text at this point (it is parsed with .str later),
# so fill the gaps with the string '0' rather than the integer 0 to keep
# every value in the column a string.
df['rating_count'] = df['rating_count'].fillna('0')

#### Convert Object (String) Columns to Numeric Values

In [None]:
# For each numeric column: the symbols to strip from the text, and the dtype
# the parsed values are cast to.
_numeric_columns = {
    'discounted_price': (('₹', ','), int),
    'actual_price': (('₹', ','), int),
    'discount_percentage': (('%',), float),
    'rating': ((), float),
    'rating_count': ((',',), int),
}

for _column, (_symbols, _dtype) in _numeric_columns.items():
    _values = df[_column]
    for _symbol in _symbols:
        _values = _values.str.replace(_symbol, '')
    # Entries that cannot be parsed become NaN and are then replaced by 0.
    _parsed = pd.to_numeric(_values, errors='coerce')
    df[_column] = _parsed.fillna(0).astype(_dtype)

In [68]:
# Print the dtype and non-null count of each column in the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_id           1465 non-null   object 
 1   product_name         1465 non-null   object 
 2   category             1465 non-null   object 
 3   discounted_price     1465 non-null   int64  
 4   actual_price         1465 non-null   int64  
 5   discount_percentage  1465 non-null   float64
 6   rating               1465 non-null   float64
 7   rating_count         1465 non-null   int64  
 8   user_id              1465 non-null   object 
 9   review_id            1465 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 114.6+ KB
