Online Sales Data Analysis

Orders.csv contains: Order ID (primary key), Order Date, CustomerName, State, City -> customer details.

Details.csv contains: Order ID (foreign key referencing Orders.csv; it repeats because one order can have several detail rows) and the line-item details of the orders listed in Orders.csv -> order details.

In [1]:
#Libraries
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [2]:
# Folder that holds the input CSV files. It defaults to the original local
# project folder; set the SALES_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path = os.environ.get(
    'SALES_DATA_DIR',
    '/Users/josephzhuo/anaconda_projects/eg-online-sales-analysis'
)

# os.path.join avoids doubled or missing separators if the folder is overridden.
details = pd.read_csv(os.path.join(path, 'Details.csv'))
orders = pd.read_csv(os.path.join(path, 'Orders.csv'))

In [3]:
# Preview the first rows of the details table to see its columns and sample values
details.head()

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category,PaymentMode
0,B-25681,1096,658,7,Electronics,Electronic Games,COD
1,B-26055,5729,64,14,Furniture,Chairs,EMI
2,B-25955,2927,146,8,Furniture,Bookcases,EMI
3,B-26093,2847,712,8,Electronics,Printers,Credit Card
4,B-25602,2617,1151,4,Electronics,Phones,Credit Card


In [4]:
# Preview the first rows of the orders table to see its columns and sample values
orders.head()


Unnamed: 0,Order ID,Order Date,CustomerName,State,City
0,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura
1,B-25993,03-02-2018,Madhav,Delhi,Delhi
2,B-25973,24-01-2018,Madan Mohan,Uttar Pradesh,Mathura
3,B-25923,27-12-2018,Gopal,Maharashtra,Mumbai
4,B-25757,21-08-2018,Vishakha,Madhya Pradesh,Indore


In [5]:
# Checking null values
def check_null_values(dataframes):
    """Print how many missing values each column holds, per DataFrame.

    Parameters
    ----------
    dataframes : dict
        Maps a display label to a pandas DataFrame.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    print("=[ Checking NULL Values ]=")
    for label, frame in dataframes.items():
        # isna().sum() yields one count per column
        null_counts = frame.isna().sum()
        print(f"\nNull values in {label}:\n{null_counts}\n")

# Checking unique values
def check_unique_values(dataframes, cat_columns_dict):
    """Print the distinct values of the chosen columns of each DataFrame.

    Parameters
    ----------
    dataframes : dict
        Maps a display label to a pandas DataFrame.
    cat_columns_dict : dict
        Maps the same labels to an iterable of column names to inspect.
        Every label in ``dataframes`` must be present, otherwise the
        lookup raises ``KeyError``.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    print("=[ Checking UNIQUE Values ]=\n")
    for label, frame in dataframes.items():
        for column in cat_columns_dict[label]:
            distinct = frame[column].unique()
            print(f"Unique values in '{column}' ({label}):\n{distinct}\n")

In [6]:
# Select the text (object-dtype) columns whose unique values are worth listing.
# Identifier-like columns (the order ID, and the order date for orders) are
# left out.
details_cat_columns = (
    details
    .select_dtypes(include=object)
    .drop(columns='Order ID')
    .columns
)

orders_cat_columns = (
    orders
    .select_dtypes(include=object)
    .drop(columns=['Order ID', 'Order Date'])
    .columns
)

# Display label -> DataFrame, consumed by the check_* helper functions
dataframes = {
    'Details Dataframe': details,
    'Orders Dataframe': orders,
}

# Display label -> categorical columns to inspect (same labels as above)
cat_columns_dict = {
    'Details Dataframe': details_cat_columns,
    'Orders Dataframe': orders_cat_columns,
}


In [7]:
# Report the number of missing values per column for both tables
check_null_values(dataframes)

=[ Checking NULL Values ]=

Null values in Details Dataframe:
Order ID        0
Amount          0
Profit          0
Quantity        0
Category        0
Sub-Category    0
PaymentMode     0
dtype: int64


Null values in Orders Dataframe:
Order ID        0
Order Date      0
CustomerName    0
State           0
City            0
dtype: int64



In [8]:
# List the distinct values of each selected categorical column for both tables
check_unique_values(dataframes, cat_columns_dict)

=[ Checking UNIQUE Values ]=

Unique values in 'Category' (Details Dataframe):
['Electronics' 'Furniture' 'Clothing']

Unique values in 'Sub-Category' (Details Dataframe):
['Electronic Games' 'Chairs' 'Bookcases' 'Printers' 'Phones' 'Trousers'
 'Saree' 'Hankerchief' 'Kurti' 'Skirt' 'Tables' 'Stole' 'Leggings'
 'Accessories' 'T-shirt' 'Furnishings' 'Shirt']

Unique values in 'PaymentMode' (Details Dataframe):
['COD' 'EMI' 'Credit Card' 'UPI' 'Debit Card']

Unique values in 'CustomerName' (Orders Dataframe):
['Harivansh' 'Madhav' 'Madan Mohan' 'Gopal' 'Vishakha' 'Sudevi' 'Shiva'
 'Sarita' 'Shishu' 'Vrinda' 'Uudhav' 'Shreyshi' 'Bhishm' 'Yogesh' 'Lalita'
 'Rohan' 'Gaurav' 'Amol' 'Aastha' 'Pooja' 'Shrichand' 'Hitesh' 'Shourya'
 'Ishpreet' 'Sudhir' 'Sauptik' 'Lisha' 'Bhawna' 'Sujay' 'Jay' 'Shruti'
 'Mohan' 'Neha' 'Shreyoshe' 'Hemant' 'Madhulika' 'Shardul' 'Sheetal'
 'Pournamasi' 'Surabhi' 'Manshul' 'Parth' 'Siddharth' 'Priyanka' 'Aarushi'
 'Vaibhav' 'Savi' 'Jahan' 'Parishi' 'Farah' 'Abhijeet

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the details DataFrame
details.describe()

Unnamed: 0,Amount,Profit,Quantity
count,1500.0,1500.0,1500.0
mean,291.847333,24.642,3.743333
std,461.92462,168.55881,2.184942
min,4.0,-1981.0,1.0
25%,47.75,-12.0,2.0
50%,122.0,8.0,3.0
75%,326.25,38.0,5.0
max,5729.0,1864.0,14.0


In [10]:
# Summary of the orders DataFrame. Its columns are all text, so describe()
# reports count / unique / top / freq rather than numeric statistics.
orders.describe()

Unnamed: 0,Order ID,Order Date,CustomerName,State,City
count,500,500,500,500,500
unique,500,307,336,19,25
top,B-26091,24-11-2018,Shreya,Maharashtra,Indore
freq,1,7,6,94,71


In [11]:
# Attach the order-level columns to every detail row: a left join on
# 'Order ID' with details as the left table, so every detail row is kept.
merged_df = details.merge(orders, how='left', on='Order ID')

# Dimensions (rows, columns) of the merged table
merged_df.shape

(1500, 11)

In [12]:
# Table loading to PBI: export the merged table as a CSV file.
# The destination is built once from `path` (the data folder defined in the
# loading cell) so the existence check below inspects the exact file that was
# written. The previous check used the relative name 'data_pbi.csv', which is
# resolved against the current working directory and could report success for
# a different, possibly stale, file.
output_path = path + '/data_pbi.csv'
merged_df.to_csv(output_path, index=False)

# To check if the file was created successfully
if os.path.exists(output_path):
    print("CSV file created successfully!")
else:
    print(f"CSV file was not found at {output_path}")

CSV file created successfully!
