In [20]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

import io
import itertools
from decimal import Decimal 

from scipy.stats import anderson
from scipy.stats import shapiro
from scipy.stats import zscore
from scipy.stats import mannwhitneyu
from scipy.stats import linregress

import warnings

### loading data

In [21]:
# Read the raw Superstore export (decoded as Latin-1) and preview the first rows.
df_superstore = pd.read_csv(
    './superstore_dataset.csv',
    encoding='latin-1',
)

df_superstore.head()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,State,Postal_Code,Region,Product_ID,Category,Sub_Category,Product_Name,Sales
0,1,CA-2017-152156,8/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,8/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/6/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O Donnel,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O Donnel,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold N Roll Cart System,22.368


In [22]:
# Column overview: dtype and non-null count of every column.
df_superstore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row_ID         9800 non-null   int64  
 1   Order_ID       9800 non-null   object 
 2   Order_Date     9800 non-null   object 
 3   Ship_Date      9800 non-null   object 
 4   Ship_Mode      9800 non-null   object 
 5   Customer_ID    9800 non-null   object 
 6   Customer_Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal_Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product_ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub_Category   9800 non-null   object 
 16  Product_Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

In [23]:
# count the missing values in every column

df_superstore.isnull().sum()

Row_ID            0
Order_ID          0
Order_Date        0
Ship_Date         0
Ship_Mode         0
Customer_ID       0
Customer_Name     0
Segment           0
Country           0
City              0
State             0
Postal_Code      11
Region            0
Product_ID        0
Category          0
Sub_Category      0
Product_Name      0
Sales             0
dtype: int64

Do the 11 rows without a Postal_Code belong to 11 different customers, or to fewer customers with several orders each?

In [24]:
# which customers placed the orders that lack a Postal Code?

postal_missing = df_superstore['Postal_Code'].isna()
df_nan = df_superstore.loc[postal_missing]
df_nan['Customer_ID'].unique()

array(['QJ-19255', 'SV-20785', 'VM-21685', 'CB-12535', 'RM-19375'],
      dtype=object)

Are there other orders from these customers where the Postal Code is filled in?

In [25]:
# For every customer that has at least one order without a Postal Code,
# list all Postal Codes that appear on that customer's orders.

# Derive the affected customers from the data instead of pasting the IDs from
# an earlier output, so the cell stays correct if the dataset changes.
# .tolist() keeps nan_customer a plain list, as before.
nan_customer = (
    df_superstore.loc[df_superstore['Postal_Code'].isna(), 'Customer_ID']
    .unique()
    .tolist()
)

# all orders (with or without Postal Code) of the affected customers
df_nan = df_superstore[df_superstore['Customer_ID'].isin(nan_customer)]

# one list of Postal Codes per customer (a Series indexed by Customer_ID)
df_postal_codes = df_nan.groupby('Customer_ID')['Postal_Code'].apply(list)

df_postal_codes

Customer_ID
CB-12535    [27514.0, 44221.0, 94110.0, 2038.0, 2038.0, 19...
QJ-19255    [55106.0, 19143.0, 19143.0, nan, 10009.0, 2215...
RM-19375    [77095.0, 95123.0, 6450.0, 31907.0, 27604.0, n...
SV-20785    [81001.0, 81001.0, 30076.0, 30076.0, 30076.0, ...
VM-21685    [7090.0, 19140.0, 19140.0, 98105.0, 98105.0, 9...
Name: Postal_Code, dtype: object

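Each of these customers has orders with several different Postal Codes, so the missing value cannot be inferred from their other orders. I therefore drop the 11 rows without a Postal Code.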

In [26]:
# Drop only the rows whose Postal Code is missing.
# Restricting dropna to that column prevents rows from being removed silently
# if another column ever contains NaN; re-assigning avoids in-place mutation.

df_superstore = df_superstore.dropna(subset=['Postal_Code'])

## change datatypes

Order_Date - datetime  
Ship_Date - datetime  
Postal_Code - integer  
Sales - decimal

In [None]:
## setting datatypes

# dates: the CSV stores them day-first (e.g. 16/06/2017)
for date_col in ('Order_Date', 'Ship_Date'):
    df_superstore[date_col] = pd.to_datetime(df_superstore[date_col], format='%d/%m/%Y')

# postcode as integer
# NOTE(review): as integers, codes with a leading zero are shown without it (e.g. 2038) - confirm this is acceptable
df_superstore['Postal_Code'] = df_superstore['Postal_Code'].astype('int')

# sales with 2 decimals (stored as Decimal objects)
cent = Decimal('0.01')
df_superstore['Sales'] = df_superstore['Sales'].apply(lambda amount: Decimal(amount).quantize(cent))



In [30]:
# preview the first five rows after the datatype conversion
df_superstore.head(n=5)

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,State,Postal_Code,Region,Product_ID,Category,Sub_Category,Product_Name,Sales
0,1,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,2017-06-12,2017-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O Donnel,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.58
4,5,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O Donnel,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold N Roll Cart System,22.37


## EDA

### period of time

In [None]:
# from when to when do we have dates in this dataset?

# earliest and latest order date, each computed once
first_order = df_superstore['Order_Date'].min()
last_order = df_superstore['Order_Date'].max()

# typo in the printed message fixed ("observerd" -> "observed")
print(f"The observed time period is from {first_order.strftime('%Y-%m-%d')}"
      f" until {last_order.strftime('%Y-%m-%d')}.")

the observerd time period is from 2015-01-03 until 2018-12-30


### number of customers

In [37]:
# number of distinct customer IDs

n_customers = df_superstore['Customer_ID'].nunique()
print(f"The number of distinct customers is {n_customers}.")

The number of distinct customers is 793.


### number of orders

In [39]:
# number of distinct order IDs

n_orders = df_superstore['Order_ID'].nunique()
print(f"The number of orders is {n_orders}.")

The number of orders is 4916.


### number of products

In [61]:
# number of distinct product IDs (printed with a thousands separator)

n_products = df_superstore['Product_ID'].nunique()
print(f"There are {n_products:,} different products.")

There are 1,860 different products.


### categories: clients & products

In [None]:
# what customer categories (typo in the printed message fixed: "segemnts" -> "segments")
print(f"The customers are divided into {df_superstore['Segment'].nunique()} segments: {df_superstore['Segment'].unique()}.")

# what product categories
print(f"The number of different product categories is {df_superstore['Category'].nunique()}" 
      f": {df_superstore['Category'].unique()}.")

# highest, lowest and average value of the Sales column
print(f"The price range is from {df_superstore['Sales'].min():,}$ to {df_superstore['Sales'].max():,}$" 
      f" with an average of {df_superstore['Sales'].mean():,.2f}$.")


The customers are divided into 3 segemnts: ['Consumer' 'Corporate' 'Home Office'].
The number of different product categories is 3: ['Furniture' 'Office Supplies' 'Technology'].
The price range is from 0.44$ to 22,638.48$ with an average of 230.12$.


### conclusions from orders: prices, amounts, discounts

In [59]:
## how many different products per order and average price

# both aggregates share the same grouping by order
per_order = df_superstore.groupby('Order_ID')

# distinct product IDs in each order
df_products_pO = per_order['Product_ID'].nunique().reset_index(name='Unique_Products')

# total Sales value of each order
df_order_sales = per_order['Sales'].sum().reset_index(name='Sum_Sales')

# call the numbers
avg_products = df_products_pO['Unique_Products'].mean()
avg_order_value = df_order_sales['Sum_Sales'].mean()
print(f"On average there are {avg_products:.1f} different products per order"
      f" and a value of {avg_order_value:,.2f}$.")

On average there are 2.0 different products per order and a value of 458.22$.


Can we derive a price per product? Do the Sales values of the same product differ because of discounts, or also because of the quantity per order (e.g. 4 chairs in one order)?

In [60]:
# does the same Product_ID show up with different Sales values?

sales_by_product = df_superstore.groupby('Product_ID')['Sales']

# one list of Sales values per product
df_prices = sales_by_product.apply(list)

df_prices.head()

Product_ID
FUR-BO-10000112                                          [825.17]
FUR-BO-10000330                          [411.33, 411.33, 241.96]
FUR-BO-10000362         [359.06, 290.67, 136.78, 1025.88, 341.96]
FUR-BO-10000468    [145.74, 194.32, 102.02, 48.58, 77.73, 155.46]
FUR-BO-10000711                                  [638.82, 212.94]
Name: Sales, dtype: object

### RFM frame: customer habits

In [None]:
# how many orders per customer