In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
from scipy.stats import shapiro
from scipy.stats import spearmanr, pearsonr, kendalltau
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols

#### Types of statistics

1. Descriptive analysis
2. Inferential analysis

In [2]:
# Load the cleaned sales export from the working directory; the bare name on
# the last line makes the notebook render the resulting frame.
Salesdata = pd.read_csv(filepath_or_buffer='Cleanedsales.csv')
Salesdata

Unnamed: 0.1,Unnamed: 0,Date,Year,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,0,2013-11-06,2013,11019,19,Male,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
1,1,2015-11-06,2015,11019,19,Male,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
2,2,2014-03-03,2014,11039,49,Male,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1035,2401
3,3,2016-03-03,2016,11039,49,Male,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,900,2088
4,4,2014-04-25,2014,11046,47,Female,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,113031,2016-03-23,2016,29443,41,Male,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,72,184
113032,113032,2014-03-13,2014,29462,18,Male,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,528,1183
113033,113033,2016-03-13,2016,29462,18,Male,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,528,1183
113034,113034,2014-02-12,2014,29472,37,Female,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,576,1260


In [3]:
# Discard every column whose header starts with "Unnamed" (the loaded frame
# shows two of them: 'Unnamed: 0.1' and 'Unnamed: 0').
is_unnamed = Salesdata.columns.str.contains('^Unnamed')
Salesdata = Salesdata.loc[:, ~is_unnamed]
Salesdata

Unnamed: 0,Date,Year,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,2013-11-06,2013,11019,19,Male,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
1,2015-11-06,2015,11019,19,Male,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
2,2014-03-03,2014,11039,49,Male,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1035,2401
3,2016-03-03,2016,11039,49,Male,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,900,2088
4,2014-04-25,2014,11046,47,Female,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,2016-03-23,2016,29443,41,Male,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,72,184
113032,2014-03-13,2014,29462,18,Male,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,528,1183
113033,2016-03-13,2016,29462,18,Male,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,528,1183
113034,2014-02-12,2014,29472,37,Female,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,576,1260


In [4]:
# 1. Descriptive analysis: summary statistics (count, mean, std, min,
#    quartiles, max) for the frame's numeric columns.
numeric_summary = Salesdata.describe()
numeric_summary

Unnamed: 0,Year,Customer ID,Customer Age,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
count,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0
mean,2014.401739,19227.874341,35.919212,11.90166,267.296366,452.938427,469.318695,754.37036
std,1.27251,5307.581302,11.021936,9.561857,549.835483,922.071219,884.866118,1309.094674
min,2011.0,11000.0,17.0,1.0,1.0,2.0,1.0,2.0
25%,2013.0,14611.0,28.0,2.0,2.0,5.0,28.0,63.0
50%,2014.0,18664.0,35.0,10.0,9.0,24.0,108.0,223.0
75%,2016.0,23475.0,43.0,20.0,42.0,70.0,432.0,800.0
max,2016.0,29483.0,87.0,32.0,2171.0,3578.0,42978.0,58074.0


In [5]:
# Date, year and customer identifier are not meaningful to summarise with
# mean/std, so they are removed before describing the data.
columns_to_exclude = ['Date', 'Year', 'Customer ID']
filtered_df = Salesdata.drop(labels=columns_to_exclude, axis='columns')

# Summary statistics for the remaining numeric measures only.
filtered_df.describe()


Unnamed: 0,Customer Age,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
count,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0
mean,35.919212,11.90166,267.296366,452.938427,469.318695,754.37036
std,11.021936,9.561857,549.835483,922.071219,884.866118,1309.094674
min,17.0,1.0,1.0,2.0,1.0,2.0
25%,28.0,2.0,2.0,5.0,28.0,63.0
50%,35.0,10.0,9.0,24.0,108.0,223.0
75%,43.0,20.0,42.0,70.0,432.0,800.0
max,87.0,32.0,2171.0,3578.0,42978.0,58074.0


In [8]:
# Shapiro-Wilk normality check for a single column.
#
# The column name is held in one variable that both selects the data and
# labels the printed result, so the message always names the column that was
# actually tested. (Previously 'Customer Age' was tested while the comment
# and the printed text said "Unit Price".)
#
# NOTE: SciPy documents that for N > 5000 the W statistic is accurate but the
# p-value may not be. The describe() output above reports 113036 rows, so
# treat the p-value as indicative only.

Ho = "There is no significant difference i.e data is normally distributed"
H1 = "There is a significant difference i.e data is not normally distributed"

normality_col = 'Customer Age'
stats, p_value = shapiro(Salesdata[normality_col].values)

# 0.05 is the significance level: fail to reject Ho above it, reject otherwise.
if p_value > 0.05:
    print(f'{Ho} for {normality_col} (p-value = {p_value})')
else:
    print(f'{H1} for {normality_col} (p-value = {p_value})')

# Display the W test statistic as the cell output.
stats

There is a significant difference i.e data is not normally distributed for Unit Price (p-value = 8.346692177650135e-81)


  res = hypotest_fun_out(*samples, **kwds)


np.float64(0.9737821652249211)

In [9]:
# 2. Shapiro-Wilk normality test applied to several columns at once.
#    (This is a normality test, not a t-test.)
#
# NOTE: SciPy documents that for N > 5000 the W statistic is accurate but the
# p-value may not be; treat the p-values below as indicative only.
Ho = "There is no significant difference i.e data is normally distributed"
H1 = "There is a significant difference i.e data is not normally distributed"

col_list = ['Unit Price', 'Order Quantity', 'Revenue']

# Map each column name to [W statistic, p-value].
col_test = {}
for col in col_list:
    stats, p_value = shapiro(Salesdata[col].values)
    col_test[col] = [stats, p_value]

# Report the decision per column at the 0.05 significance level.
for col, (stats, p_value) in col_test.items():
    if p_value > 0.05:
        print(f'{Ho} for {col} (p-value = {p_value})')
    else:
        print(f'{H1} for {col} (p-value = {p_value})')


There is a significant difference i.e data is not normally for Unit Price (p-value = 4.4635882379389485e-161)
There is a significant difference i.e data is not normally for Order Quantity (p-value = 8.600496861970009e-116)
There is a significant difference i.e data is not normally for Revenue (p-value = 1.832585965483697e-158)


  res = hypotest_fun_out(*samples, **kwds)


#### CORRELATION

In [10]:
# 1. Pearson correlation matrix for unit price against revenue.
price_and_revenue = Salesdata.loc[:, ['Unit Price', 'Revenue']]
pearsonr_corr = price_and_revenue.corr(method='pearson')
pearsonr_corr

Unnamed: 0,Unit Price,Revenue
Unit Price,1.0,0.818522
Revenue,0.818522,1.0


In [13]:
# Pearson correlation matrix for order quantity against revenue.
# Note: this rebinds `pearsonr_corr`, replacing the Unit Price / Revenue matrix.
quantity_and_revenue = Salesdata.loc[:, ['Order Quantity', 'Revenue']]
pearsonr_corr = quantity_and_revenue.corr(method='pearson')
pearsonr_corr

Unnamed: 0,Order Quantity,Revenue
Order Quantity,1.0,-0.312895
Revenue,-0.312895,1.0


In [12]:
# 2. Spearman (rank-based) correlation matrix for order quantity against revenue.
quantity_revenue_pair = Salesdata.loc[:, ['Order Quantity', 'Revenue']]
spearmanr_corr = quantity_revenue_pair.corr(method='spearman')
spearmanr_corr

Unnamed: 0,Order Quantity,Revenue
Order Quantity,1.0,-0.166543
Revenue,-0.166543,1.0


In [None]:
# 3. Kendall's tau correlation (TODO: not implemented yet)