In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 
from plotly.subplots import make_subplots
from scipy.stats import shapiro
from scipy.stats import spearmanr, pearsonr, kendalltau
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols


In [2]:
salesdata = pd.read_csv('Cleanedsales.csv')

In [3]:
salesdata.head()

Unnamed: 0.1,Unnamed: 0,Date,Year,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,0,2013-11-06,2013,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
1,1,2015-11-06,2015,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
2,2,2014-03-03,2014,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1035,2401
3,3,2016-03-03,2016,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,900,2088
4,4,2014-04-25,2014,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,180,418


## Types of Statistics

1. Descriptive analysis
2. Inferential analysis

In [4]:
# 1. Descriptive analysis = it helps to describe a data

salesdata.describe()


Unnamed: 0.1,Unnamed: 0,Year,Customer ID,Customer Age,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
count,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0
mean,56517.5,2014.401739,19227.874341,35.919212,11.90166,267.296366,452.938427,469.318695,754.37036
std,32630.826851,1.27251,5307.581302,11.021936,9.561857,549.835483,922.071219,884.866118,1309.094674
min,0.0,2011.0,11000.0,17.0,1.0,1.0,2.0,1.0,2.0
25%,28258.75,2013.0,14611.0,28.0,2.0,2.0,5.0,28.0,63.0
50%,56517.5,2014.0,18664.0,35.0,10.0,9.0,24.0,108.0,223.0
75%,84776.25,2016.0,23475.0,43.0,20.0,42.0,70.0,432.0,800.0
max,113035.0,2016.0,29483.0,87.0,32.0,2171.0,3578.0,42978.0,58074.0


In [5]:
salesdata.head()

Unnamed: 0.1,Unnamed: 0,Date,Year,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,0,2013-11-06,2013,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
1,1,2015-11-06,2015,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,360,950
2,2,2014-03-03,2014,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1035,2401
3,3,2016-03-03,2016,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,900,2088
4,4,2014-04-25,2014,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,180,418


In [6]:
# remove the first two columns 

data = salesdata.iloc[:,2:]

description = data.describe

print(description)

<bound method NDFrame.describe of         Year  Customer ID  Customer Age Customer Gender         Country  \
0       2013        11019            19               M          Canada   
1       2015        11019            19               M          Canada   
2       2014        11039            49               M       Australia   
3       2016        11039            49               M       Australia   
4       2014        11046            47               F       Australia   
...      ...          ...           ...             ...             ...   
113031  2016        29443            41               M  United Kingdom   
113032  2014        29462            18               M       Australia   
113033  2016        29462            18               M       Australia   
113034  2014        29472            37               F          France   
113035  2016        29472            37               F          France   

                   State Product Category Sub Category           

In [7]:
# Exclude unwanted columns before description
columns_to_exclude = ['Date', 'Year', 'Customer ID']
filtered_df = salesdata.drop(columns=columns_to_exclude)

# Perform descriptive analysis on relevant data
filtered_df.describe()

Unnamed: 0.1,Unnamed: 0,Customer Age,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
count,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0
mean,56517.5,35.919212,11.90166,267.296366,452.938427,469.318695,754.37036
std,32630.826851,11.021936,9.561857,549.835483,922.071219,884.866118,1309.094674
min,0.0,17.0,1.0,1.0,2.0,1.0,2.0
25%,28258.75,28.0,2.0,2.0,5.0,28.0,63.0
50%,56517.5,35.0,10.0,9.0,24.0,108.0,223.0
75%,84776.25,43.0,20.0,42.0,70.0,432.0,800.0
max,113035.0,87.0,32.0,2171.0,3578.0,42978.0,58074.0


In [8]:
# Check our UnitPrice columns for normality between the values

Ho = 'There is no significant difference i.e data is normally distributed'
H1 = 'There is significant difference i.e data is not normally distributed'

stats, p_value = shapiro(salesdata['Unit Price']. values)

if p_value > 0.05:
    print(f'{Ho} for Unit Price (p_values = {p_value})')
else:
    print(f'{H1} for Unit Price (p_values = {p_value})')

There is significant difference i.e data is not normally distributed for Unit Price (p_values = 0.0)




In [9]:
# T-test for 3 or more value together 

Ho = 'There is no significant difference i.e data is normally distributed'
H1 = 'There is significant difference i.e data is not normally distributed'

col_test = {}

col_list = ['Unit Price', 'Order Quantity', 'Revenue']

for col in col_list:
    stats, p_value = shapiro(salesdata[col].values)
    stats_list =[stats, p_value]
    col_test.update({col: stats_list})

for col, stats_pvalue in col_test.items():
    stats_pvalue = stats_pvalue
    if p_value > 0.05:
        print(f'{Ho} for the {col} (p-value = {p_value})')
    else:
        print(f'{Ho} for the {col} (p-value = {p_value})')

There is no significant difference i.e data is normally distributed for the Unit Price (p-value = 0.0)
There is no significant difference i.e data is normally distributed for the Order Quantity (p-value = 0.0)
There is no significant difference i.e data is normally distributed for the Revenue (p-value = 0.0)


# CORRELATION 
types of correlation 
1. person correlation - it assume that they are normally distributed and use to measure linear (there p_values are greater 0.05)

In [10]:
pearsonr_corr = salesdata[['Unit Price', 'Revenue']].corr('pearson')
pearsonr_corr

Unnamed: 0,Unit Price,Revenue
Unit Price,1.0,0.818522
Revenue,0.818522,1.0


In [11]:
# spearmanr_corr
spearmanr_corr = salesdata[['Order Quantity', 'Revenue']].corr('spearman')
spearmanr_corr

Unnamed: 0,Order Quantity,Revenue
Order Quantity,1.0,-0.166543
Revenue,-0.166543,1.0


In [12]:
# kendallTau

kendalltau_corr = salesdata[['Unit Price', 'Revenue']].corr('spearman')
kendalltau_corr

Unnamed: 0,Unit Price,Revenue
Unit Price,1.0,0.892268
Revenue,0.892268,1.0


In [14]:
model = ols('Revenue ~ Country * State', data= salesdata).fit()
anova_table = sm.stats.anova_lm(model, typ = 2)

print(anova_table)

                     sum_sq        df          F        PR(>F)
Country        1.450012e+05       5.0   0.017065  8.960656e-01
State          1.875552e+09      52.0  21.224246  4.812985e-25
Country:State  1.364100e+08     260.0   0.308730  9.327604e-01
Residual       1.920023e+11  112983.0        NaN           NaN


