#### 

# Regional Sales Analysis

# 

In [1]:
#   --- Importing important libraries ---

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')



In [2]:

# Load the workbook. sheet_name=None makes read_excel return every worksheet
# at once, as a dict of DataFrames keyed by sheet name.

sheets = pd.read_excel(io='Regional Sales Dataset.xlsx', sheet_name=None)


In [3]:


# Bind each worksheet to its own DataFrame variable in a single unpacking;
# the order of the names on the left matches the order of the sheet names.
# NOTE(review): 'State Regions' appears to load with placeholder headers
# (Column1..Column3) and the real header as its first data row - confirm and
# promote that row before the table is used.

(Sales_Orders, Customers, Regions,
 State_Regions, Products, Budgets_2017) = (
    sheets[sheet_name]
    for sheet_name in ('Sales Orders', 'Customers', 'Regions',
                       'State Regions', 'Products', '2017 Budgets')
)




## Sales Orders Table Overview and Analysis

### 

In [6]:
# Preview the first four order lines.
Sales_Orders.head(n=4)

Unnamed: 0,OrderNumber,OrderDate,Customer Name Index,Channel,Currency Code,Warehouse Code,Delivery Region Index,Product Description Index,Order Quantity,Unit Price,Line Total,Total Unit Cost
0,SO - 000225,2014-01-01,126,Wholesale,USD,AXW291,364,27,6,2499.1,14994.6,1824.343
1,SO - 0003378,2014-01-01,96,Distributor,USD,AXW291,488,20,11,2351.7,25868.7,1269.918
2,SO - 0005126,2014-01-01,8,Wholesale,USD,AXW291,155,26,6,978.2,5869.2,684.74
3,SO - 0005614,2014-01-01,42,Export,USD,AXW291,473,7,7,2338.3,16368.1,1028.852


In [11]:
# Column dtypes and non-null counts for the order lines.
# NOTE(review): the saved output lists snake_case column names although the
# renaming cell sits further down the notebook - re-run top to bottom so the
# output matches the document order.
Sales_Orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64104 entries, 0 to 64103
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ordernumber                64104 non-null  object        
 1   orderdate                  64104 non-null  datetime64[ns]
 2   customer_name_index        64104 non-null  int64         
 3   channel                    64104 non-null  object        
 4   currency_code              64104 non-null  object        
 5   warehouse_code             64104 non-null  object        
 6   delivery_region_index      64104 non-null  int64         
 7   product_description_index  64104 non-null  int64         
 8   order_quantity             64104 non-null  int64         
 9   unit_price                 64104 non-null  float64       
 10  line_total                 64104 non-null  float64       
 11  total_unit_cost            64104 non-null  float64       
dtypes: d

In [32]:
# Count rows that are exact repeats of an earlier row (all columns compared).
Sales_Orders.duplicated().sum()

0

In [18]:
# Distribution of currencies across all order lines.
# The labels are normalised on a temporary copy so the lookup works whether or
# not the column-renaming cell further down has run yet: on a fresh
# top-to-bottom run the sheet still carries its original 'Currency Code'
# header at this point, and a plain ['currency_code'] lookup raises KeyError.
# Sales_Orders itself is not modified here.
Sales_Orders.rename(columns=lambda label: label.replace(' ', '_').lower())['currency_code'].value_counts()

currency_code
USD    64104
Name: count, dtype: int64

**Data Quality Assessment**

- Sales Orders data contains 64,104 sales records.
- Datatype of every column is correct.
- There are no null values or duplicated values present in the dataset.
- The currency_code column contains a single value (USD) indicating all the transactions and prices are in US dollars. We can remove the currency_code column safely.
- We can standardize the column name to lower case and replace the spaces with underscore to ensure data consistency.

In [33]:

# Standardise headers to snake_case: lower-case first, then spaces -> underscores.
Sales_Orders.columns = Sales_Orders.columns.str.lower().str.replace(' ', '_')



In [19]:
# Remove the constant currency_code column (every order is in USD).
# drop(..., errors='ignore') keeps this cell re-runnable: `del` raises
# KeyError when the cell is executed a second time because the column is
# already gone. drop() returns a new frame, so the name is rebound to it.
Sales_Orders = Sales_Orders.drop(columns='currency_code', errors='ignore')

In [20]:
# Summary statistics for the date and numeric columns.
# NOTE(review): the saved output already includes profit_per_unit, which is
# only created in a later cell - re-run top to bottom to refresh it.
Sales_Orders.describe()

Unnamed: 0,orderdate,customer_name_index,delivery_region_index,product_description_index,order_quantity,unit_price,line_total,total_unit_cost,profit_per_unit
count,64104,64104.0,64104.0,64104.0,64104.0,64104.0,64104.0,64104.0,64104.0
mean,2016-01-29 01:28:20.935979008,87.480064,495.086609,14.913141,8.441689,2284.380803,19280.682937,1432.083899,852.296904
min,2014-01-01 00:00:00,1.0,1.0,1.0,5.0,167.5,837.5,68.675,25.125
25%,2015-01-13 00:00:00,45.0,247.0,6.0,6.0,1031.8,8019.9,606.216,335.201
50%,2016-01-27 00:00:00,87.0,493.0,15.0,8.0,1855.9,14023.1,1084.4955,603.5025
75%,2017-02-13 00:00:00,130.0,742.0,24.0,10.0,3606.275,27416.4,2046.93375,1201.4775
max,2018-02-28 00:00:00,175.0,994.0,30.0,12.0,6566.0,78711.6,5498.556,3863.22
std,,49.884946,285.645893,8.787032,2.276217,1663.598141,15429.602766,1107.705728,723.278017


**Initial Analysis**

- This dataset contains 64,104 sales records spanning January 2014 to February 2018, with the median order date around January 2016, indicating that sales are not concentrated in any particular year but are fairly evenly distributed across the years.
- No order has a quantity below 5, and the largest quantity sold in a single order is 12, which indicates that all the orders in this dataset are bulk orders.
- The lowest unit price is 167.5, which is low relative to the rest of the data: about 75% of unit prices are above 1,031. This indicates that most of the revenue comes from mid- to high-priced products.
- The average total_unit_cost is comparatively lower than the average unit_price, indicating a positive gross margin across most transactions.
- We can create a new column as 'Profit per unit' that will give us the profit gained with each unit.


In [21]:


# Derive profit_per_unit as unit_price minus total_unit_cost, row by row.

Sales_Orders['profit_per_unit'] = Sales_Orders['unit_price'].sub(Sales_Orders['total_unit_cost'])

# Show the first rows to confirm the new column is present.
Sales_Orders.head(n=4)



Unnamed: 0,ordernumber,orderdate,customer_name_index,channel,warehouse_code,delivery_region_index,product_description_index,order_quantity,unit_price,line_total,total_unit_cost,profit_per_unit
0,SO - 000225,2014-01-01,126,Wholesale,AXW291,364,27,6,2499.1,14994.6,1824.343,674.757
1,SO - 0003378,2014-01-01,96,Distributor,AXW291,488,20,11,2351.7,25868.7,1269.918,1081.782
2,SO - 0005126,2014-01-01,8,Wholesale,AXW291,155,26,6,978.2,5869.2,684.74,293.46
3,SO - 0005614,2014-01-01,42,Export,AXW291,473,7,7,2338.3,16368.1,1028.852,1309.448


## Customers Table Overview and Analysis

In [28]:
# Preview the first five customers (head() defaults to five rows).
Customers.head()

Unnamed: 0,customer_index,customer_names
0,1,Geiss Company
1,2,Jaxbean Group
2,3,Ascend Ltd
3,4,Eire Corp
4,5,Blogtags Ltd


In [30]:
# Column dtypes and non-null counts for the customer lookup table.
Customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_index  175 non-null    int64 
 1   customer_names  175 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.9+ KB


In [43]:
# Count rows that are exact repeats of an earlier row (all columns compared).
Customers.duplicated().sum()

0

**Data Quality Assessment**

- Customer dataset contains a total of 175 records.
- Dataset contains no null or duplicated values and thus requires no structural correction.
- We can change the column names to lower case and replace the spaces with underscores to keep the data consistent and easy to query across SQL, Python and other visualization tools.

In [35]:

# Standardise headers to snake_case: lower-case first, then spaces -> underscores.
Customers.columns = Customers.columns.str.lower().str.replace(' ', '_')



In [37]:
# Inspect the last five customers to confirm the renamed headers.
Customers.tail(5)

Unnamed: 0,customer_index,customer_names
170,171,Accord Group
171,172,BB17 Company
172,173,H Ltd
173,174,Tekfly Group
174,175,SHISEIDO Ltd


## Regions Table Overview and Analysis

In [38]:
# Preview the first five regions (head() defaults to five rows).
Regions.head()

Unnamed: 0,id,name,county,state_code,state,type,latitude,longitude,area_code,population,households,median_income,land_area,water_area,time_zone
0,1,Auburn,Lee County,AL,Alabama,City,32.60986,-85.48078,334,62059,21767,38342,152375113,2646161,America/Chicago
1,2,Birmingham,Shelby County/Jefferson County,AL,Alabama,City,33.52744,-86.79905,205,212461,89972,31061,378353942,6591013,America/Chicago
2,3,Decatur,Limestone County/Morgan County,AL,Alabama,City,34.57332,-86.99214,256,55437,22294,41496,141006257,17594716,America/Chicago
3,4,Dothan,Dale County/Houston County/Henry County,AL,Alabama,City,31.2337,-85.40682,334,68567,25913,42426,232166237,835468,America/Chicago
4,5,Hoover,Shelby County/Jefferson County,AL,Alabama,City,33.37695,-86.80558,205,84848,32789,77146,122016784,2553332,America/Chicago


In [40]:
# Column dtypes and non-null counts for the regions lookup table.
Regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             994 non-null    int64  
 1   name           994 non-null    object 
 2   county         994 non-null    object 
 3   state_code     994 non-null    object 
 4   state          994 non-null    object 
 5   type           994 non-null    object 
 6   latitude       994 non-null    float64
 7   longitude      994 non-null    float64
 8   area_code      994 non-null    int64  
 9   population     994 non-null    int64  
 10  households     994 non-null    int64  
 11  median_income  994 non-null    int64  
 12  land_area      994 non-null    int64  
 13  water_area     994 non-null    int64  
 14  time_zone      994 non-null    object 
dtypes: float64(2), int64(7), object(6)
memory usage: 116.6+ KB


In [44]:
# Count rows that are exact repeats of an earlier row (all columns compared).
Regions.duplicated().sum()

0

**Data Quality Review**

- In the Regions dataset, we have a total of 994 records.
- Dataset has no Null or Duplicated values as such and the Datatype of every field is right.
- We can change the column names to lower case and replace the spaces with underscores to keep the data consistent and easy to query across SQL, Python and other visualization tools.

In [78]:
# Second look at the first five regions.
Regions.head(5)

Unnamed: 0,id,name,county,state_code,state,type,latitude,longitude,area_code,population,households,median_income,land_area,water_area,time_zone
0,1,Auburn,Lee County,AL,Alabama,City,32.60986,-85.48078,334,62059,21767,38342,152375113,2646161,America/Chicago
1,2,Birmingham,Shelby County/Jefferson County,AL,Alabama,City,33.52744,-86.79905,205,212461,89972,31061,378353942,6591013,America/Chicago
2,3,Decatur,Limestone County/Morgan County,AL,Alabama,City,34.57332,-86.99214,256,55437,22294,41496,141006257,17594716,America/Chicago
3,4,Dothan,Dale County/Houston County/Henry County,AL,Alabama,City,31.2337,-85.40682,334,68567,25913,42426,232166237,835468,America/Chicago
4,5,Hoover,Shelby County/Jefferson County,AL,Alabama,City,33.37695,-86.80558,205,84848,32789,77146,122016784,2553332,America/Chicago


In [48]:
# Summary of every column (numeric and non-numeric), transposed so that each
# column of Regions becomes one row of the report.
Regions.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,994.0,,,,497.5,287.087385,1.0,249.25,497.5,745.75,994.0
name,994.0,923.0,Springfield,5.0,,,,,,,
county,994.0,412.0,Los Angeles County,42.0,,,,,,,
state_code,994.0,48.0,CA,186.0,,,,,,,
state,994.0,48.0,California,186.0,,,,,,,
type,994.0,12.0,City,719.0,,,,,,,
latitude,994.0,,,,37.731587,5.12948,21.32435,33.943055,39.084735,41.671312,48.75955
longitude,994.0,,,,-94.338524,17.03895,-157.84764,-112.277505,-88.20029,-80.416375,-70.25533
area_code,994.0,,,,611.596579,239.899528,202.0,406.0,626.0,832.0,979.0
population,994.0,,,,154012.155936,369878.041224,50138.0,61076.0,82885.5,127186.75,8550405.0


In [76]:
# Largest Population
# Select the row(s) whose population equals the column maximum. Computing the
# maximum here, instead of hard-coding the value read off an earlier output,
# keeps the cell correct if the data changes.

Regions[Regions['population'] == Regions['population'].max()]

In [77]:
# Five Largest Population
# tail(5) of the ascending sort gives the five highest population values;
# isin() then keeps the matching rows in their original index order.

Regions.loc[Regions['population'].isin(Regions['population'].sort_values().tail(5))]

Unnamed: 0,id,name,county,state_code,state,type,latitude,longitude,area_code,population,households,median_income,land_area,water_area,time_zone
120,121,Los Angeles,Los Angeles County,CA,California,City,34.05223,-118.24368,213,3971883,1342761,50205,1214027148,88124562,America/Los Angeles
402,403,Chicago,Cook County/DuPage County,IL,Illinois,City,41.83755,-87.68184,773,2720546,1035436,48522,588808396,17615206,America/Chicago
734,735,Brooklyn,Kings County,NY,New York,Borough,40.62472,-73.95222,347,2636735,880727,32135,180000000,70000000,America/New York
754,755,New York City,Richmond County/Queens County/Kings County/New...,NY,New York,City,40.66347,-73.9387,718,8550405,3113535,53373,780785193,431834008,America/New York
758,759,Queens,Queens County,NY,New York,Borough,40.75,-73.86667,718,2339150,782664,42439,109000000,70000000,America/New York


In [None]:
# NOTE(review): unexecuted cell (In [None]). The two bindings below repeat the
# sheet assignments made near the top of the notebook, and the bare
# Budgets_2017 expression displays the frame.
State_Regions, Products = sheets['State Regions'], sheets['Products']
Budgets_2017

In [79]:
# Preview the state-to-region mapping.
# NOTE(review): the saved output shows placeholder headers (Column1..Column3)
# with the real header (State Code / State / Region) as row 0 - this row
# presumably needs promoting to the header before the table is used; confirm.
State_Regions.head(4)

Unnamed: 0,Column1,Column2,Column3
0,State Code,State,Region
1,AL,Alabama,South
2,AR,Arkansas,South
3,AZ,Arizona,West


In [80]:
# Column dtypes and non-null counts for the state-to-region mapping.
# NOTE(review): the head() preview of this table shows the header text sitting
# in row 0, so the entry count reported here presumably includes that header
# row - confirm.
State_Regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Column1  49 non-null     object
 1   Column2  49 non-null     object
 2   Column3  49 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [81]:
# Preview the first four products.
Products.head(n=4)

Unnamed: 0,Index,Product Name
0,1,Product 1
1,2,Product 2
2,3,Product 3
3,4,Product 4


In [83]:
# Column dtypes and non-null counts for the product lookup table.
# NOTE(review): the saved output still shows the original headers ('Index',
# 'Product Name'); they have not been converted to snake_case at this point.
Products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Index         30 non-null     int64 
 1   Product Name  30 non-null     object
dtypes: int64(1), object(1)
memory usage: 612.0+ bytes


In [84]:
# Preview the first five budget rows.
Budgets_2017.head(5)

Unnamed: 0,Product Name,2017 Budgets
0,Product 1,3016489.209
1,Product 2,3050087.565
2,Product 3,2642352.432
3,Product 4,2885560.824
4,Product 5,3925424.542


In [85]:
# Column dtypes and non-null counts for the 2017 budget table.
# NOTE(review): the saved output still shows the original headers
# ('Product Name', '2017 Budgets'); they have not been converted to
# snake_case at this point.
Budgets_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Product Name  30 non-null     object 
 1   2017 Budgets  30 non-null     float64
dtypes: float64(1), object(1)
memory usage: 612.0+ bytes
