In [1]:
# Import Libraries and Dependencies
import pandas as pd

### 1. Combine and Clean the Data
#### Import CSVs

In [2]:
# Read the CSV files into DataFrames.
# Read in data 

try:
  sales_2020 = pd.read_csv('Resources/athletic_sales_2020.csv')
  sales_2021 = pd.read_csv('Resources/athletic_sales_2021.csv')
  
except FileNotFoundError as e:
  print(e)

In [3]:
# Display the 2020 sales DataFrame
sales_2020.head()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet
3,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,34,384,13056,6789.12,Outlet
4,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,53,83,4399,1407.68,Outlet


In [4]:
# Display the 2021 sales DataFrame
sales_2021.head()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,65,750,487500,121875.0,Outlet
1,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,51,233,11883,3208.41,Outlet
2,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,50,275,137500,82500.0,Outlet
3,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,47,77,3619,2714.25,Online
4,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,64,225,14400,5184.0,Online


#### Check the data types of each DataFrame

In [5]:
# Check the 2020 sales data types.
sales_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1297 entries, 0 to 1296
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   retailer          1297 non-null   object 
 1   retailer_id       1297 non-null   int64  
 2   invoice_date      1297 non-null   object 
 3   region            1297 non-null   object 
 4   state             1297 non-null   object 
 5   city              1297 non-null   object 
 6   product           1297 non-null   object 
 7   price_per_unit    1297 non-null   int64  
 8   units_sold        1297 non-null   int64  
 9   total_sales       1297 non-null   int64  
 10  operating_profit  1297 non-null   float64
 11  sales_method      1297 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 121.7+ KB


In [6]:
# Check the 2021 sales data types.
sales_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8346 entries, 0 to 8345
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   retailer          8346 non-null   object 
 1   retailer_id       8346 non-null   int64  
 2   invoice_date      8346 non-null   object 
 3   region            8346 non-null   object 
 4   state             8346 non-null   object 
 5   city              8346 non-null   object 
 6   product           8346 non-null   object 
 7   price_per_unit    8346 non-null   int64  
 8   units_sold        8346 non-null   int64  
 9   total_sales       8346 non-null   int64  
 10  operating_profit  8346 non-null   float64
 11  sales_method      8346 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 782.6+ KB


#### Combine the sales data by rows.

In [7]:
# Combine the 2020 and 2021 sales DataFrames on the rows and reset the index.
sales = pd.concat([sales_2020, sales_2021], axis='rows').reset_index()

In [8]:
# Check if any values are null.
sales.isna().sum()

index               0
retailer            0
retailer_id         0
invoice_date        0
region              0
state               0
city                0
product             0
price_per_unit      0
units_sold          0
total_sales         0
operating_profit    0
sales_method        0
dtype: int64

In [9]:
# Check the data type of each column
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9643 entries, 0 to 9642
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             9643 non-null   int64  
 1   retailer          9643 non-null   object 
 2   retailer_id       9643 non-null   int64  
 3   invoice_date      9643 non-null   object 
 4   region            9643 non-null   object 
 5   state             9643 non-null   object 
 6   city              9643 non-null   object 
 7   product           9643 non-null   object 
 8   price_per_unit    9643 non-null   int64  
 9   units_sold        9643 non-null   int64  
 10  total_sales       9643 non-null   int64  
 11  operating_profit  9643 non-null   float64
 12  sales_method      9643 non-null   object 
dtypes: float64(1), int64(5), object(7)
memory usage: 979.5+ KB


In [10]:
# Convert the "invoice_date" to a datetime datatype
sales['invoice_date'] = pd.to_datetime(sales['invoice_date'], format='%m/%d/%y')


In [11]:
# Confirm that the "invoice_date" data type has been changed.
sales['invoice_date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 9643 entries, 0 to 9642
Series name: invoice_date
Non-Null Count  Dtype         
--------------  -----         
9643 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 75.5 KB


### 2. Determine which Region Sold the Most Products

#### Using `groupby`

In [12]:
# Show the number products sold for region, state, and city.
sales.columns
columns = ['region','state','city']
values = 'units_sold'
total_product_sold = sales.groupby(columns).agg({values:'sum'})
# Rename the sum to "Total_Products_Sold".

total_product_sold.rename(columns = {'units_sold':'Total_Products_Sold'},inplace= True)


# Show the top 5 results.
total_product_sold.sort_values(by = 'Total_Products_Sold', ascending=False) .head()                   

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total_Products_Sold
region,state,city,Unnamed: 3_level_1
Northeast,New York,New York,111954
South,Texas,Houston,90322
West,California,San Francisco,85478
West,California,Los Angeles,76384
Southeast,Florida,Miami,73135


#### Using `pivot_table`

In [13]:
# Show the number products sold for region, state, and city.
Total_Products_Sold_pivot = pd.pivot_table(sales, values=values, index=columns, aggfunc='sum')

# Rename the "units_sold" column to "Total_Products_Sold"
Total_Products_Sold_pivot.rename(columns = {'units_sold':'Total_Products_Sold'},inplace= True)

# Show the top 5 results.
Total_Products_Sold_pivot.sort_values(by = 'Total_Products_Sold', ascending=False) .head()  

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total_Products_Sold
region,state,city,Unnamed: 3_level_1
Northeast,New York,New York,111954
South,Texas,Houston,90322
West,California,San Francisco,85478
West,California,Los Angeles,76384
Southeast,Florida,Miami,73135


### 3. Determine which Region had the Most Sales

#### Using `groupby`

In [14]:
# Show the total sales for the products sold for each region, state, and city.
total_sales = sales.groupby(columns).agg({'total_sales':'sum'})
# Rename the "total_sales" column to "Total Sales"
total_sales.rename(columns = {'total_sales':'Total Sales'},inplace= True)

# Show the top 5 results.
total_sales.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total Sales
region,state,city,Unnamed: 3_level_1
Northeast,New York,New York,"$39,801,235.00"
West,California,San Francisco,"$33,973,228.00"
Southeast,Florida,Miami,"$31,600,863.00"
Southeast,South Carolina,Charleston,"$29,285,637.00"
Southeast,Florida,Orlando,"$27,682,851.00"


#### Using `pivot_table`

In [15]:
# Show the total sales for the products sold for each region, state, and city.

Total_sales_pivot = pd.pivot_table(sales, values='total_sales', index=columns, aggfunc='sum')
# Optional: Rename the "total_sales" column to "Total Sales"

Total_sales_pivot.rename(columns = {'total_sales':'Total Sales'},inplace= True)
# Show the top 5 results.
Total_sales_pivot.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total Sales
region,state,city,Unnamed: 3_level_1
Northeast,New York,New York,"$39,801,235.00"
West,California,San Francisco,"$33,973,228.00"
Southeast,Florida,Miami,"$31,600,863.00"
Southeast,South Carolina,Charleston,"$29,285,637.00"
Southeast,Florida,Orlando,"$27,682,851.00"


### 4. Determine which Retailer had the Most Sales

#### Using `groupby`

In [16]:
# Show the total sales for the products sold for each retailer, region, state, and city.
columns = ['retailer', 'region', 'state', 'city']
total_sales_retailer = sales.groupby(columns).agg({'total_sales':'sum'})
# Rename the "total_sales" column to "Total Sales"
total_sales_retailer.rename(columns = {'total_sales':'Total Sales'},inplace= True)

# Show the top 5 results.
total_sales_retailer.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Sales
retailer,region,state,city,Unnamed: 4_level_1
West Gear,West,California,San Francisco,"$32,794,405.00"
Kohl's,West,California,Los Angeles,"$25,127,160.00"
Foot Locker,Northeast,New York,New York,"$25,008,568.00"
West Gear,West,Washington,Seattle,"$24,862,675.00"
Foot Locker,Southeast,South Carolina,Charleston,"$24,822,280.00"


#### Using `pivot_table`

In [17]:
# Show the total sales for the products sold for each retailer, region, state, and city.
Total_sales_pivot_retailer = pd.pivot_table(sales, values='total_sales', index=columns, aggfunc='sum')
# Optional: Rename the "total_sales" column to "Total Sales"

Total_sales_pivot_retailer.rename(columns = {'total_sales':'Total Sales'},inplace= True)
# Show the top 5 results.
Total_sales_pivot_retailer.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Sales
retailer,region,state,city,Unnamed: 4_level_1
West Gear,West,California,San Francisco,"$32,794,405.00"
Kohl's,West,California,Los Angeles,"$25,127,160.00"
Foot Locker,Northeast,New York,New York,"$25,008,568.00"
West Gear,West,Washington,Seattle,"$24,862,675.00"
Foot Locker,Southeast,South Carolina,Charleston,"$24,822,280.00"


### 5. Determine which Retailer Sold the Most Women's Athletic Footwear

In [18]:
# Filter the sales data to get the women's athletic footwear sales data.
women_athletic_footwear = sales[sales['product'].str.contains("Women's Athletic Footwear",case=False)]

#### Using `groupby`

In [19]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
columns = ['retailer', 'region', 'state', 'city']
total_sales_women = women_athletic_footwear.groupby(columns).agg({'total_sales':'sum'})
# Rename the "total_sales" column to "Total Sales"
total_sales_women.rename(columns = {'total_sales':'Total Sales'},inplace= True)

# Show the top 5 results.
total_sales_women.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Sales
retailer,region,state,city,Unnamed: 4_level_1
West Gear,West,California,San Francisco,"$4,558,561.00"
Foot Locker,Northeast,New York,New York,"$3,433,814.00"
Kohl's,West,California,Los Angeles,"$3,350,432.00"
West Gear,West,Washington,Seattle,"$3,300,656.00"
Foot Locker,Southeast,South Carolina,Charleston,"$3,090,484.00"


#### Using `pivot_table`

In [20]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
columns = ['retailer', 'region', 'state', 'city']
Total_sales_pivot_women = pd.pivot_table(women_athletic_footwear, values='total_sales',\
                                             index=columns, aggfunc='sum')
# Optional: Rename the "total_sales" column to "Total Sales"

Total_sales_pivot_women.rename(columns = {'total_sales':'Total Sales'},inplace= True)
# Show the top 5 results.
Total_sales_pivot_women.sort_values(by = 'Total Sales', ascending= False).map('${:,.2f}'.format).head()




Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Sales
retailer,region,state,city,Unnamed: 4_level_1
West Gear,West,California,San Francisco,"$4,558,561.00"
Foot Locker,Northeast,New York,New York,"$3,433,814.00"
Kohl's,West,California,Los Angeles,"$3,350,432.00"
West Gear,West,Washington,Seattle,"$3,300,656.00"
Foot Locker,Southeast,South Carolina,Charleston,"$3,090,484.00"


### 5. Determine the Day with the Most Women's Athletic Footwear Sales

In [21]:
# Create a pivot table with the 'invoice_date' column is the index, and the "total_sales" as the values.
sales_pivot_tble = pd.pivot_table(women_athletic_footwear, values='total_sales', index='invoice_date')

# Optional: Rename the "total_sales" column to "Total Sales"


# Show the table.
sales_pivot_tble.map('${:,.2f}'.format)

Unnamed: 0_level_0,total_sales
invoice_date,Unnamed: 1_level_1
2020-01-04,"$134,247.67"
2020-01-05,"$47,267.00"
2020-01-11,"$43,185.33"
2020-01-17,"$57,671.00"
2020-01-22,"$129,416.67"
...,...
2021-12-22,"$66,802.00"
2021-12-23,"$134,670.33"
2021-12-24,"$38,177.00"
2021-12-25,"$105,058.33"


In [22]:
# Resample the pivot table into daily bins, and get the total sales for each day.
daily_sales = sales_pivot_tble.resample('D').sum()

# Sort the resampled pivot table in ascending order on "Total Sales".
daily_sales.sort_values(by = 'total_sales', ascending= True).map('${:,.2f}'.format)

Unnamed: 0_level_0,total_sales
invoice_date,Unnamed: 1_level_1
2021-01-01,$0.00
2020-09-27,$0.00
2021-07-28,$0.00
2020-09-25,$0.00
2020-09-24,$0.00
...,...
2020-10-08,"$198,699.00"
2020-06-28,"$204,687.50"
2020-07-04,"$215,343.00"
2020-05-24,"$218,768.00"


### 6.  Determine the Week with the Most Women's Athletic Footwear Sales

In [24]:
# Resample the pivot table into weekly bins, and get the total sales for each week.
weekly_sales = sales_pivot_tble.resample('W').sum()

# Sort the resampled pivot table in ascending order on "Total Sales".
weekly_sales.sort_values(by = 'total_sales', ascending= True).map('${:,.2f}'.format)

Unnamed: 0_level_0,total_sales
invoice_date,Unnamed: 1_level_1
2020-10-18,$0.00
2020-11-22,"$26,970.00"
2020-12-13,"$29,458.00"
2020-12-20,"$30,855.00"
2020-12-06,"$31,400.50"
...,...
2021-12-05,"$485,858.42"
2021-05-09,"$491,505.31"
2021-12-12,"$495,134.08"
2021-12-19,"$508,094.31"
