# Extract

Read the raw order data from the Excel file, drop incomplete rows, and derive the ship month and year.

In [1]:
import pandas as pd
# Load the raw export, drop every row that has a missing value in ANY column
# (how='any'), and derive the reporting period from the ship date.
# Written as a single chain instead of using inplace=True, so `df` is bound
# exactly once, to the cleaned and enriched frame.
# NOTE(review): Month/Year come from 'Ship Date', not 'Order Date' — confirm
# that the shipping month is the intended reporting period.
df = (
    pd.read_excel("../Data/Amazon 2_Raw.xlsx")
    .dropna(how='any')
    .assign(
        Month=lambda orders: orders['Ship Date'].dt.month,
        Year=lambda orders: orders['Ship Date'].dt.year,
    )
)

# Peek at a few rows; the fixed seed keeps the preview identical across re-runs.
df.sample(5, random_state=42)

Unnamed: 0,Order ID,Order Date,Ship Date,EmailID,Geography,Category,Product Name,Sales,Quantity,Profit,Month,Year
311,CA-2014-106964,2014-12-18,2014-12-21,HallieRedmond@gmail.com,"United States,Los Angeles,California",Binders,GBC Plastic Binding Combs,11.808,2,4.2804,12,2014
326,CA-2013-166163,2013-08-16,2013-08-21,CraigYedwab@gmail.com,"United States,Oakland,California",Phones,Nokia Lumia 521 (T-Mobile),71.976,3,7.1976,8,2013
2200,CA-2011-144414,2011-06-17,2011-06-21,GaryHwang@gmail.com,"United States,Seattle,Washington",Binders,GBC DocuBind P400 Electric Binding System,3266.376,3,1061.5722,6,2011
654,CA-2014-124401,2014-09-08,2014-09-13,RubenDartt@gmail.com,"United States,Portland,Oregon",Accessories,Logitech Wireless Marathon Mouse M705,279.944,7,80.4839,9,2014
1312,US-2014-119039,2014-03-07,2014-03-11,BenFerrer@gmail.com,"United States,San Francisco,California",Binders,Economy Binders,14.976,9,5.4288,3,2014


# Transform

Our purpose here is to store only aggregated data at the target storage location (such as a database).

- Total quantity sold
- Total sales
- Total profit
- All of the above aggregated at the monthly level (by ship date), per category

In [2]:
# Roll the order lines up to one row per (Year, Month, Category), summing
# quantity, sales and profit within each group.
period_keys = ['Year', 'Month', 'Category']
monthly_totals = df.groupby(period_keys).agg(
    total_quantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
    total_sales=pd.NamedAgg(column='Sales', aggfunc='sum'),
    total_profit=pd.NamedAgg(column='Profit', aggfunc='sum'),
)

# Turn the group keys back into ordinary columns, then order rows by category
# first so that each category's months appear together in time order.
df_transformed = monthly_totals.reset_index().sort_values(['Category', 'Year', 'Month'])

In [3]:
# Preview the first 20 rows of the aggregated table.
df_transformed.head(n=20)

Unnamed: 0,Year,Month,Category,total_quantity,total_sales,total_profit
8,2011,2,Accessories,3,239.97,86.3892
20,2011,3,Accessories,19,194.58,31.161
32,2011,4,Accessories,20,1173.56,445.356
47,2011,5,Accessories,13,442.14,100.2543
61,2011,6,Accessories,11,324.342,96.7704
72,2011,7,Accessories,25,1827.916,207.3772
85,2011,8,Accessories,27,667.136,149.0507
100,2011,9,Accessories,29,661.36,200.7122
117,2011,10,Accessories,16,666.98,184.426
133,2011,11,Accessories,26,1404.4,272.1295


# Load (Export to local file)
Reference for the available pandas export methods (serialization / IO / conversion): https://pandas.pydata.org/docs/reference/frame.html#serialization-io-conversion


<img src ='./images/export.png'>

In [5]:
# Write the aggregated table to a CSV file in the current working directory;
# index=False keeps the DataFrame's row index out of the file.
# Parquet would be the better target format here; CSV is what this cell uses.
output_file = 'sales_transformed.csv'
df_transformed.to_csv(output_file, index=False)