# Analysis of 2019 Sales

## Data Wrangling & Cleansing

### Setup

In [406]:
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
%reload_ext autoreload
%autoreload 2

### Collate source data

In [407]:
# import os

# df = pd.DataFrame()

# for file in os.listdir('./sales_data'):
#     month = pd.read_csv(f'./sales_data/{ file }', skip_blank_lines=True)
#     df = pd.concat([df, month], ignore_index=True, copy=False)
#     print(f'{file} { month.shape } => { df.shape }')

# df.tail()

### Remove superfluous header rows (interleaved in the data)

In [408]:
# df[df['Order ID'] == 'Order ID'].describe()

Attempting to drop the repeated headers via `read_csv()` above added columns, and
there are `355` header rows within the data, so I'm removing them altogether here.

In [409]:
# # Drop header rows
# df = df[df['Order ID'] != 'Order ID']

# df[df['Order ID'] == 'Order ID'].describe()

### Remove superfluous NaN rows

In [410]:
# df[df.isna().any(axis=1)].info()
# df[df.isna().any(axis=1)].tail()

`545` rows containing `NaN` (or equivalent) in `any` column

In [411]:
# # Drop NaN rows
# df = df[~df.isna().any(axis=1)]

# df[df.isna().any(axis=1)].info()

### Persist combined dataframe

In [412]:
# df.to_csv('2019-sales.cleaned.csv', index=False)

### Reload dataframe `df` from combined file

... with the added benefit that Pandas correctly identifies most column data types

In [413]:
# Reload the combined, cleaned data set from disk. The `to_csv` call that writes
# this file is commented out earlier in the notebook, so the file must already exist.
df = pd.read_csv('2019-sales.cleaned.csv')

# Summarise column dtypes and non-null counts after the reload.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order ID          185950 non-null  int64  
 1   Product           185950 non-null  object 
 2   Quantity Ordered  185950 non-null  int64  
 3   Price Each        185950 non-null  float64
 4   Order Date        185950 non-null  object 
 5   Purchase Address  185950 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 8.5+ MB


### Order Date => `datetime`

In [414]:
# Inspect the raw `Order Date` values to see their string format before parsing.
df['Order Date']

0         04/19/19 08:46
1         04/07/19 22:30
2         04/12/19 14:38
3         04/12/19 14:38
4         04/30/19 09:27
               ...      
185945    09/17/19 20:56
185946    09/01/19 16:00
185947    09/23/19 07:39
185948    09/19/19 17:30
185949    09/30/19 00:18
Name: Order Date, Length: 185950, dtype: object

In [415]:
# Convert the `%m/%d/%y %H:%M` strings (e.g. `04/19/19 08:46`) into datetime
# values, then display the converted column.
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%m/%d/%y %H:%M')
df['Order Date']

0        2019-04-19 08:46:00
1        2019-04-07 22:30:00
2        2019-04-12 14:38:00
3        2019-04-12 14:38:00
4        2019-04-30 09:27:00
                 ...        
185945   2019-09-17 20:56:00
185946   2019-09-01 16:00:00
185947   2019-09-23 07:39:00
185948   2019-09-19 17:30:00
185949   2019-09-30 00:18:00
Name: Order Date, Length: 185950, dtype: datetime64[ns]

In [416]:
# Re-check the column dtypes now that `Order Date` has been converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Order ID          185950 non-null  int64         
 1   Product           185950 non-null  object        
 2   Quantity Ordered  185950 non-null  int64         
 3   Price Each        185950 non-null  float64       
 4   Order Date        185950 non-null  datetime64[ns]
 5   Purchase Address  185950 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 8.5+ MB


## Data Elaboration

In [417]:
# Preview the first rows before any derived columns are added.
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2,11.95,2019-04-19 08:46:00,"917 1st St, Dallas, TX 75001"
1,176559,Bose SoundSport Headphones,1,99.99,2019-04-07 22:30:00,"682 Chestnut St, Boston, MA 02215"
2,176560,Google Phone,1,600.0,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
3,176560,Wired Headphones,1,11.99,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
4,176561,Wired Headphones,1,11.99,2019-04-30 09:27:00,"333 8th St, Los Angeles, CA 90001"


### Line Total

In [418]:
# Value of each order line: units ordered multiplied by the unit price.
df['Line Total'] = df['Quantity Ordered'].mul(df['Price Each'])

### Order Date

In [419]:
import calendar

# months = [calendar.month(2019, 1 + month) for month in range(12)]
# print(''.join(months))

# Week grids for every month of 2019. Kept as a module-level name in case other
# cells use it; `week_of_month` below does not depend on it.
months = [calendar.monthcalendar(2019, 1 + month) for month in range(12)]

def week_of_month(month = 1, day = 1, year = 2019):
    """Return the 1-based calendar-week row that `day` falls in within its month.

    The numbering matches the rows of `calendar.monthcalendar(year, month)`:
    week 1 is the (possibly partial) row containing the 1st, and a month can
    span up to 6 rows.

    Parameters
    ----------
    month : int
        Month number, 1-12.
    day : int
        Day of the month.
    year : int, default 2019
        Calendar year; defaults to 2019 so existing two-argument calls behave
        as before.

    Returns
    -------
    int or None
        The week row (1-6), or None when `day` is not a valid day of that
        month (including 0, which would otherwise match the zero padding that
        `monthcalendar` uses for days outside the month).

    Raises
    ------
    calendar.IllegalMonthError
        If `month` is outside 1-12 (raised by `calendar.monthrange`).
    """
    first_weekday, days_in_month = calendar.monthrange(year, month)
    if not 1 <= day <= days_in_month:
        return None
    # Number of padding cells before the 1st in the first row, honouring the
    # calendar module's configured first weekday (Monday by default).
    leading_blanks = (first_weekday - calendar.firstweekday()) % 7
    return (day - 1 + leading_blanks) // 7 + 1


# Derive calendar features from the order timestamp.
order_dates = df['Order Date']

# Flags for orders placed on the first or last day of a quarter / month.
df['quarter_change'] = order_dates.dt.is_quarter_start | order_dates.dt.is_quarter_end
df['quarter'] = order_dates.dt.quarter
df['month_change'] = order_dates.dt.is_month_start | order_dates.dt.is_month_end

# Human-readable month and weekday names.
df['month'] = order_dates.dt.month_name()
df['day'] = order_dates.dt.day_name()

# Calendar-week row of the month, computed per timestamp.
df['week_of_month'] = order_dates.map(lambda stamp: week_of_month(stamp.month, stamp.day))

df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Line Total,quarter_change,quarter,month_change,month,day,week_of_month
0,176558,USB-C Charging Cable,2,11.95,2019-04-19 08:46:00,"917 1st St, Dallas, TX 75001",23.9,False,2,False,April,Friday,3
1,176559,Bose SoundSport Headphones,1,99.99,2019-04-07 22:30:00,"682 Chestnut St, Boston, MA 02215",99.99,False,2,False,April,Sunday,1
2,176560,Google Phone,1,600.0,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001",600.0,False,2,False,April,Friday,2
3,176560,Wired Headphones,1,11.99,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001",11.99,False,2,False,April,Friday,2
4,176561,Wired Headphones,1,11.99,2019-04-30 09:27:00,"333 8th St, Los Angeles, CA 90001",11.99,False,2,True,April,Tuesday,5


### Address

In [420]:
def parse_address(address = ""):
    """Extract the city and state from a `street, city, ST zip` address string.

    The address is parsed from the right, so a comma inside the street part
    cannot leak into the city, and the state is taken as the first token of the
    final field rather than by slicing off a fixed-width ZIP (so ZIP+4 codes
    also work).

    Parameters
    ----------
    address : str
        Address such as "917 1st St, Dallas, TX 75001".

    Returns
    -------
    dict
        `{'city': 'Dallas, TX', 'state': 'TX'}` for the example above. The
        `city` value deliberately includes the state abbreviation. Both values
        are empty strings when the address contains no comma.
    """
    # Keep at most the last two commas: [street..., city, "ST zip"].
    fields = [field.strip() for field in address.rsplit(',', 2)]
    if len(fields) < 2:
        return {'city': '', 'state': ''}

    locality, state_zip = fields[-2], fields[-1]
    tokens = state_zip.split()
    state = tokens[0] if tokens else ''
    return {
        'city': ', '.join(part for part in (locality, state) if part),
        'state': state,
    }

# Parse `Purchase Address` into `city` and `state` and append both columns.
# The parsed dicts are collected into a single DataFrame in one step instead of
# wrapping every row's result in its own pd.Series inside `apply`, which is
# needlessly slow on a frame of this size. Any `city`/`state` columns left over
# from a previous run of this cell are dropped first so the cell can be re-run
# without `join` raising on overlapping column names.
df = df.drop(columns=['city', 'state'], errors='ignore').join(
    pd.DataFrame(
        df['Purchase Address'].map(parse_address).tolist(),
        index=df.index,
        columns=['city', 'state'],
    )
)

df.tail()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Line Total,quarter_change,quarter,month_change,month,day,week_of_month,city,state
185945,259353,AAA Batteries (4-pack),3,2.99,2019-09-17 20:56:00,"840 Highland St, Los Angeles, CA 90001",8.97,False,3,False,September,Tuesday,4,"Los Angeles, CA",CA
185946,259354,iPhone,1,700.0,2019-09-01 16:00:00,"216 Dogwood St, San Francisco, CA 94016",700.0,False,3,True,September,Sunday,1,"San Francisco, CA",CA
185947,259355,iPhone,1,700.0,2019-09-23 07:39:00,"220 12th St, San Francisco, CA 94016",700.0,False,3,False,September,Monday,5,"San Francisco, CA",CA
185948,259356,34in Ultrawide Monitor,1,379.99,2019-09-19 17:30:00,"511 Forest St, San Francisco, CA 94016",379.99,False,3,False,September,Thursday,4,"San Francisco, CA",CA
185949,259357,USB-C Charging Cable,1,11.95,2019-09-30 00:18:00,"250 Meadow St, San Francisco, CA 94016",11.95,True,3,True,September,Monday,6,"San Francisco, CA",CA
