In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt

from acquire import get_all_data

In [99]:
# Sample date strings; the first uses a two-digit year, the rest four-digit years.
test_dates = pd.DataFrame(
    {
        'date': [
            '08/08/88',
            '08/09/1988',
            '08/10/1988',
            '08/11/1988',
        ],
    }
)
test_dates

Unnamed: 0,date
0,08/08/88
1,08/09/1988
2,08/10/1988
3,08/11/1988


In [100]:
# Baseline: parse the raw strings with pd.to_datetime and no explicit format.
pd.to_datetime(test_dates['date'])

0   1988-08-08
1   1988-08-09
2   1988-08-10
3   1988-08-11
Name: date, dtype: datetime64[ns]

In [4]:
# Load the working DataFrame via get_all_data (imported from the local acquire module).
df = get_all_data()

####  A function to convert a date to a datetime data type. Accepts string, list, or pd.Series


In [70]:
def convert_to_datetime(date, fmt=None):
    """Convert date-like input to pandas datetime values.

    Parameters
    ----------
    date : str, list-like, or pd.Series
        The value(s) to convert.
    fmt : str, optional
        Explicit strptime-style format (for example ``"%m/%d/%y"``) forwarded
        to ``pd.to_datetime``. When omitted, pandas infers the format.

    Returns
    -------
    The result of ``pd.to_datetime`` for the given input.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when a value cannot be parsed.
    """
    # The format used to be hard-coded to "%m/%d/%y", which rejects four-digit
    # years such as '08/09/1988', and was paired with the deprecated
    # infer_datetime_format flag. The format is now opt-in.
    return pd.to_datetime(date, format=fmt)

In [71]:
# Spot-check the helper on a single two-digit-year string.
convert_to_datetime('08/08/88')

Timestamp('1988-08-08 00:00:00')

In [72]:
# NOTE(review): this calls pd.to_datetime directly rather than convert_to_datetime,
# so it repeats the earlier parse of test_dates instead of exercising the helper — confirm intent.
pd.to_datetime(test_dates['date'])

0   1988-08-08
1   1988-08-09
2   1988-08-10
3   1988-08-11
Name: date, dtype: datetime64[ns]

#### A function to change a datetime to UTC. Accepts string, list, or pd.Series

In [73]:
def convert_to_utc(date):
    """Parse date-like input with ``pd.to_datetime`` and return it as UTC.

    Accepts whatever ``pd.to_datetime`` accepts (string, list, or pd.Series)
    and hands back that function's result with ``utc=True`` applied.
    """
    as_utc = pd.to_datetime(date, utc=True)
    return as_utc

In [74]:
# Spot-check on a dash-separated string that carries no timezone of its own.
convert_to_utc('08-08-88')

Timestamp('1988-08-08 00:00:00+0000', tz='UTC')

#### A function to parse a date column into 6 additional columns (while keeping the original date): year, quarter, month, day of month, day of week, weekend vs. weekday


In [148]:
def parse_date(date):
    """Break a date column into calendar features.

    Parameters
    ----------
    date : pd.Series
        Date-like values; they are parsed with ``pd.to_datetime``. The ``.dt``
        accessor is used below, so the parsed result must be a Series.

    Returns
    -------
    pd.DataFrame
        Shares the index of ``date`` and has the columns ``date_original``
        (the parsed datetimes), ``year``, ``quarter``, ``month``,
        ``day_of_month``, ``day_of_week`` (weekday name) and ``is_weekend``
        (True for Saturday and Sunday).
    """
    new_date = pd.to_datetime(date)
    parts = new_date.dt
    return pd.DataFrame(data={
        'date_original': new_date,
        'year': parts.year,
        'quarter': parts.quarter,
        'month': parts.month,
        'day_of_month': parts.day,
        # .dt.weekday_name was removed in pandas 1.0; day_name() replaces it.
        'day_of_week': parts.day_name(),
        # dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
        # Unparseable/missing dates (NaT) come out as False here.
        'is_weekend': parts.dayofweek >= 5,
    })

In [149]:
# Try the parser on the sample date column.
parse_date(test_dates['date'])

Unnamed: 0,date_original,year,quarter,month,day_of_month,day_of_week,is_weekend
0,1988-08-08,1988,3,8,8,Monday,False
1,1988-08-09,1988,3,8,9,Tuesday,False
2,1988-08-10,1988,3,8,10,Wednesday,False
3,1988-08-11,1988,3,8,11,Thursday,False


#### Add a column to the dataframe, sales_total, which is derived from sale_amount (total items) and item_price.

In [150]:
# Show the last rows (up to 20) of each sale_id group.
# NOTE(review): if sale_id is unique per row this returns every row of df —
# confirm whether df.tail(20) was intended.
df.groupby('sale_id').tail(20)

Unnamed: 0,item_id,sale_amount,sale_date,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
5,1,12.0,"Sun, 06 Jan 2013 00:00:00 GMT",6,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
6,1,10.0,"Mon, 07 Jan 2013 00:00:00 GMT",7,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
7,1,9.0,"Tue, 08 Jan 2013 00:00:00 GMT",8,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
8,1,12.0,"Wed, 09 Jan 2013 00:00:00 GMT",9,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
9,1,9.0,"Thu, 10 Jan 2013 00:00:00 GMT",10,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253


In [170]:
# Print one "name : dtype" line per column of df.
# NOTE(review): the recorded output below omits sale_date and lists
# day_of_month and diff_total_sales, which no cell above this one creates —
# it looks captured from an out-of-order run; re-run from a fresh kernel.
for column_name in df.columns:
    print(f"{column_name} : {df[column_name].dtype}")

item_id : int64
sale_amount : float64
sale_id : int64
store_id : int64
item_brand : object
item_name : object
item_price : float64
item_upc12 : object
item_upc14 : object
store_address : object
store_city : object
store_state : object
store_zipcode : object
sales_total : float64
day_of_week : object
day_of_month : int64
diff_total_sales : float64


In [160]:
# Row-level total: item count (sale_amount) multiplied by item_price.
df['sales_total'] = df['sale_amount'] * df['item_price']

In [161]:
# Preview the first rows to check the new sales_total column.
df.head()

Unnamed: 0,item_id,sale_amount,sale_date,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode,day_of_week,sales_total
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Tuesday,10.92
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Wednesday,9.24
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Thursday,11.76
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Friday,10.92
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Saturday,8.4


#### Create a new dataframe that aggregates the sales_total and sale_amount (item count) using sum and median by day of week.

In [162]:
# Derive the calendar feature columns from sale_date, then preview them.
df_new = parse_date(df.sale_date)
df_new.head()

Unnamed: 0,date_original,year,quarter,month,day_of_month,day_of_week,is_weekend
0,2013-01-01,2013,1,1,1,Tuesday,False
1,2013-01-02,2013,1,1,2,Wednesday,False
2,2013-01-03,2013,1,1,3,Thursday,False
3,2013-01-04,2013,1,1,4,Friday,False
4,2013-01-05,2013,1,1,5,Saturday,True


In [163]:
# Copy the weekday name onto df; df_new was built from df.sale_date, so the rows line up by index.
df['day_of_week'] = df_new['day_of_week']

In [164]:
# List the columns now present on df.
df.columns

Index(['item_id', 'sale_amount', 'sale_date', 'sale_id', 'store_id',
       'item_brand', 'item_name', 'item_price', 'item_upc12', 'item_upc14',
       'store_address', 'store_city', 'store_state', 'store_zipcode',
       'day_of_week', 'sales_total'],
      dtype='object')

In [169]:
def aggregate_by_weekday(df):
    """Summarise sales per weekday.

    Groups ``df`` on ``day_of_week`` and returns the median and the sum of
    ``sales_total`` and ``sale_amount`` for every group. The result has one
    row per weekday and two-level columns (measure, statistic).
    """
    measures = ['sales_total', 'sale_amount']
    statistics = ['median', 'sum']
    by_weekday = df.groupby('day_of_week')
    return by_weekday[measures].agg(statistics)

In [170]:
# Median and sum of sales_total and sale_amount for each weekday.
aggregate_by_weekday(df)

Unnamed: 0_level_0,sales_total,sales_total,sale_amount,sale_amount
Unnamed: 0_level_1,median,sum,median,sum
day_of_week,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Friday,250.43,38687730.0,50.0,7198021.0
Monday,186.76,28961180.0,38.0,5385853.0
Saturday,266.97,41161770.0,53.0,7655482.0
Sunday,282.47,43594860.0,57.0,8109705.0
Thursday,234.52,36287270.0,47.0,6749880.0
Tuesday,218.66,33839540.0,44.0,6293481.0
Wednesday,218.92,33928320.0,44.0,6312090.0


#### Explore the pandas DataFrame.diff() function. Create a new column that is the result of the current day's sales minus the previous day's sales.


In [173]:
def add_sales_difference(df):
    """Add a row-over-row change in ``sales_total`` to ``df``.

    Writes a ``diff_from_last_day`` column holding each row's ``sales_total``
    minus the previous row's value (the first row has no predecessor and gets
    NaN). The frame is modified in place and also returned.
    """
    sales = df['sales_total']
    df['diff_from_last_day'] = sales.diff(periods=1)
    return df

In [174]:
# Adds diff_from_last_day to df in place and displays the returned frame.
add_sales_difference(df)

Unnamed: 0,item_id,sale_amount,sale_date,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode,day_of_week,sales_total,diff_from_last_day
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Tuesday,10.92,
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Wednesday,9.24,-1.68
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Thursday,11.76,2.52
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Friday,10.92,-0.84
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Saturday,8.40,-2.52
5,1,12.0,"Sun, 06 Jan 2013 00:00:00 GMT",6,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Sunday,10.08,1.68
6,1,10.0,"Mon, 07 Jan 2013 00:00:00 GMT",7,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Monday,8.40,-1.68
7,1,9.0,"Tue, 08 Jan 2013 00:00:00 GMT",8,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Tuesday,7.56,-0.84
8,1,12.0,"Wed, 09 Jan 2013 00:00:00 GMT",9,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Wednesday,10.08,2.52
9,1,9.0,"Thu, 10 Jan 2013 00:00:00 GMT",10,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,Thursday,7.56,-2.52


#### Write a function to set the index to be the datetime variable.

In [84]:
def set_index(df):
    """Move the ``sale_date`` column into the index of ``df``.

    The frame is modified in place and also returned, so both
    ``set_index(df)`` and ``df = set_index(df)`` leave ``df`` usable.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``sale_date`` column.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, now indexed by ``sale_date``.

    Raises
    ------
    KeyError
        If ``df`` has no ``sale_date`` column (for example when it has already
        been moved to the index).

    Notes
    -----
    The values are used as they are; no datetime conversion happens here.
    """
    # DataFrame.set_index(..., inplace=True) returns None, so returning its
    # result handed callers None instead of the frame.
    df.set_index('sale_date', inplace=True)
    return df

In [85]:
# Moves sale_date from a column to the index of df, in place.
set_index(df)

In [86]:
# NOTE(review): the index values shown below are still the raw date strings,
# not datetimes — confirm whether sale_date should be converted before indexing.
df.head()

Unnamed: 0_level_0,item_id,sale_amount,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode,sales_total,day_of_week,day_of_month,diff_total_sales
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
"Tue, 01 Jan 2013 00:00:00 GMT",1,13.0,1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,10.92,Tuesday,1,
"Wed, 02 Jan 2013 00:00:00 GMT",1,11.0,2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,9.24,Wednesday,2,-1.68
"Thu, 03 Jan 2013 00:00:00 GMT",1,14.0,3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,11.76,Thursday,3,2.52
"Fri, 04 Jan 2013 00:00:00 GMT",1,13.0,4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,10.92,Friday,4,-0.84
"Sat, 05 Jan 2013 00:00:00 GMT",1,10.0,5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253,8.4,Saturday,5,-2.52


In [175]:
def prepare_data(df):
    """Run the full preparation pipeline on the sales frame.

    Steps, in order:
      1. add ``sales_total`` (``sale_amount`` * ``item_price``);
      2. add the calendar columns produced by ``parse_date`` from
         ``sale_date``: year, quarter, month, day_of_month, day_of_week,
         is_weekend;
      3. move ``sale_date`` into the index via ``set_index``;
      4. add the row-over-row sales change via ``add_sales_difference``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``sale_amount``, ``item_price`` and ``sale_date`` columns.

    Returns
    -------
    pd.DataFrame
        The prepared frame. ``df`` itself is modified in place.
    """
    df['sales_total'] = df['sale_amount'] * df['item_price']

    date_parts = parse_date(df.sale_date)
    calendar_columns = ('year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'is_weekend')
    for column in calendar_columns:
        df[column] = date_parts[column]

    # set_index modifies df in place. Its return value is not used here:
    # rebinding df to it replaced the frame with None and made the next step fail.
    set_index(df)
    return add_sales_difference(df)