In [1]:
import pandas as pd
import numpy as np

# Assignment 1: DataFrame Basics

Hi there!

Can you read in the transactions dataset and report on:

* The number of rows and columns
* The names of the columns
* The datatypes of each column

In [2]:
# A common practice is to create a path variable to pass to read_csv
path = '../retail/transactions.csv'
# common practive is to include _df in the DataFrame name
transactions_df = pd.read_csv(path)
# view column headers and top 5 rows
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [3]:
# number of rows and columns
transactions_df.shape

(83488, 3)

In [4]:
# index and column information
transactions_df.axes

[RangeIndex(start=0, stop=83488, step=1),
 Index(['date', 'store_nbr', 'transactions'], dtype='object')]

In [5]:
# transactions_df column data types
transactions_df.dtypes

date            object
store_nbr        int64
transactions     int64
dtype: object

# Assignment 2: Exploring DataFrames

Hello!

* Can you quickly inspect the first 5 rows of the transactions data? 

* Then, dive a bit more deeply into the data and check if there are any missing values.
* What about the number of unique dates? I want to make sure we didn’t leave any out.
* Finally, can you report the mean, median, min and max of “transactions”?  I want to check for any anomalies in our data.


In [6]:
# views top 5 rows of the transactions data
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [7]:
# check for missing values
transactions_df.isnull().sum()
# or transactions_df.isna().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

In [8]:
# null can also be checked using .info() method
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [9]:
# check summary statistics for transactions DataFrame
transactions_df.describe(include='all').round()

Unnamed: 0,date,store_nbr,transactions
count,83488,83488.0,83488.0
unique,1682,,
top,2017-08-15,,
freq,54,,
mean,,27.0,1695.0
std,,16.0,963.0
min,,1.0,5.0
25%,,13.0,1046.0
50%,,27.0,1393.0
75%,,40.0,2079.0


# Exercise 3 - Accessing DataFrames

Hi, starting to dive deeper into this data.

I noticed that the first row is the only one from 2013-01-01.

* Can you get me a copy of the DataFrame that excludes that row, and only includes “store_nbr” and “transactions”?
* Also, can you report the number of unique store numbers?
* Finally, return the total number of transactions in millions


In [10]:
# exclude row 1 and include columns store_nbr and transactions
transactions_df.loc[1:, ['store_nbr', 'transactions']]

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


In [11]:
# report number of unique store numbers
transactions_df['store_nbr'].nunique()

54

In [12]:
# return total number of transactions in millions (divide by 1000000)
transactions_df['transactions'].sum()/1000000

141.478945

# Assignment 4: Dropping Data and Duplicates

Hi there!

Can you:

1. Drop the first row of data? We want it permanently removed. 
2. Drop the date column but not in place
3. Return a dataframe that only includes the last row for each of the stores.

Thanks!

In [13]:
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [14]:
# drop first row (index=0) of data only (axis=0), updated current DataFrame (inplace=True)
transactions_df.drop(index=0, axis=0, inplace=True)
transactions_df

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [15]:
# drop the date column (axis=1) and don't keep original in DataFrame (inplace=False)
transactions_df.drop(['date'], axis=1, inplace=False)

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


In [16]:
# Return DataFrame that only shows the last row for each store number (i would prefer to have the index reset with this dataframe)
transactions_df.drop_duplicates(subset='store_nbr', keep='last').head()

Unnamed: 0,date,store_nbr,transactions
83434,2017-08-15,1,1693
83435,2017-08-15,2,1737
83436,2017-08-15,3,2956
83437,2017-08-15,4,1283
83438,2017-08-15,5,1310


# Assignment 5: Missing Data

Hello, 

Can you tell if any dates or prices are missing in the oil dataset?

Then compare the mean of the oil series when filling in with mean vs. filling in with 0.

Thanks!

In [17]:
oil = pd.read_csv("../retail/oil.csv")
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [18]:
# identify missing data
oil.isna().sum()

date           0
dcoilwtico    43
dtype: int64

In [19]:
# find mean of oil_series when filling in with 0
fill_zero_mean = oil['dcoilwtico'].fillna(0).mean()

In [20]:
# find mean of oil_series when filling in with mean
fill_mean_mean = oil['dcoilwtico'].fillna(oil['dcoilwtico'].mean()).mean()

In [21]:
# comparison between fill_zero and fill_mean value means
(fill_zero_mean-fill_mean_mean)/fill_mean_mean

-0.035303776683087054

# Assignment 6: Filtering DataFrames

I need some quick research on store 25:

* First, calculate the percentage of times ALL stores had more than 2000 transactions
* Then, calculate the percentage of times store 25 had more than 2000 transactions, and calculate the sum of transactions on these days
* Finally, sum the transactions for stores 25 and 31, that occurred in May or June, and had less than 2000 transactions


In [22]:
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [23]:
transactions_df.dtypes

date            object
store_nbr        int64
transactions     int64
dtype: object

In [24]:
# calculate percent of times all stores have more than 2000 transactions
allstoresmean = (transactions_df['transactions'] > 2000).mean()
allstoresmean

0.266808006036868

In [25]:
# create mask to filter store_nbr 25 with transactions > 2000
store25mask = (
    ((transactions_df['transactions'] > 2000) & 
    (transactions_df['store_nbr'] == 25))
    )


In [26]:
# (transactions.loc[mask, 'transactions'].count() 
#  / transactions.loc[(transactions['store_nbr'] == 25), 'transactions'].count())

In [27]:
# calculate the percent of days that transactions are greater than 2000 -------- I didn't understand what this was asking......
# Then it wasn't working because i didn't use .loc[]
# Breaking down the steps
# count the number of transactions for store 25 that are great than 2000
(transactions_df.loc[store25mask, 'transactions'].count())


56

In [28]:
# count the number of transactions that store 25 had in total
(transactions_df.loc[(transactions_df['store_nbr'] == 25), 'transactions'].count())

1614

In [29]:
# put it altogether
(transactions_df.loc[store25mask, 'transactions'].count()) /(transactions_df.loc[(transactions_df['store_nbr'] == 25), 'transactions'].count())

0.03469640644361834

In [30]:
# sum days in June and May where Store 25 and 31 had less than 2000 transactions

# store 25 and 31 mask and transactions are less than 2000 that occur in May '-05-' or '-06-
store25_31trans = (
    (transactions_df['store_nbr'].isin([25,31])) & 
    (transactions_df['transactions'] < 2000) & 
        ((transactions_df['date'].str.contains('-05-')) |
        (transactions_df['date'].str.contains('-06-')))
    )

# sum the total transactions
transactions_df.loc[store25_31trans, 'transactions'].sum()

644910

# Assignment 7: Sorting DataFrames

Hi there,
* Can you get me a dataset that includes the 5 days with the highest transactions counts? Any similarities between them?
* Then, can you get me a dataset sorted by date from earliest to most recent, but with the highest transactions first and the lowest transactions last for each day?
* Finally, sort the columns in reverse alphabetical order. 

Thanks!


In [31]:
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [32]:
# sort transactions to have the 5 highest transactions
transactions_df.sort_values('transactions', ascending=False).head()
# they were all the day before christmas eve.

Unnamed: 0,date,store_nbr,transactions
52011,2015-12-23,44,8359
71010,2016-12-23,44,8307
16570,2013-12-23,44,8256
33700,2014-12-23,44,8120
16572,2013-12-23,46,8001


In [33]:
# sort transactions to have the date ascending and the transactions descending
transactions_df.sort_values(['date','transactions'], ascending=[True, False]).head()

Unnamed: 0,date,store_nbr,transactions
40,2013-01-02,46,4886
38,2013-01-02,44,4821
39,2013-01-02,45,4208
41,2013-01-02,47,4161
11,2013-01-02,11,3547


In [34]:
transactions_df.sort_index(axis=1, ascending=False).head()

Unnamed: 0,transactions,store_nbr,date
1,2111,1,2013-01-02
2,2358,2,2013-01-02
3,3487,3,2013-01-02
4,1922,4,2013-01-02
5,1903,5,2013-01-02


# Assignment 8: Modifying Columns

Just some quick work, but can you send me the transaction data with the columns renamed?

* Rename `transactions` to `transaction_count` and `store_nbr` to `store_number`.
* Reorder the columns so date is first, then store number, then transaction count.

Thanks!


In [35]:
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [36]:
# can rename columns using .rename method with dictionary {}store_nbr = store_number, transactions = transaction_count using assignment but creates new DataFrame
transactions = transactions_df.rename(columns = {'store_nbr':'store_number', 'transactions':'transaction_count'})

In [37]:
# reindex columns to have store_number last
transactions.reindex(labels=['date', 'transaction_count', 'store_number'], axis=1).head()

Unnamed: 0,date,transaction_count,store_number
1,2013-01-02,2111,1
2,2013-01-02,2358,2
3,2013-01-02,3487,3
4,2013-01-02,1922,4
5,2013-01-02,1903,5


In [38]:
# rename columns store_nbr = store_number, transactions = transaction_count using assignment inplace
transactions_df.columns = ['date', 'store_number', 'transaction_count']
transactions_df.head()

Unnamed: 0,date,store_number,transaction_count
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [39]:
# reindex columns to have store_number last for original DataFrame
transactions_df.reindex(
    labels=['date', 'transaction_count', 'store_number'], axis=1
).head()

Unnamed: 0,date,transaction_count,store_number
1,2013-01-02,2111,1
2,2013-01-02,2358,2
3,2013-01-02,3487,3
4,2013-01-02,1922,4
5,2013-01-02,1903,5


# Assignment 9: Column Creation

Just some quick work, but can you send me the transaction data with the columns renamed?

* Create a `pct_to_target` column that divides transactions by 2500.
* Then, create a `met_target` column that returns True if `pct_to_target` is greater than or equal to 1.
* Next, create a `bonus_payable` column that equals 100 if `met_target` is True, and 0 if not. Then sum the bonus payable column.
* Finally, create columns for month and day of week as integers. There is some helper code for these dateparts below.



Thanks!



In [40]:
transactions.head()

Unnamed: 0,date,store_number,transaction_count
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [41]:
# create column pct_to_target which divides transaction_count by 2500
transactions.loc[:, 'pct_to_target'] = transactions['transaction_count'] / 2500
transactions.head()

Unnamed: 0,date,store_number,transaction_count,pct_to_target
1,2013-01-02,1,2111,0.8444
2,2013-01-02,2,2358,0.9432
3,2013-01-02,3,3487,1.3948
4,2013-01-02,4,1922,0.7688
5,2013-01-02,5,1903,0.7612


In [42]:
# create met_target that returns True if Target is Great Than/Equal to 1
transactions.loc[:, 'met_target'] = transactions['pct_to_target'] >= 1
transactions.head()

Unnamed: 0,date,store_number,transaction_count,pct_to_target,met_target
1,2013-01-02,1,2111,0.8444,False
2,2013-01-02,2,2358,0.9432,False
3,2013-01-02,3,3487,1.3948,True
4,2013-01-02,4,1922,0.7688,False
5,2013-01-02,5,1903,0.7612,False


In [43]:
# create a `bonus_payable` column that equals 100 if `met_target` is True, and 0 if not. 
transactions.loc[:, 'bonus_payable'] = 100 * (transactions['pct_to_target'] >= 1 )
transactions.head()

Unnamed: 0,date,store_number,transaction_count,pct_to_target,met_target,bonus_payable
1,2013-01-02,1,2111,0.8444,False,0
2,2013-01-02,2,2358,0.9432,False,0
3,2013-01-02,3,3487,1.3948,True,100
4,2013-01-02,4,1922,0.7688,False,0
5,2013-01-02,5,1903,0.7612,False,0


In [44]:
# sum the bonus payable column
transactions.loc[:, 'bonus_payable'].sum()

1448300

In [45]:
# Create integer Date Columns based on the 'date' column
# change datatype of 'date' to datetime64 so it can be parsed
transactions['date'] = transactions['date'].astype('datetime64[ns]')

# parse the 'month' from the date 'column'
transactions["month"] = transactions["date"].dt.month
# parse the 'day' from the date 'column'
transactions["day_of_week"] = transactions["date"].dt.dayofweek

transactions

Unnamed: 0,date,store_number,transaction_count,pct_to_target,met_target,bonus_payable,month,day_of_week
1,2013-01-02,1,2111,0.8444,False,0,1,2
2,2013-01-02,2,2358,0.9432,False,0,1,2
3,2013-01-02,3,3487,1.3948,True,100,1,2
4,2013-01-02,4,1922,0.7688,False,0,1,2
5,2013-01-02,5,1903,0.7612,False,0,1,2
...,...,...,...,...,...,...,...,...
83483,2017-08-15,50,2804,1.1216,True,100,8,1
83484,2017-08-15,51,1573,0.6292,False,0,8,1
83485,2017-08-15,52,2255,0.9020,False,0,8,1
83486,2017-08-15,53,932,0.3728,False,0,8,1


# Assignment 10: np.select

Hi there! I need a few columns created.

1. Create a ‘seasonal_bonus’ column that applies to these dates: 
    * All days in December (month = 12)
    * Sundays (day_of_week = 6) in May (month = 5)
    * Mondays (day_of_week = 0) in July (month = 7)
2. Call the December bonus ‘Holiday Bonus’, the May bonus ‘Corporate Month’, and the July bonus ‘Summer Special’. If no bonus applies, the column should display ‘None’. 
3. Finally, calculate the total bonus owed at $100 per day.

Thanks!

In [46]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83487 entries, 1 to 83487
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               83487 non-null  datetime64[ns]
 1   store_number       83487 non-null  int64         
 2   transaction_count  83487 non-null  int64         
 3   pct_to_target      83487 non-null  float64       
 4   met_target         83487 non-null  bool          
 5   bonus_payable      83487 non-null  int32         
 6   month              83487 non-null  int32         
 7   day_of_week        83487 non-null  int32         
dtypes: bool(1), datetime64[ns](1), float64(1), int32(3), int64(2)
memory usage: 3.6 MB


In [47]:
# create seasonal bonus column based on conditions
bonus_conditions = (
    (transactions['month'] == 12),
    ((transactions['day_of_week'] == 6) & (transactions['month'] == 5)),
    ((transactions['day_of_week'] == 0) & (transactions['month'] == 7))
)

bonus_choices = ['Holiday Bonus', 'Corporate Month', 'Summer Special']

transactions['seasonal_bonus'] = np.select(bonus_conditions, bonus_choices, default = 'None')

In [48]:
# display the counts of bonus
transactions['seasonal_bonus'].value_counts()

seasonal_bonus
None               75258
Holiday Bonus       6028
Summer Special      1103
Corporate Month     1098
Name: count, dtype: int64

In [49]:
# sum of bonus' at 100 per day (boolean -> True = 1, False = 1 = sum of these values)
transactions['seasonal_bonus'].isin(['Holiday Bonus', 'Corporate Month', 'Summer Special']).sum()*100

822900

In [50]:
# can also be done using series from .value_counts method. Have to iloc[1:] to start at index 1 (Holiday Bonus)
transactions['seasonal_bonus'].value_counts().iloc[1:].sum()*100

822900

# Assignment 11: Assign 

* Drop the columns that have been created so far (keep only date, store_number, and transaction count), and recreate them using the assign method.
* Then sum the seasonal bonus owed once again to make sure the numbers are correct.


In [51]:
transactions.columns

Index(['date', 'store_number', 'transaction_count', 'pct_to_target',
       'met_target', 'bonus_payable', 'month', 'day_of_week',
       'seasonal_bonus'],
      dtype='object')

In [52]:
# Drop columns we created in prior exercises
transactions.drop(columns=
    ['pct_to_target', 'met_target', 'bonus_payable', 'month', 'day_of_week', 'seasonal_bonus'], inplace=True
    )

In [53]:
transactions.columns

Index(['date', 'store_number', 'transaction_count'], dtype='object')

In [54]:
transactions

Unnamed: 0,date,store_number,transaction_count
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [57]:
transactions = transactions.assign(
    pct_to_target = transactions['transaction_count'] / 2500,
    met_target = lambda x: x['pct_to_target'] > 1,
    bonus_payable = lambda x: x['pct_to_target'] * 100,
    month = transactions['date'].dt.month,
    day_of_week = transactions['date'].dt.day_of_week,
    seasonal_bonus = np.select(bonus_conditions, bonus_choices, default = 'None'),
)

In [61]:
transactions

Unnamed: 0,date,store_number,transaction_count,pct_to_target,met_target,bonus_payable,month,day_of_week,seasonal_bonus
1,2013-01-02,1,2111,0.8444,False,84.44,1,2,
2,2013-01-02,2,2358,0.9432,False,94.32,1,2,
3,2013-01-02,3,3487,1.3948,True,139.48,1,2,
4,2013-01-02,4,1922,0.7688,False,76.88,1,2,
5,2013-01-02,5,1903,0.7612,False,76.12,1,2,
...,...,...,...,...,...,...,...,...,...
83483,2017-08-15,50,2804,1.1216,True,112.16,8,1,
83484,2017-08-15,51,1573,0.6292,False,62.92,8,1,
83485,2017-08-15,52,2255,0.9020,False,90.20,8,1,
83486,2017-08-15,53,932,0.3728,False,37.28,8,1,


In [60]:
# verify column creation by summing bonus value
transactions['seasonal_bonus'].value_counts().iloc[1:].sum()*100

822900

# Assignment 12: Memory Optimization

Reduce the memory usage of the transactions DataFrame to below 5MB.

In [67]:
transactions.head()

Unnamed: 0,date,store_number,transaction_count,pct_to_target,met_target,bonus_payable,month,day_of_week,seasonal_bonus
1,2013-01-02,1,2111,0.8444,False,84.44,1,2,
2,2013-01-02,2,2358,0.9432,False,94.32,1,2,
3,2013-01-02,3,3487,1.3948,True,139.48,1,2,
4,2013-01-02,4,1922,0.7688,False,76.88,1,2,
5,2013-01-02,5,1903,0.7612,False,76.12,1,2,


In [66]:
transactions.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83487 entries, 1 to 83487
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               83487 non-null  string 
 1   store_number       83487 non-null  int64  
 2   transaction_count  83487 non-null  int64  
 3   pct_to_target      83487 non-null  float64
 4   met_target         83487 non-null  bool   
 5   bonus_payable      83487 non-null  float64
 6   month              83487 non-null  int32  
 7   day_of_week        83487 non-null  int32  
 8   seasonal_bonus     83487 non-null  object 
dtypes: bool(1), float64(2), int32(2), int64(2), object(1), string(1)
memory usage: 13.5 MB


In [68]:
# use dictionary to convert multiple columns to lower memory_usage amount
transactions = transactions.astype(
    {'date':'datetime64[ns]', 'met_target': 'category', 'seasonal_bonus': 'category',}
    )

In [69]:
# the conversion saved 9.5 MB
transactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83487 entries, 1 to 83487
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               83487 non-null  datetime64[ns]
 1   store_number       83487 non-null  int64         
 2   transaction_count  83487 non-null  int64         
 3   pct_to_target      83487 non-null  float64       
 4   met_target         83487 non-null  category      
 5   bonus_payable      83487 non-null  float64       
 6   month              83487 non-null  int32         
 7   day_of_week        83487 non-null  int32         
 8   seasonal_bonus     83487 non-null  category      
dtypes: category(2), datetime64[ns](1), float64(2), int32(2), int64(2)
memory usage: 4.0 MB
