## CMPINF 2110 Spring 2021 - Week 02

Tidying the shoe-counting data set.

## Import modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

## Read data

In [2]:
file_name = 'week_02_shoe_counts.xlsx'

Read in the Noodles & Company data set.

In [3]:
noodles_counts = pd.read_excel( file_name, sheet_name='Noodles')

In [4]:
noodles_counts

Unnamed: 0,W,B,R,O
0,12,5,3,15
1,5,8,6,20
2,9,22,7,13
3,4,2,1,7


Read in the Dunkin data set.

In [80]:
dunkin_counts = pd.read_excel( file_name, sheet_name='Dunkin')

In [81]:
dunkin_counts

Unnamed: 0,W,B,R,O
0,9,8,2,11
1,9,3,8,14
2,2,11,13,5
3,5,8,3,12


## How can we merge these two data sets?

In [82]:
noodles_copy = noodles_counts.copy()

dunkin_copy = dunkin_counts.copy()

Modify the column names so that each one includes a letter prefix designating which location the data come from.

In [83]:
[ 'N' + '_' + w for w in noodles_counts.columns.to_list() ]

['N_W', 'N_B', 'N_R', 'N_O']

In [84]:
# Prefix every shoe-type column with the Noodles location code.
noodles_copy.columns = [f'N_{col}' for col in noodles_counts.columns]

In [85]:
# Prefix every shoe-type column with the Dunkin location code.
dunkin_copy.columns = [f'D_{col}' for col in dunkin_counts.columns]

In [86]:
noodles_copy

Unnamed: 0,N_W,N_B,N_R,N_O
0,12,5,3,15
1,5,8,6,20
2,9,22,7,13
3,4,2,1,7


In [87]:
dunkin_copy

Unnamed: 0,D_W,D_B,D_R,D_O
0,9,8,2,11
1,9,3,8,14
2,2,11,13,5
3,5,8,3,12


Combine the DataFrames **horizontally**, i.e., concatenate the two side by side.

In [88]:
# axis=1 places the two frames side by side; rows are matched on the index labels.
shoes = pd.concat( [noodles_copy, dunkin_copy], axis=1 )

In [89]:
shoes

Unnamed: 0,N_W,N_B,N_R,N_O,D_W,D_B,D_R,D_O
0,12,5,3,15,9,8,2,11
1,5,8,6,20,9,3,8,14
2,9,22,7,13,2,11,13,5
3,4,2,1,7,5,8,3,12


One row is one day. One column represents one location and one type of shoe.

In [90]:
### capture the location+shoe type column names
### (done before any other column is added to `shoes`)
loc_shoe_name = list(shoes.columns)

In [91]:
loc_shoe_name

['N_W', 'N_B', 'N_R', 'N_O', 'D_W', 'D_B', 'D_R', 'D_O']

Add in a column for the day number.

In [92]:
# The frame carries the default 0-based RangeIndex, so index + 1 gives day numbers starting at 1.
shoes['day'] = shoes.index.to_numpy() + 1

In [93]:
shoes

Unnamed: 0,N_W,N_B,N_R,N_O,D_W,D_B,D_R,D_O,day
0,12,5,3,15,9,8,2,11,1
1,5,8,6,20,9,3,8,14,2
2,9,22,7,13,2,11,13,5,3
3,4,2,1,7,5,8,3,12,4


In [94]:
shoes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   N_W     4 non-null      int64
 1   N_B     4 non-null      int64
 2   N_R     4 non-null      int64
 3   N_O     4 non-null      int64
 4   D_W     4 non-null      int64
 5   D_B     4 non-null      int64
 6   D_R     4 non-null      int64
 7   D_O     4 non-null      int64
 8   day     4 non-null      int64
dtypes: int64(9)
memory usage: 416.0 bytes


This is a **brittle** data set.

How could we add a new type of shoe?

How could we add a new location?

### Reshaping - wide to long (or tall)

We will use the `.melt()` method to "stack" the columns on top of each other, while treating `day` as the ID variable.

In [95]:
# Stack every location+shoe column into (variable, value) pairs,
# repeating `day` alongside each stacked value as the identifier.
lf = shoes.melt(
    id_vars=['day'],
    value_vars=loc_shoe_name,
)

In [96]:
lf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   day       32 non-null     int64 
 1   variable  32 non-null     object
 2   value     32 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 896.0+ bytes


In [97]:
lf.head()

Unnamed: 0,day,variable,value
0,1,N_W,12
1,2,N_W,5
2,3,N_W,9
3,4,N_W,4
4,1,N_B,5


In [98]:
lf.head(8)

Unnamed: 0,day,variable,value
0,1,N_W,12
1,2,N_W,5
2,3,N_W,9
3,4,N_W,4
4,1,N_B,5
5,2,N_B,8
6,3,N_B,22
7,4,N_B,2


In [99]:
lf.head(12)

Unnamed: 0,day,variable,value
0,1,N_W,12
1,2,N_W,5
2,3,N_W,9
3,4,N_W,4
4,1,N_B,5
5,2,N_B,8
6,3,N_B,22
7,4,N_B,2
8,1,N_R,3
9,2,N_R,6


In [100]:
lf

Unnamed: 0,day,variable,value
0,1,N_W,12
1,2,N_W,5
2,3,N_W,9
3,4,N_W,4
4,1,N_B,5
5,2,N_B,8
6,3,N_B,22
7,4,N_B,2
8,1,N_R,3
9,2,N_R,6


Check the number of rows associated with each `day`.

In [101]:
lf.day.value_counts()

1    8
2    8
3    8
4    8
Name: day, dtype: int64

How many rows are associated with each value of the new column `variable`?

In [102]:
lf.variable.value_counts()

D_B    4
N_R    4
D_R    4
N_O    4
D_W    4
N_B    4
D_O    4
N_W    4
Name: variable, dtype: int64

### Break up the `variable` column into two columns

In [104]:
lf.dtypes

day          int64
variable    object
value        int64
dtype: object

We want to SPLIT the string ON the underscore character, `'_'`.

In [105]:
lf.variable.str.split( '_' )

0     [N, W]
1     [N, W]
2     [N, W]
3     [N, W]
4     [N, B]
5     [N, B]
6     [N, B]
7     [N, B]
8     [N, R]
9     [N, R]
10    [N, R]
11    [N, R]
12    [N, O]
13    [N, O]
14    [N, O]
15    [N, O]
16    [D, W]
17    [D, W]
18    [D, W]
19    [D, W]
20    [D, B]
21    [D, B]
22    [D, B]
23    [D, B]
24    [D, R]
25    [D, R]
26    [D, R]
27    [D, R]
28    [D, O]
29    [D, O]
30    [D, O]
31    [D, O]
Name: variable, dtype: object

In [106]:
lf.variable.str.split( '_' )[0]

['N', 'W']

In [107]:
print( type( lf.variable.str.split( '_' )[0] ) )

<class 'list'>


If we set the `expand` argument of `str.split()` to `True`, the result will be a DataFrame.

In [108]:
lf.variable.str.split( '_', expand=True)

Unnamed: 0,0,1
0,N,W
1,N,W
2,N,W
3,N,W
4,N,B
5,N,B
6,N,B
7,N,B
8,N,R
9,N,R


Let's create two new columns, `location` and `shoe`.

In [109]:
# Split on the first underscore only (n=1): the result then has at most two
# columns (location prefix, shoe type), so the two-column assignment still works
# even if a shoe label itself contains an underscore.
lf[['location', 'shoe']] = lf.variable.str.split( '_', n=1, expand=True )

In [110]:
lf

Unnamed: 0,day,variable,value,location,shoe
0,1,N_W,12,N,W
1,2,N_W,5,N,W
2,3,N_W,9,N,W
3,4,N_W,4,N,W
4,1,N_B,5,N,B
5,2,N_B,8,N,B
6,3,N_B,22,N,B
7,4,N_B,2,N,B
8,1,N_R,3,N,R
9,2,N_R,6,N,R


We do not need the original `variable` column. So we will just drop it.

In [111]:
# Rebind rather than mutate in place, and tolerate the column already being gone,
# so re-running this cell does not raise a KeyError.
lf = lf.drop(columns=['variable'], errors='ignore')

In [112]:
lf

Unnamed: 0,day,value,location,shoe
0,1,12,N,W
1,2,5,N,W
2,3,9,N,W
3,4,4,N,W
4,1,5,N,B
5,2,8,N,B
6,3,22,N,B
7,4,2,N,B
8,1,3,N,R
9,2,6,N,R


In [113]:
shoes

Unnamed: 0,N_W,N_B,N_R,N_O,D_W,D_B,D_R,D_O,day
0,12,5,3,15,9,8,2,11,1
1,5,8,6,20,9,3,8,14,2
2,9,22,7,13,2,11,13,5,3
3,4,2,1,7,5,8,3,12,4


## Group and Aggregate

How many rows are there for each type of shoe?

In [114]:
lf.shoe.value_counts()

O    8
B    8
R    8
W    8
Name: shoe, dtype: int64

The total number of shoes for each type of shoe.

In [115]:
# Sum the counts across all days and locations, one row per shoe type.
lf.groupby(['shoe']).agg(
    total_number=pd.NamedAgg(column='value', aggfunc='sum'),
)

Unnamed: 0_level_0,total_number
shoe,Unnamed: 1_level_1
B,67
O,97
R,43
W,55


Double check that this is correct, focusing on the `'B'` shoe type. Filter or select rows based on a condition.

In [116]:
lf.shoe.isin(['B'])

0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20     True
21     True
22     True
23     True
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: shoe, dtype: bool

In [117]:
lf.shoe.isin(['B', 'O'])

0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8     False
9     False
10    False
11    False
12     True
13     True
14     True
15     True
16    False
17    False
18    False
19    False
20     True
21     True
22     True
23     True
24    False
25    False
26    False
27    False
28     True
29     True
30     True
31     True
Name: shoe, dtype: bool

In [118]:
lf.loc[ lf.shoe.isin(['B']) ]

Unnamed: 0,day,value,location,shoe
4,1,5,N,B
5,2,8,N,B
6,3,22,N,B
7,4,2,N,B
20,1,8,D,B
21,2,3,D,B
22,3,11,D,B
23,4,8,D,B


Calculate the sum of the `value` column.

In [119]:
# Select the `value` column for the 'B' rows in a single .loc call, then total it.
lf.loc[lf.shoe.isin(['B']), 'value'].sum()

67

Repeat for `shoe` type `'R'`.

In [120]:
# Select the `value` column for the 'R' rows in a single .loc call, then total it.
lf.loc[lf.shoe.isin(['R']), 'value'].sum()

43

Grouping by multiple columns.

In [121]:
lf.groupby(['location', 'shoe'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x13830df70>

In [122]:
lf.groupby(['location', 'shoe']).size()

location  shoe
D         B       4
          O       4
          R       4
          W       4
N         B       4
          O       4
          R       4
          W       4
dtype: int64

In [123]:
# Per (location, shoe): total shoes counted and number of distinct days observed.
lf.groupby(['location', 'shoe']).agg(
    total_number=pd.NamedAgg(column='value', aggfunc='sum'),
    num_days=pd.NamedAgg(column='day', aggfunc='nunique'),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_number,num_days
location,shoe,Unnamed: 2_level_1,Unnamed: 3_level_1
D,B,30,4
D,O,42,4
D,R,26,4
D,W,25,4
N,B,37,4
N,O,55,4
N,R,17,4
N,W,30,4


In [124]:
# Same summary as above, with the group keys turned back into ordinary columns.
(
    lf
    .groupby(['location', 'shoe'])
    .agg(
        total_number=('value', 'sum'),
        num_days=('day', 'nunique'),
    )
    .reset_index()
)

Unnamed: 0,location,shoe,total_number,num_days
0,D,B,30,4
1,D,O,42,4
2,D,R,26,4
3,D,W,25,4
4,N,B,37,4
5,N,O,55,4
6,N,R,17,4
7,N,W,30,4


## Bring in the Panera data into our tidy data set

In [125]:
panera_counts = pd.read_excel( file_name, sheet_name = 'Panera')

In [126]:
panera_counts

Unnamed: 0,day,W,B,R,O
0,2,9,3,4,15
1,3,7,4,5,11


We want to merge the Panera data set with the long-format structure. First, reshape the `panera_counts` into a long-format style as well.

In [127]:
# Long format for Panera: one row per (day, shoe type); the stacked column
# labels are stored in a column named `shoe`.
panera_lf = panera_counts.melt(
    id_vars=['day'],
    value_vars=['W', 'B', 'R', 'O'],
    var_name='shoe',
)

In [128]:
panera_lf

Unnamed: 0,day,shoe,value
0,2,W,9
1,3,W,7
2,2,B,3
3,3,B,4
4,2,R,4
5,3,R,5
6,2,O,15
7,3,O,11


Add in the `location` column.

In [129]:
# The scalar 'P' is broadcast to every row, tagging all Panera records with one location code.
panera_lf['location'] = 'P'

In [130]:
panera_lf

Unnamed: 0,day,shoe,value,location
0,2,W,9,P
1,3,W,7,P
2,2,B,3,P
3,3,B,4,P
4,2,R,4,P
5,3,R,5,P
6,2,O,15,P
7,3,O,11,P


Rearrange the column order to match the long-format data set column order.

In [131]:
lf.columns.to_list()

['day', 'value', 'location', 'shoe']

In [132]:
# Reorder the Panera columns to follow the column order of `lf`.
panera_lf = panera_lf.loc[:, lf.columns.to_list()]

In [133]:
panera_lf

Unnamed: 0,day,value,location,shoe
0,2,9,P,W
1,3,7,P,W
2,2,3,P,B
3,3,4,P,B
4,2,4,P,R
5,3,5,P,R
6,2,15,P,O
7,3,11,P,O


Concatenate the `lf` data set of Noodles and Dunkin with the long-format data set of Panera.

In [134]:
# Stack the two long-format frames vertically; ignore_index=True renumbers the
# combined rows from 0 instead of keeping each frame's own row labels.
lf_all = pd.concat( [lf, panera_lf], ignore_index=True)

In [135]:
lf_all

Unnamed: 0,day,value,location,shoe
0,1,12,N,W
1,2,5,N,W
2,3,9,N,W
3,4,4,N,W
4,1,5,N,B
5,2,8,N,B
6,3,22,N,B
7,4,2,N,B
8,1,3,N,R
9,2,6,N,R


Display the rows sorted by day number (the sorted result is not stored back into `lf_all`).

In [136]:
lf_all.sort_values(['day'])

Unnamed: 0,day,value,location,shoe
0,1,12,N,W
20,1,8,D,B
28,1,11,D,O
16,1,9,D,W
8,1,3,N,R
12,1,15,N,O
24,1,2,D,R
4,1,5,N,B
21,2,3,D,B
1,2,5,N,W


Calculate the average number per day of each shoe type entering each location.

In [137]:
# Per (location, shoe): total shoes counted and the number of distinct days
# with data, returned as a flat frame with the keys as columns.
shoe_summary = (
    lf_all
    .groupby(['location', 'shoe'])
    .agg(
        total_number=('value', 'sum'),
        num_days=('day', 'nunique'),
    )
    .reset_index()
)

In [138]:
shoe_summary

Unnamed: 0,location,shoe,total_number,num_days
0,D,B,30,4
1,D,O,42,4
2,D,R,26,4
3,D,W,25,4
4,N,B,37,4
5,N,O,55,4
6,N,R,17,4
7,N,W,30,4
8,P,B,7,2
9,P,O,26,2


In [139]:
# Average per observed day: divide by each group's own day count.
shoe_summary['avg_per_day'] = shoe_summary['total_number'] / shoe_summary['num_days']

In [140]:
shoe_summary

Unnamed: 0,location,shoe,total_number,num_days,avg_per_day
0,D,B,30,4,7.5
1,D,O,42,4,10.5
2,D,R,26,4,6.5
3,D,W,25,4,6.25
4,N,B,37,4,9.25
5,N,O,55,4,13.75
6,N,R,17,4,4.25
7,N,W,30,4,7.5
8,P,B,7,2,3.5
9,P,O,26,2,13.0


### Reshaping - long to wide with `.pivot()`

In [141]:
# One output row per (day, location) pair and one column per shoe type.
# .pivot() needs each (day, location, shoe) combination to be unique and raises on duplicates.
# The result is displayed only; it is not assigned to a variable.
lf_all.pivot( index=['day', 'location'], columns='shoe', values='value')

Unnamed: 0_level_0,shoe,B,O,R,W
day,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,D,8,11,2,9
1,N,5,15,3,12
2,D,3,14,8,9
2,N,8,20,6,5
2,P,3,15,4,9
3,D,11,5,13,2
3,N,22,13,7,9
3,P,4,11,5,7
4,D,8,12,3,5
4,N,2,7,1,4


### Write datasets to CSV for use in topic summaries

In [145]:
# Writes `lf` (the Noodles and Dunkin rows only; the Panera rows live in `lf_all`).
# index=False leaves the row labels out of the file.
lf.to_csv(r'../summaries/tidy_shoes.csv', index = False)

In [147]:
# Writes the wide-format frame, including the `day` column that was added to it earlier.
# index=False leaves the row labels out of the file.
shoes.to_csv(r'../summaries/messy_shoes.csv', index=False)