## CMPINF 2110 Spring 2021 - Week 04

Work with the already tidy long-format shoe counting data set. We compiled this data set in Week 02.

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

## Read in the data

In [128]:
# Load the tidy long-format shoe counts compiled in Week 02.
lf = pd.read_csv('shoes_long_format.csv')

In [129]:
# Check column dtypes and non-null counts before doing anything else.
lf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   day       40 non-null     int64 
 1   shoe      40 non-null     object
 2   value     40 non-null     int64 
 3   location  40 non-null     object
dtypes: int64(2), object(2)
memory usage: 1.4+ KB


In [130]:
# Preview the first few rows of the long-format table.
lf.head()

Unnamed: 0,day,shoe,value,location
0,1,W,12,N
1,2,W,5,N
2,3,W,9,N
3,4,W,4,N
4,1,B,5,N


In [131]:
# Number of distinct values in each column.
lf.nunique()

day          4
shoe         4
value       16
location     3
dtype: int64

In [132]:
# Rows recorded per location code.
lf['location'].value_counts()

D    16
N    16
P     8
Name: location, dtype: int64

In [133]:
# Rows recorded per shoe code.
lf['shoe'].value_counts()

O    10
W    10
B    10
R    10
Name: shoe, dtype: int64

In [134]:
# Rows recorded per day.
lf['day'].value_counts()

2    12
3    12
1     8
4     8
Name: day, dtype: int64

In [135]:
# Display the long-format table itself.
lf

Unnamed: 0,day,shoe,value,location
0,1,W,12,N
1,2,W,5,N
2,3,W,9,N
3,4,W,4,N
4,1,B,5,N
5,2,B,8,N
6,3,B,22,N
7,4,B,2,N
8,1,R,3,N
9,2,R,6,N


## Locations

In [136]:
# Start the location lookup table: one row per location code, with its row count.
location_info = (
    lf
    .groupby(['location'])
    .size()
    .reset_index(name='num_rows')
)

In [137]:
# Show the per-location row counts.
location_info

Unnamed: 0,location,num_rows
0,D,16
1,N,16
2,P,8


In [138]:
# The row count was only a by-product of the groupby; keep just the location code.
# Reassign instead of mutating with inplace=True so the step reads as
# "new frame from old frame".
location_info = location_info.drop(columns=['num_rows'])

In [139]:
# Confirm only the location code column remains.
location_info

Unnamed: 0,location
0,D
1,N
2,P


In [140]:
# Attach the human-readable name by location CODE rather than by row position,
# so the labels cannot drift if the row order of location_info ever changes.
location_names = {
    'D': 'Dunkin Donuts',
    'N': 'Noodles & Company',
    'P': 'Panera Bread',
}
location_info['location_name'] = location_info['location'].map(location_names)

# Fail loudly if the data contains a code that has no name yet.
assert location_info['location_name'].notna().all(), 'unmapped location code'

In [141]:
# Check that each code received the intended name.
location_info

Unnamed: 0,location,location_name
0,D,Dunkin Donuts
1,N,Noodles & Company
2,P,Panera Bread


In [142]:
# Attach the street address by location CODE rather than by row position,
# so an address can never be paired with the wrong store.
# NOTE(review): ZIP '15123' may be a transposition of '15213' -- confirm
# against the real addresses before relying on the zipcode column.
location_addresses = {
    'D': '3907 Forbes Ave, Pittsburgh, PA 15123',
    'N': '3805 Forbes Ave, Pittsburgh, PA 15123',
    'P': '3800 Forbes Ave, Pittsburgh, PA 15123',
}
location_info['address'] = location_info['location'].map(location_addresses)

# Fail loudly if the data contains a code that has no address yet.
assert location_info['address'].notna().all(), 'unmapped location code'

In [143]:
# Check the address column.
location_info

Unnamed: 0,location,location_name,address
0,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123"
1,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123"
2,P,Panera Bread,"3800 Forbes Ave, Pittsburgh, PA 15123"


In [144]:
# Split the address on a plain comma. A bare ',' needs no escaping; the
# original '\,' was an invalid string escape that triggers a warning on
# current Python versions. Pieces after the first keep their leading space.
location_info['address'].str.split(',')

0    [3907 Forbes Ave,  Pittsburgh,  PA 15123]
1    [3805 Forbes Ave,  Pittsburgh,  PA 15123]
2    [3800 Forbes Ave,  Pittsburgh,  PA 15123]
Name: address, dtype: object

In [145]:
# Split on comma + space so the pieces carry no leading whitespace, and
# expand the three pieces into their own columns. ', ' is written as a plain
# string: the original '\, ' contained an invalid escape sequence.
location_info[['street_address', 'city', 'state_zip']] = (
    location_info['address'].str.split(', ', expand=True)
)

In [146]:
# Check the three new address-part columns.
location_info

Unnamed: 0,location,location_name,address,street_address,city,state_zip
0,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123
1,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123
2,P,Panera Bread,"3800 Forbes Ave, Pittsburgh, PA 15123",3800 Forbes Ave,Pittsburgh,PA 15123


In [147]:
# Preview splitting "state zip" on the space before assigning the result.
location_info['state_zip'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,PA,15123
1,PA,15123
2,PA,15123


In [148]:
# Store the two pieces of "state zip" as separate columns.
location_info[['state', 'zipcode']] = (
    location_info['state_zip'].str.split(' ', expand=True)
)

In [149]:
# Check the state and zipcode columns.
location_info

Unnamed: 0,location,location_name,address,street_address,city,state_zip,state,zipcode
0,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,P,Panera Bread,"3800 Forbes Ave, Pittsburgh, PA 15123",3800 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [150]:
# Reminder of the long-format columns before joining on `location`.
lf.head()

Unnamed: 0,day,shoe,value,location
0,1,W,12,N
1,2,W,5,N
2,3,W,9,N
3,4,W,4,N
4,1,B,5,N


In [151]:
# Left join: keep every row of lf and attach the matching location details.
lf.merge(location_info, on='location', how='left')

Unnamed: 0,day,shoe,value,location,location_name,address,street_address,city,state_zip,state,zipcode
0,1,W,12,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,2,W,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,3,W,9,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
3,4,W,4,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
4,1,B,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
5,2,B,8,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
6,3,B,22,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
7,4,B,2,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
8,1,R,3,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
9,2,R,6,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [152]:
# Inner join: keep only location codes present in both tables.
lf.merge(location_info, on='location', how='inner')

Unnamed: 0,day,shoe,value,location,location_name,address,street_address,city,state_zip,state,zipcode
0,1,W,12,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,2,W,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,3,W,9,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
3,4,W,4,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
4,1,B,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
5,2,B,8,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
6,3,B,22,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
7,4,B,2,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
8,1,R,3,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
9,2,R,6,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [153]:
# Right join: keep every row of location_info and attach the matching counts.
lf.merge(location_info, on='location', how='right')

Unnamed: 0,day,shoe,value,location,location_name,address,street_address,city,state_zip,state,zipcode
0,1,W,9,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,2,W,9,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,3,W,2,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
3,4,W,5,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
4,1,B,8,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
5,2,B,3,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
6,3,B,11,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
7,4,B,8,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
8,1,R,2,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
9,2,R,8,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [154]:
# Outer join: keep location codes that appear in either table.
lf.merge(location_info, on='location', how='outer')

Unnamed: 0,day,shoe,value,location,location_name,address,street_address,city,state_zip,state,zipcode
0,1,W,12,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,2,W,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,3,W,9,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
3,4,W,4,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
4,1,B,5,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
5,2,B,8,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
6,3,B,22,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
7,4,B,2,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
8,1,R,3,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
9,2,R,6,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [155]:
# Look at the lookup table again before removing the redundant columns.
location_info

Unnamed: 0,location,location_name,address,street_address,city,state_zip,state,zipcode
0,D,Dunkin Donuts,"3907 Forbes Ave, Pittsburgh, PA 15123",3907 Forbes Ave,Pittsburgh,PA 15123,PA,15123
1,N,Noodles & Company,"3805 Forbes Ave, Pittsburgh, PA 15123",3805 Forbes Ave,Pittsburgh,PA 15123,PA,15123
2,P,Panera Bread,"3800 Forbes Ave, Pittsburgh, PA 15123",3800 Forbes Ave,Pittsburgh,PA 15123,PA,15123


In [156]:
# `address` and `state_zip` are now fully represented by the split columns,
# so drop them. Reassign instead of mutating with inplace=True.
location_info = location_info.drop(columns=['address', 'state_zip'])

In [157]:
# Check the trimmed location lookup table.
location_info

Unnamed: 0,location,location_name,street_address,city,state,zipcode
0,D,Dunkin Donuts,3907 Forbes Ave,Pittsburgh,PA,15123
1,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
2,P,Panera Bread,3800 Forbes Ave,Pittsburgh,PA,15123


## Shoes

In [158]:
# Start the shoe lookup table: one row per shoe code, with its row count.
shoe_info = (
    lf
    .groupby(['shoe'])
    .size()
    .reset_index(name='num_rows')
)

In [159]:
# Show the per-shoe row counts.
shoe_info

Unnamed: 0,shoe,num_rows
0,B,10
1,O,10
2,R,10
3,W,10


In [160]:
# The row count was only a by-product of the groupby; keep just the shoe code.
# Reassign instead of mutating with inplace=True.
shoe_info = shoe_info.drop(columns=['num_rows'])

In [161]:
# Confirm only the shoe code column remains.
shoe_info

Unnamed: 0,shoe
0,B
1,O
2,R
3,W


In [162]:
# Attach the colour name by shoe CODE rather than by row position, so the
# labels cannot drift if the row order of shoe_info ever changes.
shoe_colors = {
    'B': 'Black',
    'O': 'Other',
    'R': 'Red',
    'W': 'White',
}
shoe_info['shoe_color'] = shoe_info['shoe'].map(shoe_colors)

# Fail loudly if the data contains a code that has no colour name yet.
assert shoe_info['shoe_color'].notna().all(), 'unmapped shoe code'

In [163]:
# Check that each code received the intended colour name.
shoe_info

Unnamed: 0,shoe,shoe_color
0,B,Black
1,O,Other
2,R,Red
3,W,White


In [164]:
# Attach both lookup tables to the counts: shoe details first, then location details.
(
    lf
    .merge(shoe_info, on='shoe', how='left')
    .merge(location_info, on='location', how='left')
)

Unnamed: 0,day,shoe,value,location,shoe_color,location_name,street_address,city,state,zipcode
0,1,W,12,N,White,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
1,2,W,5,N,White,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
2,3,W,9,N,White,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
3,4,W,4,N,White,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
4,1,B,5,N,Black,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
5,2,B,8,N,Black,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
6,3,B,22,N,Black,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
7,4,B,2,N,Black,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
8,1,R,3,N,Red,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
9,2,R,6,N,Red,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123


In [165]:
# Current state of the shoe lookup table.
shoe_info

Unnamed: 0,shoe,shoe_color
0,B,Black
1,O,Other
2,R,Red
3,W,White


In [166]:
# Current state of the location lookup table.
location_info

Unnamed: 0,location,location_name,street_address,city,state,zipcode
0,D,Dunkin Donuts,3907 Forbes Ave,Pittsburgh,PA,15123
1,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
2,P,Panera Bread,3800 Forbes Ave,Pittsburgh,PA,15123


In [167]:
# Give each location a 1-based integer identifier derived from the row index.
location_info = location_info.assign(location_id=location_info.index + 1)

In [168]:
# Check the new location_id column.
location_info

Unnamed: 0,location,location_name,street_address,city,state,zipcode,location_id
0,D,Dunkin Donuts,3907 Forbes Ave,Pittsburgh,PA,15123,1
1,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123,2
2,P,Panera Bread,3800 Forbes Ave,Pittsburgh,PA,15123,3


In [169]:
# Give each shoe type a 1-based integer identifier derived from the row index.
shoe_info = shoe_info.assign(shoe_id=shoe_info.index + 1)

In [170]:
# Check the new shoe_id column.
shoe_info

Unnamed: 0,shoe,shoe_color,shoe_id
0,B,Black,1
1,O,Other,2
2,R,Red,3
3,W,White,4


Let's build a long-format data set that stores the count for each type of shoe at each location on each day, using just the IDENTIFIERS.

In [171]:
# Look up the integer ids for each row: only the (id, code) pair of each
# lookup table is joined, so no descriptive columns come along.
lf_copy = (
    lf
    .merge(shoe_info[['shoe_id', 'shoe']], on='shoe', how='left')
    .merge(location_info[['location_id', 'location']], on='location', how='left')
)

In [172]:
# Check that shoe_id and location_id were attached.
lf_copy.head()

Unnamed: 0,day,shoe,value,location,shoe_id,location_id
0,1,W,12,N,4,2
1,2,W,5,N,4,2
2,3,W,9,N,4,2
3,4,W,4,N,4,2
4,1,B,5,N,1,2


Now drop the `shoe` and `location` columns.

In [173]:
# The letter codes are now represented by shoe_id / location_id, so drop them.
# Reassign instead of mutating with inplace=True.
lf_copy = lf_copy.drop(columns=['shoe', 'location'])

In [174]:
# Confirm only day, value and the two id columns remain.
lf_copy.head()

Unnamed: 0,day,value,shoe_id,location_id
0,1,12,4,2
1,2,5,4,2
2,3,9,4,2
3,4,4,4,2
4,1,5,1,2


In [175]:
# Reorder the columns so the identifiers come before the measured value.
lf_copy = lf_copy.loc[:, ['day', 'location_id', 'shoe_id', 'value']].copy()

In [176]:
# Show the identifier-only long-format table.
lf_copy

Unnamed: 0,day,location_id,shoe_id,value
0,1,2,4,12
1,2,2,4,5
2,3,2,4,9
3,4,2,4,4
4,1,2,1,5
5,2,2,1,8
6,3,2,1,22
7,4,2,1,2
8,1,2,3,3
9,2,2,3,6


In [177]:
# Joining on the ids brings back the shoe and location details.
(
    lf_copy
    .merge(shoe_info, on='shoe_id', how='left')
    .merge(location_info, on='location_id', how='left')
)

Unnamed: 0,day,location_id,shoe_id,value,shoe,shoe_color,location,location_name,street_address,city,state,zipcode
0,1,2,4,12,W,White,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
1,2,2,4,5,W,White,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
2,3,2,4,9,W,White,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
3,4,2,4,4,W,White,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
4,1,2,1,5,B,Black,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
5,2,2,1,8,B,Black,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
6,3,2,1,22,B,Black,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
7,4,2,1,2,B,Black,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
8,1,2,3,3,R,Red,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123
9,2,2,3,6,R,Red,N,Noodles & Company,3805 Forbes Ave,Pittsburgh,PA,15123


In [178]:
# Work on an independent copy so lf_copy itself is left untouched.
shoe_per_day = lf_copy.copy(deep=True)

In [179]:
# Show the table that will become the counts table.
shoe_per_day

Unnamed: 0,day,location_id,shoe_id,value
0,1,2,4,12
1,2,2,4,5
2,3,2,4,9
3,4,2,4,4
4,1,2,1,5
5,2,2,1,8
6,3,2,1,22
7,4,2,1,2
8,1,2,3,3
9,2,2,3,6


In [180]:
# Give each count record a 1-based integer identifier derived from the row index.
shoe_per_day = shoe_per_day.assign(id=shoe_per_day.index + 1)

In [181]:
# Check the new id column (appended as the last column).
shoe_per_day.head()

Unnamed: 0,day,location_id,shoe_id,value,id
0,1,2,4,12,1
1,2,2,4,5,2
2,3,2,4,9,3
3,4,2,4,4,4
4,1,2,1,5,5


In [182]:
# Move `id` to the front, followed by the columns in lf_copy's order.
shoe_per_day = shoe_per_day[['id', *lf_copy.columns]]

In [183]:
# Confirm the final column order.
shoe_per_day.head()

Unnamed: 0,id,day,location_id,shoe_id,value
0,1,1,2,4,12
1,2,2,2,4,5
2,3,3,2,4,9
3,4,4,2,4,4
4,5,1,2,1,5


## Save the data sets

In [184]:
# Write the shoe lookup table with a header row and without the DataFrame index.
shoe_info.to_csv('shoe_info_table.csv', index=False)

In [185]:
# Write the location lookup table with a header row and without the DataFrame index.
location_info.to_csv('location_info_table.csv', index=False)

In [186]:
# Write the identifier-only counts table with a header row and without the DataFrame index.
shoe_per_day.to_csv('shoes_per_day_table.csv', index=False)