In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pydataset import data

Paper by: Hadley Wickham

https://vita.had.co.nz/papers/tidy-data.pdf

### Data cleaning:

tidy data != clean data


- outlier checking  
- date parsing
- missing value imputation etc.
- <ins>data tidying: structuring datasets to facilitate analysis.</ins>



### Data semantics: tidy data

- Value: every value belongs to a variable and an observation.
- Variable: a variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units.
- Observation: an observation contains all values measured on the same unit (like a person, or a day, or a race) across attributes.

####  datasets for lesson and exercises:

- https://classroom.google.com/c/MzUxODYzODI3NTE5/m/MzUxODYzODI3Njg5/details

In [2]:
# lets look at this data:
treatments = pd.read_csv('untidy-data/treatment.csv')
treatments

Unnamed: 0.1,Unnamed: 0,treatmenta,treatmentb,treatmentc
0,John Smith,,2,0
1,Jane Doe,16.0,11,3
2,Mary Johnson,3.0,1,4


In [3]:
#rename columns
treatments.columns = ['name', 'a', 'b', 'c']
treatments

Unnamed: 0,name,a,b,c
0,John Smith,,2,0
1,Jane Doe,16.0,11,3
2,Mary Johnson,3.0,1,4


What is an observation here?  
variables?  
values?  

In [None]:
# restructure data using 'melt'

treatments = treatments.melt(id_vars=['name'], var_name='treatment', value_name='response')

In [None]:
treatments

### Tidy data : 
- Each variable forms a column.
- Each observation forms a row.
- Each cell has a single value.
- data is tabular, i.e. made up of rows and columns

In [None]:
# Examples of tidy-data?

tips = data('tips')
tips.head()

#### General Ideas  
- If the units are the same, maybe they should be in the same column.
- If one column has measurements of different units, it should be spread out  
- Should you be able to groupby some of the columns? combine them  
- Can I pass this data to seaborn?  

- Can we ask interesting questions and answer them with a groupby? i.e. generally we don't want to be taking row or column averages.


### How to deal with 'messy' data


##### Reshaping data:  
  
- Wide data --> Long data format (Melt)  
- Long data --> Wide Data format (pivot_table, unstack)  

#### 1. Messy data: Column headers are values, not variable names.

In [None]:
df = pd.read_csv('untidy-data/pew.csv')

In [None]:
# look at info


In [None]:
# look at the head


In [None]:
# Melt the data. Creates two new columns with deault name of 'variable' and 'value'

pd.melt(df, id_vars= 'religion')

In [None]:
# melt data and specify names of new columns

df_tidy = 

#### pd.melt arguments
- id_vars = columns you want to keep (not melt)
- var_name = name of new column you created by melting columns
- value_name = column name for values

#### Another example: one variable stored across multiple columns

In [None]:
billboard = pd.read_csv('untidy-data/billboard.csv', encoding= 'unicode_escape')

In [None]:
billboard.head()

In [None]:
# what is mean rating for each track?


In [None]:
# melt the data

billboard_melt = 

In [None]:
# what is the mean rating (across all weeks) of each track?


#### 2. Messy data: Multiple variables are stored in one column.

In [None]:
df = pd.DataFrame({
    'name': ['Sally', 'Jane', 'Billy', 'Suzy'],
    'pet': ['dog: max', 'dog: buddy', 'cat: grizabella', 'hamster: fred']
})
df

In [None]:
# how to split the string
'dog: max'.split(':')

In [None]:
# how to split a pandas column/series


In [None]:
# create new columns


#### Messy data: Variables are stored in both rows and columns

In [None]:
weather = pd.read_csv('untidy-data/weather.csv')

weather.head()

In [None]:
# melt the 'days'

weather_long = 

In [None]:
# pivot the element column. Reset index to go from multi-index to flat dataframe.

weather_tidy =

#### pd.pivot_table arguments
- Index = columns you want to keep (not pivot)
- columns = column you want to pivot
- values = values we want to populate in the new columns
- aggfunct = how you want to aggregate the duplicate rows

## Mini Exercise:

- read in excel file named dem_score.xls
- convert the data in tidy format (hint: melt the data)
- convert the melted dataframe back in wide-format (hint: pivot the data)




In [60]:
# read in the excel file named dem_score.xlsx
df = pd.read_excel('untidy-data/dem_score.xlsx')
df.head()

Unnamed: 0,country,1952,1957,1962,1967,1972,1977,1982,1987,1992
0,Albania,-9,-9,-9,-9,-9,-9,-9,-9,5
1,Argentina,-9,-1,-1,-9,-9,-9,-8,8,7
2,Armenia,-9,-7,-7,-7,-7,-7,-7,-7,7
3,Australia,10,10,10,10,10,10,10,10,10
4,Austria,10,10,10,10,10,10,10,10,10


In [61]:
# is the data in tidy format? What is the shape of data

df.shape # data is not Tidy, it is in wide form
# shape is 96 rows and 10 columns


(96, 10)

In [62]:
# convert dataframe in 'tidy' format (hint: pd.melt)

df_melt = df.melt(id_vars='country', var_name='year', value_name='count')
df_melt.head()

Unnamed: 0,country,year,count
0,Albania,1952,-9
1,Argentina,1952,-9
2,Armenia,1952,-9
3,Australia,1952,10
4,Austria,1952,10


In [63]:
# check shape of tidy dataframe

df_melt.shape #rows increased from 96 to 864

(864, 3)

In [64]:
# convert the data back in wide format (hint: pivot_table)

df_pivot = df_melt.pivot_table(index='country', columns='year', values='count')
df_pivot = df_pivot.reset_index()
df_pivot.head()

year,country,1952,1957,1962,1967,1972,1977,1982,1987,1992
0,Albania,-9,-9,-9,-9,-9,-9,-9,-9,5
1,Argentina,-9,-1,-1,-9,-9,-9,-8,8,7
2,Armenia,-9,-7,-7,-7,-7,-7,-7,-7,7
3,Australia,10,10,10,10,10,10,10,10,10
4,Austria,10,10,10,10,10,10,10,10,10


#### Bit more complex example

In [55]:
sales = pd.read_csv('untidy-data/sales.csv')

In [56]:
sales.head()

Unnamed: 0,Product,2016 Sales,2016 PPU,2017 Sales,2017 PPU,2018 Sales,2018 PPU
0,A,673,5,231,7,173,9
1,B,259,3,748,5,186,8
2,C,644,3,863,5,632,5
3,D,508,9,356,11,347,14


In [57]:
# first we melt all the columns expect 'Product'

sales_melt = sales.melt(id_vars='Product', var_name='year_sales', value_name='sales_count')
sales_melt.head()

Unnamed: 0,Product,year_sales,sales_count
0,A,2016 Sales,673
1,B,2016 Sales,259
2,C,2016 Sales,644
3,D,2016 Sales,508
4,A,2016 PPU,5


In [58]:
# shape of melted dataset
sales_melt.shape

(24, 3)

In [75]:
# here we split the 'variable' column to create two new columns 'year' and 'measure'

sales_melt[['year','measure']] = sales_melt.year_sales.str.split(expand=True)
sales_melt = sales_melt.drop(columns='year_sales')
sales_melt.head()

Unnamed: 0,Product,sales_count,year,measure
0,A,673,2016,Sales
1,B,259,2016,Sales
2,C,644,2016,Sales
3,D,508,2016,Sales
4,A,5,2016,PPU


In [76]:
# now we will pivot the 'measure' column to create the two new columns 'PPU' and 'Sales'
# reset the index to flatten the dataframe 

sales_tidy = sales_melt.pivot_table(index=['Product','year'], 
                                    columns='measure',
                                    values='sales_count')
sales_tidy

Unnamed: 0_level_0,measure,PPU,Sales
Product,year,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2016,5,673
A,2017,7,231
A,2018,9,173
B,2016,3,259
B,2017,5,748
B,2018,8,186
C,2016,3,644
C,2017,5,863
C,2018,5,632
D,2016,9,508
