## Reshaping Data with groupby and crosstab

### Crosstab: takes array-likes (such as DataFrame columns) and returns a DataFrame of counts

In [1]:
import pandas as pd
from pydataset import data

In [2]:
# Load the "tips" sample dataset via pydataset, then display it.
df = data("tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# crosstab accepts array-likes such as DataFrame columns (Series):
# rows come from `sex`, columns from `day`, and each cell holds a count.
pd.crosstab(index=df["sex"], columns=df["day"])

day,Fri,Sat,Sun,Thur
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,9,28,18,32
Male,10,59,58,30


In [4]:
# Passing a list of columns as the index also works: the rows become a
# (day, smoker) multi-level index, with `time` across the columns.
pd.crosstab(index=[df["day"], df["smoker"]], columns=df["time"])

Unnamed: 0_level_0,time,Dinner,Lunch
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3,1
Fri,Yes,9,6
Sat,No,45,0
Sat,Yes,42,0
Sun,No,57,0
Sun,Yes,19,0
Thur,No,1,44
Thur,Yes,0,17


In [6]:
# Count the rows in each (day, smoker, time) group and wrap the resulting
# Series in a DataFrame so it is displayed as a table.
# NOTE: this has to be a call. Writing `pd.DataFrame = (...)` rebinds
# pandas' DataFrame class to the Series, which breaks every later
# `pd.DataFrame(...)` call in the notebook and displays nothing here.
pd.DataFrame(df.groupby(['day','smoker', 'time']).time.count())

### Pivot Table Method: we call it on the DataFrame itself
#### df.pivot_table(index='classroom', columns='passing_math', values='math')

In [18]:
# pivot_table aggregates with the mean by default.
df.pivot_table(values="tip", index="day", columns="time")

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.94,2.382857
Sat,2.993103,
Sun,3.255132,
Thur,3.0,2.767705


In [19]:
# Same pivot, but aggregate each (day, time) cell with the median.
df.pivot_table(values="tip", index="day", columns="time", aggfunc="median")

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,3.0,2.2
Sat,2.75,
Sun,3.15,
Thur,3.0,2.3


In [20]:
# No rows are both Lunch and Sunday, so the filtered frame is empty and the
# mean of every column comes back as NaN (hence the NaNs in the pivot table).
df.loc[df["time"].eq("Lunch") & df["day"].eq("Sun")].mean()

total_bill    NaN
tip           NaN
sex           NaN
smoker        NaN
day           NaN
time          NaN
size          NaN
dtype: object

In [21]:
# A list of aggregation functions adds an outer column level, one per function.
df.pivot_table(
    values="tip",
    index="day",
    columns="time",
    aggfunc=["mean", "median"],
)

Unnamed: 0_level_0,mean,mean,median,median
time,Dinner,Lunch,Dinner,Lunch
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Fri,2.94,2.382857,3.0,2.2
Sat,2.993103,,2.75,
Sun,3.255132,,3.15,
Thur,3.0,2.767705,3.0,2.3


In [22]:
# Grouping by two keys yields a Series with a (time, day) multi-level index.
df.groupby(["time", "day"])["tip"].mean()

time    day 
Dinner  Fri     2.940000
        Sat     2.993103
        Sun     3.255132
        Thur    3.000000
Lunch   Fri     2.382857
        Thur    2.767705
Name: tip, dtype: float64

In [5]:
# Same grouped mean, converted from a Series to a one-column DataFrame.
df.groupby(["time", "day"])["tip"].mean().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
time,day,Unnamed: 2_level_1
Dinner,Fri,2.94
Dinner,Sat,2.993103
Dinner,Sun,3.255132
Dinner,Thur,3.0
Lunch,Fri,2.382857
Lunch,Thur,2.767705


In [12]:
# Display the DataFrame again in its original (rows x columns) layout.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [13]:
# Transposing swaps rows and columns: each original column becomes a row.
df.transpose()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,235,236,237,238,239,240,241,242,243,244
total_bill,16.99,10.34,21.01,23.68,24.59,25.29,8.77,26.88,15.04,14.78,...,15.53,10.07,12.6,32.83,35.83,29.03,27.18,22.67,17.82,18.78
tip,1.01,1.66,3.5,3.31,3.61,4.71,2.0,3.12,1.96,3.23,...,3.0,1.25,1.0,1.17,4.67,5.92,2.0,2.0,1.75,3.0
sex,Female,Male,Male,Male,Female,Male,Male,Male,Male,Male,...,Male,Male,Male,Male,Female,Male,Female,Male,Male,Female
smoker,No,No,No,No,No,No,No,No,No,No,...,Yes,No,Yes,Yes,No,No,Yes,Yes,No,No
day,Sun,Sun,Sun,Sun,Sun,Sun,Sun,Sun,Sun,Sun,...,Sat,Sat,Sat,Sat,Sat,Sat,Sat,Sat,Sat,Thur
time,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,...,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner,Dinner
size,2,3,3,2,4,4,2,4,2,2,...,2,2,2,2,3,3,2,2,2,2


In [3]:
# When is transposing useful?
# Summary tables such as those produced by describe() or groupby() can be
# easier to read once their rows and columns are swapped.

# Example: the default describe() layout, before transposing.
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [4]:
# Transposed summary: one row per numeric column, one column per statistic.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0
