In [1]:
import pandas as pd
import numpy as np
%config Completer.use_jedi=False

## Reshaping by pivoting DataFrame objects
---
![Diagram: reshaping a DataFrame with pivot](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_pivot.png)

In [2]:
import pandas._testing as tm


In [23]:
# Small 3x4 frame of random floats on a business-day index.
# Built with public constructors rather than the private
# `pandas._testing.makeTimeDataFrame` helper, which is not part of the public
# API and is not available in recent pandas releases.
df = pd.DataFrame(
    np.random.randn(3, 4),
    index=pd.bdate_range("2000-01-03", periods=3),
    columns=list("ABCD"),
)
print(df)
print(df.shape)
print(df.columns)
# Row-major ('C') versus column-major ('F') flattening of the values.
print(df.to_numpy().ravel('C'))
print(df.to_numpy().ravel('F'))
# Each column label repeated once per row ...
print(np.asarray(df.columns).repeat(3))
print(df.index)
# ... and the whole index repeated once per column.
print(np.tile(np.asarray(df.index),4))

                   A         B         C         D
2000-01-03 -1.229546  1.391295 -0.089412  0.145683
2000-01-04 -0.634924  1.485828 -1.075782 -0.103150
2000-01-05 -2.104042  0.824173 -0.139330  0.872891
(3, 4)
Index(['A', 'B', 'C', 'D'], dtype='object')
[-1.2295464   1.3912948  -0.08941171  0.145683   -0.63492411  1.48582758
 -1.07578192 -0.10314972 -2.10404167  0.82417256 -0.13933033  0.87289088]
[-1.2295464  -0.63492411 -2.10404167  1.3912948   1.48582758  0.82417256
 -0.08941171 -1.07578192 -0.13933033  0.145683   -0.10314972  0.87289088]
['A' 'A' 'A' 'B' 'B' 'B' 'C' 'C' 'C' 'D' 'D' 'D']
DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05'], dtype='datetime64[ns]', freq='B')
['2000-01-03T00:00:00.000000000' '2000-01-04T00:00:00.000000000'
 '2000-01-05T00:00:00.000000000' '2000-01-03T00:00:00.000000000'
 '2000-01-04T00:00:00.000000000' '2000-01-05T00:00:00.000000000'
 '2000-01-03T00:00:00.000000000' '2000-01-04T00:00:00.000000000'
 '2000-01-05T00:00:00.000000000' '2000-01-03T00:0

In [32]:
# create dummy table
def unpivot(frame):
    """Convert a wide frame into long ("record") form.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide table. Its index supplies the ``date`` values and its column
        labels supply the ``variable`` values.

    Returns
    -------
    pandas.DataFrame
        One row per cell of ``frame`` with columns ``date``, ``variable`` and
        ``value``, laid out column by column.
    """
    n_rows, n_cols = frame.shape
    data = {
        # Whole index repeated once per column: d0, d1, ..., d0, d1, ...
        'date': np.tile(np.asarray(frame.index), n_cols),
        # Each column label repeated once per row: A, A, ..., B, B, ...
        'variable': np.asarray(frame.columns).repeat(n_rows),
        # Column-major ('F') flattening keeps values aligned with the labels.
        'value': frame.to_numpy().ravel('F'),
    }
    return pd.DataFrame(data, columns=['date', 'variable', 'value'])


# Sample long-format table: a random 3x4 business-day frame, unpivoted.
# Public constructors are used instead of the private
# `pandas._testing.makeTimeDataFrame` helper, which is not available in
# recent pandas releases.
df = unpivot(
    pd.DataFrame(
        np.random.randn(3, 4),
        index=pd.bdate_range("2000-01-03", periods=3),
        columns=list("ABCD"),
    )
)

In [36]:
# Reshape the long table back to wide form: one row per `date`, one column per
# distinct `variable`, cells filled from `value`.
df.pivot(index='date', columns='variable', values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.158199,0.556451,0.4194,-0.088833
2000-01-04,0.458526,0.492148,1.14492,-1.26303
2000-01-05,-1.470763,-0.736063,0.514486,0.522336


In [47]:
# Add a second value column (twice the first) so that more than one column is
# left over once `date` and `variable` are used as row/column keys.
df['value2'] = df['value'].mul(2)

# With an explicit `values`, the result has a single level of column labels.
print(df.pivot(values='value', index='date', columns='variable'))

# Without `values`, every remaining column is pivoted and the result has
# hierarchical columns: (value | value2) x variable.
pivoted = df.pivot(columns='variable', index='date')
pivoted

variable           A         B         C         D
date                                              
2000-01-03  0.158199  0.556451  0.419400 -0.088833
2000-01-04  0.458526  0.492148  1.144920 -1.263030
2000-01-05 -1.470763 -0.736063  0.514486  0.522336


Unnamed: 0_level_0,value,value,value,value,value2,value2,value2,value2
variable,A,B,C,D,A,B,C,D
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2000-01-03,0.158199,0.556451,0.4194,-0.088833,0.316399,1.112903,0.8388,-0.177666
2000-01-04,0.458526,0.492148,1.14492,-1.26303,0.917052,0.984296,2.289841,-2.52606
2000-01-05,-1.470763,-0.736063,0.514486,0.522336,-2.941526,-1.472127,1.028972,1.044671


In [50]:
# Selecting the top-level label 'value2' returns just that sub-table, with one
# column per `variable`.
pivoted['value2']

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.316399,1.112903,0.8388,-0.177666
2000-01-04,0.917052,0.984296,2.289841,-2.52606
2000-01-05,-2.941526,-1.472127,1.028972,1.044671


## Reshaping by stacking and unstacking
### stack
![Diagram: stacking column labels into the row index](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_stack.png)

### unstack
![Diagram: unstacking a row-index level into the columns](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_unstack.png)

In [57]:
# Pair the outer ("first") and inner ("second") labels position by position.
tuples = list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )
)
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Eight rows (one per label pair) by two columns of random values.
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    index=index,
    columns=['A', 'B'],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.043161,-1.554523
bar,two,-0.820284,1.609248
baz,one,1.809759,0.205969
baz,two,1.87455,0.650061
foo,one,-0.425982,-1.153167
foo,two,0.395402,0.462885
qux,one,-0.916431,-1.052898
qux,two,-0.082585,1.03488


In [60]:
# Move the column labels (A, B) into a new innermost index level; the result
# is a Series indexed by (first, second, column label).
df.stack()

first  second   
bar    one     A    0.043161
               B   -1.554523
       two     A   -0.820284
               B    1.609248
baz    one     A    1.809759
               B    0.205969
       two     A    1.874550
               B    0.650061
foo    one     A   -0.425982
               B   -1.153167
       two     A    0.395402
               B    0.462885
qux    one     A   -0.916431
               B   -1.052898
       two     A   -0.082585
               B    1.034880
dtype: float64

In [66]:
# by default unstacks the last level:
df.unstack()

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.043161,-0.820284,-1.554523,1.609248
baz,1.809759,1.87455,0.205969,0.650061
foo,-0.425982,0.395402,-1.153167,0.462885
qux,-0.916431,-0.082585,-1.052898,1.03488


In [68]:
# Unstack by position: level 0 is `first`, so bar/baz/foo/qux move into the
# columns and `second` remains as the row index.
df.unstack(0)

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,0.043161,1.809759,-0.425982,-0.916431,-1.554523,0.205969,-1.153167,-1.052898
two,-0.820284,1.87455,0.395402,-0.082585,1.609248,0.650061,0.462885,1.03488


In [69]:
# Level 1 is `second`, the last level, so this gives the same table as the
# default `df.unstack()`.
df.unstack(1)

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.043161,-0.820284,-1.554523,1.609248
baz,1.809759,1.87455,0.205969,0.650061
foo,-0.425982,0.395402,-1.153167,0.462885
qux,-0.916431,-0.082585,-1.052898,1.03488


In [71]:
# Levels can also be addressed by name; 'second' is the same level as
# position 1.
df.unstack('second')

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.043161,-0.820284,-1.554523,1.609248
baz,1.809759,1.87455,0.205969,0.650061
foo,-0.425982,0.395402,-1.153167,0.462885
qux,-0.916431,-0.082585,-1.052898,1.03488


## Multiple levels
---

In [72]:
# Three-level column labels: experiment x animal x hair length.
# Each animal comes with a fixed hair length and appears under both experiments.
columns = pd.MultiIndex.from_tuples(
    [
        (exp, animal, hair_length)
        for animal, hair_length in (("cat", "long"), ("dog", "short"))
        for exp in ("A", "B")
    ],
    names=["exp", "animal", "hair_length"],
)


MultiIndex([('A', 'cat',  'long'),
            ('B', 'cat',  'long'),
            ('A', 'dog', 'short'),
            ('B', 'dog', 'short')],
           names=['exp', 'animal', 'hair_length'])

In [73]:
# 4x4 frame of random values under the three-level column index `columns`;
# rows get the default 0..3 index.
df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
df

exp,A,B,A,B
animal,cat,cat,dog,dog
hair_length,long,long,short,short
0,1.600829,-0.543773,-0.646046,-1.566344
1,2.242153,-0.551422,-0.85023,-0.384097
2,-0.997494,-0.1064,0.090629,0.838314
3,0.590194,0.796218,-0.491236,-0.119021


In [75]:
# Stack two column levels at once, selected by name; `exp` is left as the only
# column level.
df.stack(level=['animal', 'hair_length'])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,1.600829,-0.543773
0,dog,short,-0.646046,-1.566344
1,cat,long,2.242153,-0.551422
1,dog,short,-0.85023,-0.384097
2,cat,long,-0.997494,-0.1064
2,dog,short,0.090629,0.838314
3,cat,long,0.590194,0.796218
3,dog,short,-0.491236,-0.119021


In [76]:
# The same two levels given by position (1 = animal, 2 = hair_length); the
# result matches the by-name call.
df.stack(level=[1,2])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,1.600829,-0.543773
0,dog,short,-0.646046,-1.566344
1,cat,long,2.242153,-0.551422
1,dog,short,-0.85023,-0.384097
2,cat,long,-0.997494,-0.1064
2,dog,short,0.090629,0.838314
3,cat,long,0.590194,0.796218
3,dog,short,-0.491236,-0.119021


## Missing data
---

In [77]:
# Two-level column labels (exp, animal); note they are not in sorted order.
columns = pd.MultiIndex.from_tuples(
    list(zip(["A", "B", "B", "A"], ["cat", "dog", "cat", "dog"])),
    names=["exp", "animal"],
)

# Row labels: every (first, second) combination, 4 x 2 = 8 rows.
index = pd.MultiIndex.from_product(
    iterables=[["bar", "baz", "foo", "qux"], ["one", "two"]],
    names=["first", "second"],
)

df = pd.DataFrame(data=np.random.randn(8, 4), index=index, columns=columns)
df

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,0.259623,0.347766,-0.816513,0.777861
bar,two,-1.157936,-0.175472,-0.347472,0.876489
baz,one,0.275194,0.27129,1.000178,-1.052663
baz,two,-0.822972,0.142974,0.687843,0.135429
foo,one,-0.19032,-1.085831,-1.67996,-0.33676
foo,two,-1.184538,2.008237,-0.402711,1.129253
qux,one,-1.885936,-0.672341,-0.998001,-0.727207
qux,two,1.990918,0.295378,-0.901708,1.600366


In [78]:
# Keep six of the eight rows by position; positions 3 (baz, two) and
# 6 (qux, one) are left out.
df.iloc[[0, 1, 2, 4, 5, 7]]

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,0.259623,0.347766,-0.816513,0.777861
bar,two,-1.157936,-0.175472,-0.347472,0.876489
baz,one,0.275194,0.27129,1.000178,-1.052663
foo,one,-0.19032,-1.085831,-1.67996,-0.33676
foo,two,-1.184538,2.008237,-0.402711,1.129253
qux,two,1.990918,0.295378,-0.901708,1.600366


In [79]:
# The six-row subset, kept under a name for the stacking examples.
df2 = df.iloc[[0, 1, 2, 4, 5, 7]]
# Stack `exp` into the row index; the remaining `animal` level forms the columns.
df2.stack('exp')

Unnamed: 0_level_0,Unnamed: 1_level_0,animal,cat,dog
first,second,exp,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,A,0.259623,0.777861
bar,one,B,-0.816513,0.347766
bar,two,A,-1.157936,0.876489
bar,two,B,-0.347472,-0.175472
baz,one,A,0.275194,-1.052663
baz,one,B,1.000178,0.27129
foo,one,A,-0.19032,-0.33676
foo,one,B,-1.67996,-1.085831
foo,two,A,-1.184538,1.129253
foo,two,B,-0.402711,2.008237


In [80]:
# Stacking `animal` instead leaves `exp` (A, B) as the columns.
df2.stack("animal")

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
first,second,animal,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,cat,0.259623,-0.816513
bar,one,dog,0.777861,0.347766
bar,two,cat,-1.157936,-0.347472
bar,two,dog,0.876489,-0.175472
baz,one,cat,0.275194,1.000178
baz,one,dog,-1.052663,0.27129
foo,one,cat,-0.19032,-1.67996
foo,one,dog,-0.33676,-1.085831
foo,two,cat,-1.184538,-0.402711
foo,two,dog,1.129253,2.008237


In [81]:
# Rows 0, 1, 4, 7 and columns 1, 2 by position: only (B, dog) and (B, cat)
# remain, and `foo` / `qux` each keep a single `second` label.
df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
print(df3)
# Unstacking `second` leaves missing values wherever a (first, second) pair
# has no row in `df3`.
df3.unstack()

exp                  B          
animal             dog       cat
first second                    
bar   one     0.347766 -0.816513
      two    -0.175472 -0.347472
foo   one    -1.085831 -1.679960
qux   two     0.295378 -0.901708


exp,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,0.347766,-0.175472,-0.816513,-0.347472
foo,-1.085831,,-1.67996,
qux,,0.295378,,-0.901708


## Reshaping by melt
---
![Diagram: unpivoting a DataFrame with melt](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_melt.png)

In [83]:
# Two people, two identifier columns (first, last) and two measurements.
cheese = pd.DataFrame(
    data=dict(
        first=["John", "Mary"],
        last=["Doe", "Bo"],
        height=[5.5, 6.0],
        weight=[130, 150],
    )
)
cheese

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


In [85]:
# Keep `first` and `last` as identifiers; the other columns (height, weight)
# are unpivoted into `variable` / `value` rows.
cheese.melt(id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [86]:
# `var_name` names the generated label column: `quantity` instead of `variable`.
cheese.melt(id_vars=["first", "last"], var_name="quantity")

Unnamed: 0,first,last,quantity,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [87]:
# Same table as before, now with a two-level row index: (person, A), (person, B).
index = pd.MultiIndex.from_tuples([("person", letter) for letter in "AB"])
cheese = pd.DataFrame(
    data=dict(
        first=["John", "Mary"],
        last=["Doe", "Bo"],
        height=[5.5, 6.0],
        weight=[130, 150],
    ),
    index=index,
)
cheese

Unnamed: 0,Unnamed: 1,first,last,height,weight
person,A,John,Doe,5.5,130
person,B,Mary,Bo,6.0,150


In [88]:
# Without `ignore_index=False`, the (person, A/B) index is replaced by a fresh
# 0..n-1 index in the result.
cheese.melt(id_vars=["first", "last"])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [89]:
# `ignore_index=False` keeps the original (person, A/B) index labels, repeated
# for each melted column.
cheese.melt(id_vars=["first", "last"], ignore_index=False)

Unnamed: 0,Unnamed: 1,first,last,variable,value
person,A,John,Doe,height,5.5
person,B,Mary,Bo,height,6.0
person,A,John,Doe,weight,130.0
person,B,Mary,Bo,weight,150.0


In [92]:
# Wide table with "stub + year" column names (A1970, A1980, B1970, B1980) plus
# an unrelated random column X. Each column is given as a {row label: value}
# mapping over rows 0..2.
dft = pd.DataFrame(
    {
        "A1970": dict(enumerate("abc")),
        "A1980": dict(enumerate("def")),
        "B1970": dict(enumerate([2.5, 1.2, 0.7])),
        "B1980": dict(enumerate([3.2, 1.3, 0.1])),
        "X": dict(enumerate(np.random.randn(3))),
    }
)
print(dft)

# Expose the row labels as a regular column so they can serve as an identifier.
dft["id"] = dft.index
dft

  A1970 A1980  B1970  B1980         X
0     a     d    2.5    3.2  0.491276
1     b     e    1.2    1.3 -1.406375
2     c     f    0.7    0.1  0.637605


Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,0.491276,0
1,b,e,1.2,1.3,-1.406375,1
2,c,f,0.7,0.1,0.637605,2


In [93]:
# Split the stub columns A1970/A1980 and B1970/B1980 into columns `A` and `B`
# plus a `year` index level; rows are keyed by (`id`, `year`) and `X` is
# carried along for each year.
pd.wide_to_long(dft, ["A", "B"], i="id", j="year")

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,0.491276,a,2.5
1,1970,-1.406375,b,1.2
2,1970,0.637605,c,0.7
0,1980,0.491276,d,3.2
1,1980,-1.406375,e,1.3
2,1980,0.637605,f,0.1


## Combining with stats and GroupBy
---

## Pivot tables
---

In [95]:
import datetime

# 24-row sample table: three categorical keys (A, B, C), two random value
# columns (D, E) and a date column F.
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 6,
        "B": ["A", "B", "C"] * 8,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
        "D": np.random.randn(24),
        "E": np.random.randn(24),
        # The 1st of every month of 2013, followed by the 15th of every month.
        "F": [
            datetime.datetime(2013, month, day)
            for day in (1, 15)
            for month in range(1, 13)
        ],
    }
)
df

Unnamed: 0,A,B,C,D,E,F
0,one,A,foo,0.829658,-1.391325,2013-01-01
1,one,B,foo,0.261204,1.365327,2013-02-01
2,two,C,foo,1.109227,-0.364191,2013-03-01
3,three,A,bar,-0.376367,-2.813897,2013-04-01
4,one,B,bar,1.307618,1.099181,2013-05-01
5,one,C,bar,1.072084,-0.42896,2013-06-01
6,two,A,foo,-0.603552,-0.993419,2013-07-01
7,three,B,foo,0.032107,-1.41959,2013-08-01
8,one,C,foo,1.530825,0.543971,2013-09-01
9,one,A,bar,0.959526,0.052077,2013-10-01


In [97]:
# Summarise `D` for each (A, B) row group and each `C` column, using
# pivot_table's default aggregation (no `aggfunc` is passed). Empty cells are
# combinations that have no rows in `df`.
pd.pivot_table(df, values="D", index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.087668,1.45647
one,B,1.394712,0.153734
one,C,-0.011354,0.780821
three,A,0.205018,
three,B,,0.188606
three,C,-1.182397,
two,A,,-0.181606
two,B,0.173364,
two,C,,0.282819


In [98]:
# Sum of `D` per `B` row and per (A, C) column pair.
# The aggregation is named by its string alias: pandas resolves "sum" to its
# own group-wise sum, which is also what it did with `np.sum`, while passing
# the NumPy callable triggers a deprecation warning on newer pandas.
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc="sum")

A,one,one,three,three,two,two
C,bar,foo,bar,foo,bar,foo
B,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0.175335,2.91294,0.410037,,,-0.363211
B,2.789424,0.307468,,0.377212,0.346727,
C,-0.022708,1.561643,-2.364794,,,0.565639


In [100]:
# Same layout with two value columns: the result gains a top column level
# (D, E) above the (A, C) pairs.
# "sum" is passed as a string alias rather than `np.sum`; the result is the
# same and it avoids the callable-aggfunc deprecation warning on newer pandas.
pd.pivot_table(df, values=['D', 'E'], index=["B"], columns=["A", "C"], aggfunc="sum")

Unnamed: 0_level_0,D,D,D,D,D,D,E,E,E,E,E,E
A,one,one,three,three,two,two,one,one,three,three,two,two
C,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo
B,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
A,0.175335,2.91294,0.410037,,,-0.363211,1.981502,-2.135079,-2.064649,,,-1.384975
B,2.789424,0.307468,,0.377212,0.346727,,0.487157,1.393975,,-1.43256,2.995833,
C,-0.022708,1.561643,-2.364794,,,0.565639,-0.380405,-1.500794,1.130843,,,-0.046104


In [102]:
# Group the rows by month of the date column `F` (rows are labelled with the
# month-end date) and spread `D` across the values of `C`.
# NOTE(review): newer pandas versions reportedly prefer the "ME" alias for
# month-end frequency; confirm against the installed version before changing.
pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C")

C,bar,foo
F,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-31,,1.45647
2013-02-28,,0.153734
2013-03-31,,0.282819
2013-04-30,0.205018,
2013-05-31,1.394712,
2013-06-30,-0.011354,
2013-07-31,,-0.181606
2013-08-31,,0.188606
2013-09-30,,0.780821
2013-10-31,0.087668,


## Adding margins
---

In [103]:
# Standard deviation of D and E per (A, B) row group and `C` column;
# `margins=True` appends an "All" row and an "All" column per value column.
# - `values` is given explicitly so the date column `F` is not swept into the
#   aggregation (older pandas dropped it silently; newer pandas does not).
# - "std" is passed as a string alias: this is the pandas group-wise standard
#   deviation that `np.std` was being mapped to, without the deprecation warning.
df.pivot_table(
    index=["A", "B"],
    columns="C",
    values=["D", "E"],
    margins=True,
    aggfunc="std",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D,E,E,E
Unnamed: 0_level_1,C,bar,foo,All,bar,foo,All
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
one,A,1.232994,0.886445,1.18035,1.327485,0.457902,1.43857
one,B,0.123171,0.151985,0.725327,1.210005,0.945175,0.92431
one,C,1.532213,1.060665,1.169078,0.337654,1.830512,1.12229
three,A,0.822203,,0.822203,2.519524,,2.519524
three,B,,0.221323,0.221323,,0.994631,0.994631
three,C,1.533037,,1.533037,0.091901,,0.091901
two,A,,0.596723,0.596723,,0.425581,0.425581
two,B,0.656746,,0.656746,0.320548,,0.320548
two,C,,1.168717,1.168717,,0.482444,0.482444
All,,1.129712,0.80428,0.953725,1.271242,0.955751,1.14372


## Tiling
---

In [104]:
# Nine sample ages to bucket.
ages = np.array(
    [10, 15, 13, 12, 23, 25, 28, 59, 60]
)
# Cut the observed range into three intervals and label each age with the
# interval it falls in.
pd.cut(x=ages, bins=3)

[(9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (26.667, 43.333], (43.333, 60.0], (43.333, 60.0]]
Categories (3, interval[float64]): [(9.95, 26.667] < (26.667, 43.333] < (43.333, 60.0]]

## Computing indicator / dummy variables
---

In [107]:
# Six rows: a categorical `key` (a / b / c) and a running counter `data1`.
df = pd.DataFrame(
    data=dict(
        key=["b", "b", "a", "c", "a", "b"],
        data1=range(6),
    )
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [110]:
# One indicator column per distinct value of `key` (a, b, c).
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [111]:
# `prefix` is prepended to each indicator column name: key_a, key_b, key_c.
pd.get_dummies(df["key"], prefix="key")

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [113]:
# Put `data1` and the prefixed indicator columns side by side in one frame.
df[["data1"]].join(pd.get_dummies(df["key"], prefix="key"))

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0
