In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Build the row labels first: six consecutive calendar days starting 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Create a DataFrame: 6 rows x 4 columns of standard-normal draws,
# labelled by `dates` on the rows and by the letters A-D on the columns.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df1

Unnamed: 0,A,B,C,D
2013-01-01,-0.648955,-0.669339,-0.079361,-0.363724
2013-01-02,-0.908517,-0.422272,0.76658,-0.157948
2013-01-03,0.023069,0.156762,0.611628,-1.339892
2013-01-04,1.098549,-2.735048,0.25563,0.836162
2013-01-05,-0.776153,0.294348,1.284883,-1.307682
2013-01-06,1.736154,-0.780714,-0.703217,0.48775


# Grouping

In [5]:
# Sample frame for the grouping examples: two label columns (A, B) to group on
# and two columns of random standard-normal values (C, D) to aggregate.
df = pd.DataFrame(
    data={
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.526539,0.681885
1,bar,one,2.499149,0.642857
2,foo,two,1.660234,1.496484
3,bar,three,-0.657639,-0.480141
4,foo,two,1.198502,0.136325
5,bar,two,-0.616255,-0.626335
6,foo,one,-1.253408,0.295741
7,foo,three,-0.684398,0.48997


In [6]:
# Grouping: split the rows by the label in column "A" and total each group.
# numeric_only=True restricts the sum to the numeric columns (C and D);
# without it, pandas >= 2.0 also "sums" the string column B by concatenating
# its values, which is not what the table below shows.
df.groupby("A").sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.225255,-0.46362
foo,1.447468,3.100405


In [7]:
# Grouping on two keys yields a hierarchical (A, B) row index;
# the remaining columns C and D are summed within each (A, B) pair.
df.groupby(by=["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.499149,0.642857
bar,three,-0.657639,-0.480141
bar,two,-0.616255,-0.626335
foo,one,-0.726869,0.977626
foo,three,-0.684398,0.48997
foo,two,2.858736,1.632809


# Reshaping

In [8]:
# Stack: pair every first-level label with its second-level label to get
# the (first, second) keys used for the MultiIndex below.
tuples = list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )
)
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [9]:
# Turn the (first, second) pairs into a two-level row index with named levels.
index = pd.MultiIndex.from_tuples(
    tuples,
    names=["first", "second"],
)
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [10]:
# 8 x 2 frame of standard-normal draws, one row per (first, second) pair.
df = pd.DataFrame(
    np.random.randn(8, 2),
    index=index,
    columns=["A", "B"],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.028292,1.328012
bar,two,0.950219,-0.461188
baz,one,-1.204071,-0.108049
baz,two,-0.867045,-0.237393
foo,one,0.250907,1.510669
foo,two,2.282618,-1.17185
qux,one,-1.512706,1.254703
qux,two,0.918788,-0.11809


In [11]:
# stack(): move the column labels (A, B) into a new innermost row-index level,
# turning the frame into a Series keyed by (first, second, column).
stacked = df.stack(level=-1)
stacked

first  second   
bar    one     A    0.028292
               B    1.328012
       two     A    0.950219
               B   -0.461188
baz    one     A   -1.204071
               B   -0.108049
       two     A   -0.867045
               B   -0.237393
foo    one     A    0.250907
               B    1.510669
       two     A    2.282618
               B   -1.171850
qux    one     A   -1.512706
               B    1.254703
       two     A    0.918788
               B   -0.118090
dtype: float64

In [12]:
# unstack(): the inverse of stack(). By default the innermost index level
# (here the former column labels A/B) is moved back to the columns.
stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.028292,1.328012
bar,two,0.950219,-0.461188
baz,one,-1.204071,-0.108049
baz,two,-0.867045,-0.237393
foo,one,0.250907,1.510669
foo,two,2.282618,-1.17185
qux,one,-1.512706,1.254703
qux,two,0.918788,-0.11809


In [13]:
# Unstack index level 1 ("second"): its labels one/two become the columns.
stacked.unstack(level=1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.028292,0.950219
bar,B,1.328012,-0.461188
baz,A,-1.204071,-0.867045
baz,B,-0.108049,-0.237393
foo,A,0.250907,2.282618
foo,B,1.510669,-1.17185
qux,A,-1.512706,0.918788
qux,B,1.254703,-0.11809


In [14]:
# Unstack index level 0 ("first"): its labels bar/baz/foo/qux become the columns.
stacked.unstack(level=0)

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,A,0.028292,-1.204071,0.250907,-1.512706
one,B,1.328012,-0.108049,1.510669,1.254703
two,A,0.950219,-0.867045,2.282618,0.918788
two,B,-0.461188,-0.237393,-1.17185,-0.11809


# Pivot Tables

In [15]:
# Sample frame for the pivot-table example: three label columns (A, B, C)
# built by repeating short patterns to 12 rows, plus two columns (D, E) of
# uniform random values in [0, 1).
df = pd.DataFrame(
    data={
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": (["foo"] * 3 + ["bar"] * 3) * 2,
        "D": np.random.random(12),
        "E": np.random.random(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.808089,0.443641
1,one,B,foo,0.323254,0.360207
2,two,C,foo,0.016289,0.455138
3,three,A,bar,0.487872,0.629827
4,one,B,bar,0.949959,0.435963
5,one,C,bar,0.655172,0.534897
6,two,A,foo,0.622017,0.636007
7,three,B,foo,0.963007,0.553717
8,one,C,foo,0.634961,0.693194
9,one,A,bar,0.758862,0.616295


In [16]:
# Pivot: rows keyed by (A, B), one column per value of C, cells holding the
# mean of D (the default aggregation). Combinations with no rows show as NaN.
df.pivot_table(values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.758862,0.808089
one,B,0.949959,0.323254
one,C,0.655172,0.634961
three,A,0.487872,
three,B,,0.963007
three,C,0.586666,
two,A,,0.622017
two,B,0.177241,
two,C,,0.016289


# Time Series

In [17]:
# 100 timestamps spaced one second apart, starting at midnight on 2012-01-01.
# The lowercase "s" alias is used because the uppercase "S" spelling is
# deprecated in pandas >= 2.2; "s" is accepted by older releases as well.
rng = pd.date_range("1/1/2012", periods=100, freq="s")
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [18]:
# One random integer in [0, 500) per timestamp in `rng`.
ts = pd.Series(
    np.random.randint(low=0, high=500, size=len(rng)),
    index=rng,
)
ts

2012-01-01 00:00:00    258
2012-01-01 00:00:01    318
2012-01-01 00:00:02     61
2012-01-01 00:00:03    472
2012-01-01 00:00:04    154
                      ... 
2012-01-01 00:01:35    131
2012-01-01 00:01:36    146
2012-01-01 00:01:37    376
2012-01-01 00:01:38    210
2012-01-01 00:01:39    394
Freq: S, Length: 100, dtype: int64

In [19]:
# Time zone representation: start from five timezone-naive daily timestamps.
rng = pd.date_range(start="3/6/2012 00:00", periods=5, freq="D")
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# One standard-normal draw per day in `rng`.
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012-03-06    0.232287
2012-03-07   -0.629850
2012-03-08   -0.027348
2012-03-09   -0.548201
2012-03-10   -0.359238
Freq: D, dtype: float64

In [21]:
# Attach a time zone to the naive index: the timestamps are declared to be UTC
# (the wall-clock values are unchanged, only the +00:00 offset is added).
ts_utc = ts.tz_localize(tz="UTC")
ts_utc

2012-03-06 00:00:00+00:00    0.232287
2012-03-07 00:00:00+00:00   -0.629850
2012-03-08 00:00:00+00:00   -0.027348
2012-03-09 00:00:00+00:00   -0.548201
2012-03-10 00:00:00+00:00   -0.359238
Freq: D, dtype: float64

In [22]:
# Express the same instants in US Eastern time (shown with a -05:00 offset).
ts_utc.tz_convert(tz="US/Eastern")

2012-03-05 19:00:00-05:00    0.232287
2012-03-06 19:00:00-05:00   -0.629850
2012-03-07 19:00:00-05:00   -0.027348
2012-03-08 19:00:00-05:00   -0.548201
2012-03-09 19:00:00-05:00   -0.359238
Freq: D, dtype: float64

In [23]:
# Five consecutive month-end dates starting from January 2012.
# The frequency is given as an offset object rather than the string alias "M":
# "M" is deprecated for date_range in pandas >= 2.2 (replaced by "ME"), while
# "ME" is not recognised by older releases. MonthEnd() works on both.
rng = pd.date_range("1/1/2012", periods=5, freq=pd.offsets.MonthEnd())
rng

DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
               '2012-05-31'],
              dtype='datetime64[ns]', freq='M')

In [24]:
# One standard-normal draw per month-end timestamp in `rng`.
ts = pd.Series(
    np.random.randn(len(rng)),
    index=rng,
)
ts

2012-01-31   -0.525298
2012-02-29    0.670314
2012-03-31    0.608369
2012-04-30   -0.290721
2012-05-31   -0.567854
Freq: M, dtype: float64

In [25]:
# Convert the timestamp index to periods; with no frequency given, the period
# frequency is taken from the index itself (monthly here).
ps = ts.to_period(freq=None)
ps

2012-01   -0.525298
2012-02    0.670314
2012-03    0.608369
2012-04   -0.290721
2012-05   -0.567854
Freq: M, dtype: float64

In [26]:
# Convert periods back to timestamps, anchored at the start of each period
# (the first day of each month).
ps.to_timestamp(how="start")

2012-01-01   -0.525298
2012-02-01    0.670314
2012-03-01    0.608369
2012-04-01   -0.290721
2012-05-01   -0.567854
Freq: MS, dtype: float64

In [27]:
# Quarterly periods from 1990Q1 through 2000Q4, with the fiscal year ending
# in November (frequency "Q-NOV").
prng = pd.period_range(start="1990Q1", end="2000Q4", freq="Q-NOV")
prng

PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]')

In [28]:
# One standard-normal draw per quarter in `prng`.
ts = pd.Series(np.random.randn(len(prng)), index=prng)
ts

1990Q1   -0.057723
1990Q2   -0.989539
1990Q3    0.956760
1990Q4    1.688280
1991Q1    1.212365
1991Q2    1.832722
1991Q3   -0.810978
1991Q4   -1.195899
1992Q1    1.574106
1992Q2   -0.907143
1992Q3    0.388014
1992Q4    0.780891
1993Q1   -1.210838
1993Q2   -0.868866
1993Q3   -0.787379
1993Q4   -1.311435
1994Q1    1.084740
1994Q2   -0.455752
1994Q3    1.302917
1994Q4    1.293255
1995Q1    0.560859
1995Q2   -1.046279
1995Q3   -0.759925
1995Q4   -0.913024
1996Q1   -0.938868
1996Q2   -0.950703
1996Q3   -0.500037
1996Q4   -0.025479
1997Q1   -0.057239
1997Q2    0.784972
1997Q3    0.547592
1997Q4    2.497819
1998Q1   -0.792562
1998Q2   -0.314206
1998Q3    0.481906
1998Q4    0.208585
1999Q1   -0.174309
1999Q2    0.361788
1999Q3    0.745441
1999Q4   -0.449199
2000Q1   -0.278589
2000Q2   -0.701618
2000Q3    0.057727
2000Q4    0.307094
Freq: Q-NOV, dtype: float64

In [29]:
# Re-label each quarter as 09:00 on the first day of the month that follows
# the quarter's end:
#   asfreq("M", "e")  -> the last month of the quarter
#   + 1               -> the following month
#   asfreq("h", "s")  -> the first hour of that month
#   + 9               -> nine hours later, i.e. 09:00
# The lowercase "h" alias is used because the uppercase "H" spelling is
# deprecated in pandas >= 2.2; "h" is accepted by older releases as well.
ts.index = (prng.asfreq("M", "e") + 1).asfreq("h", "s") + 9
ts.head()

1990-03-01 09:00   -0.057723
1990-06-01 09:00   -0.989539
1990-09-01 09:00    0.956760
1990-12-01 09:00    1.688280
1991-03-01 09:00    1.212365
Freq: H, dtype: float64

# Categoricals

In [30]:
# Sample frame for the categorical examples: six ids with a raw letter grade each.
df = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": ["a", "b", "b", "a", "a", "e"],
    }
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [31]:
# Categorical data: add a "grade" column holding the raw grades
# converted to the pandas "category" dtype.
df["grade"] = df["raw_grade"].astype(dtype="category")
df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [32]:
# Rename categories: replace the raw letter categories (a, b, e) with
# descriptive labels, matched by position.
# rename_categories returns a new categorical, so the result is assigned back
# to the column; assigning to `.cat.categories` directly is not supported in
# pandas >= 2.0.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [33]:
# Sorting: rows are ordered by the position of each grade in the category
# list (very good, good, very bad), not alphabetically by label.
df.sort_values("grade")

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [34]:
# Grouping: count the rows in each grade category.
# observed=False is passed explicitly so that every declared category is
# listed (even one with no rows) and so the result does not depend on the
# default for categorical groupers, which pandas >= 2.1 warns is changing.
df.groupby("grade", observed=False).size()

grade
very good    3
good         2
very bad     1
dtype: int64