In [1]:
import numpy as np
import pandas as pd

In [2]:
tuples = list(
        zip(
            *[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], 
                ["one", "two", "one", "two", "one", "two", "one", "two"], 
            ]))

index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [4]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [7]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.42318,0.128522
bar,two,0.476546,0.987772
baz,one,-0.233538,-1.613717
baz,two,-0.818391,-0.766759
foo,one,0.494105,-0.216617
foo,two,0.676298,-2.412386
qux,one,-2.563963,0.548738
qux,two,0.449537,-2.176408


In [9]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.42318,0.128522
bar,two,0.476546,0.987772
baz,one,-0.233538,-1.613717
baz,two,-0.818391,-0.766759


In [11]:
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.423180
               B    0.128522
       two     A    0.476546
               B    0.987772
baz    one     A   -0.233538
               B   -1.613717
       two     A   -0.818391
               B   -0.766759
dtype: float64

In [18]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.42318,-0.233538
one,B,0.128522,-1.613717
two,A,0.476546,-0.818391
two,B,0.987772,-0.766759


In [19]:
df = pd.DataFrame(
    {
        "A":["one", "one", "two", "three"] * 3, 
        "B":["A", "B", "C"] * 4, 
        "C":["foo", "foo", "foo", "bar", "bar", "bar"] * 2, 
        "D":np.random.randn(12), 
        "E":np.random.randn(12),
    })

df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.345533,0.88296
1,one,B,foo,0.610805,-0.15827
2,two,C,foo,0.834988,1.67972
3,three,A,bar,0.783287,-0.637413
4,one,B,bar,1.612932,-0.889584
5,one,C,bar,1.187076,0.863334
6,two,A,foo,1.937373,-0.43236
7,three,B,foo,-0.045829,0.86828
8,one,C,foo,-1.93757,-1.240921
9,one,A,bar,-0.632178,-1.15171


In [22]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.632178,-0.345533
one,B,1.612932,0.610805
one,C,1.187076,-1.93757
three,A,0.783287,
three,B,,-0.045829
three,C,1.368225,
two,A,,1.937373
two,B,0.544892,
two,C,,0.834988


In [33]:
rng = pd.date_range("1/1/2012", periods=3000, freq="S")
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               ...
               '2012-01-01 00:49:50', '2012-01-01 00:49:51',
               '2012-01-01 00:49:52', '2012-01-01 00:49:53',
               '2012-01-01 00:49:54', '2012-01-01 00:49:55',
               '2012-01-01 00:49:56', '2012-01-01 00:49:57',
               '2012-01-01 00:49:58', '2012-01-01 00:49:59'],
              dtype='datetime64[ns]', length=3000, freq='S')

In [34]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

2012-01-01 00:00:00    464
2012-01-01 00:00:01    212
2012-01-01 00:00:02    279
2012-01-01 00:00:03    153
2012-01-01 00:00:04     45
                      ... 
2012-01-01 00:49:55    395
2012-01-01 00:49:56    100
2012-01-01 00:49:57    381
2012-01-01 00:49:58     56
2012-01-01 00:49:59    277
Freq: S, Length: 3000, dtype: int32

In [35]:
ts.resample("5Min").sum()

2012-01-01 00:00:00    76853
2012-01-01 00:05:00    70677
2012-01-01 00:10:00    75093
2012-01-01 00:15:00    74239
2012-01-01 00:20:00    71845
2012-01-01 00:25:00    77873
2012-01-01 00:30:00    76063
2012-01-01 00:35:00    73690
2012-01-01 00:40:00    71979
2012-01-01 00:45:00    73104
Freq: 5T, dtype: int32

In [36]:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [37]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts

2012-03-06   -2.192989
2012-03-07   -1.102261
2012-03-08    0.752577
2012-03-09    0.220993
2012-03-10   -0.232500
Freq: D, dtype: float64

In [38]:
ts_utc = ts.tz_localize("UTC")
ts_utc

2012-03-06 00:00:00+00:00   -2.192989
2012-03-07 00:00:00+00:00   -1.102261
2012-03-08 00:00:00+00:00    0.752577
2012-03-09 00:00:00+00:00    0.220993
2012-03-10 00:00:00+00:00   -0.232500
Freq: D, dtype: float64

In [41]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00   -2.192989
2012-03-06 19:00:00-05:00   -1.102261
2012-03-07 19:00:00-05:00    0.752577
2012-03-08 19:00:00-05:00    0.220993
2012-03-09 19:00:00-05:00   -0.232500
Freq: D, dtype: float64

In [53]:
rng = pd.date_range("1/1/2012", periods=5, freq="m")
rng

DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
               '2012-05-31'],
              dtype='datetime64[ns]', freq='M')

In [54]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts

2012-01-31    0.751873
2012-02-29   -0.916665
2012-03-31    1.881274
2012-04-30    0.249782
2012-05-31    1.997578
Freq: M, dtype: float64

In [55]:
ps = ts.to_period()
ps

2012-01    0.751873
2012-02   -0.916665
2012-03    1.881274
2012-04    0.249782
2012-05    1.997578
Freq: M, dtype: float64

In [57]:
ps.to_timestamp()

2012-01-01    0.751873
2012-02-01   -0.916665
2012-03-01    1.881274
2012-04-01    0.249782
2012-05-01    1.997578
Freq: MS, dtype: float64

In [58]:
prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV")
prng

PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]', freq='Q-NOV')

In [59]:
ts = pd.Series(np.random.randn(len(prng)), prng)
ts

1990Q1   -0.481294
1990Q2    1.370599
1990Q3   -0.825853
1990Q4    0.368218
1991Q1    0.607073
1991Q2    0.590936
1991Q3   -0.752207
1991Q4   -1.047774
1992Q1   -0.233283
1992Q2   -0.078756
1992Q3    0.855250
1992Q4   -0.501370
1993Q1    1.276933
1993Q2    0.565063
1993Q3    0.596293
1993Q4   -0.217777
1994Q1    0.193297
1994Q2    0.723184
1994Q3    0.586532
1994Q4   -1.010894
1995Q1    1.383310
1995Q2   -1.326757
1995Q3    0.299336
1995Q4    0.972671
1996Q1    0.237288
1996Q2   -0.854098
1996Q3   -0.634873
1996Q4    0.752862
1997Q1    0.856730
1997Q2    0.237006
1997Q3   -0.096163
1997Q4   -0.757962
1998Q1   -0.995173
1998Q2    0.927104
1998Q3    0.263216
1998Q4    1.438274
1999Q1    0.163528
1999Q2   -1.003063
1999Q3   -0.709452
1999Q4   -1.476274
2000Q1    0.169187
2000Q2    1.561083
2000Q3   -1.334865
2000Q4    0.251168
Freq: Q-NOV, dtype: float64

In [64]:
ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9

In [65]:
ts

1990-03-01 09:00   -0.481294
1990-06-01 09:00    1.370599
1990-09-01 09:00   -0.825853
1990-12-01 09:00    0.368218
1991-03-01 09:00    0.607073
1991-06-01 09:00    0.590936
1991-09-01 09:00   -0.752207
1991-12-01 09:00   -1.047774
1992-03-01 09:00   -0.233283
1992-06-01 09:00   -0.078756
1992-09-01 09:00    0.855250
1992-12-01 09:00   -0.501370
1993-03-01 09:00    1.276933
1993-06-01 09:00    0.565063
1993-09-01 09:00    0.596293
1993-12-01 09:00   -0.217777
1994-03-01 09:00    0.193297
1994-06-01 09:00    0.723184
1994-09-01 09:00    0.586532
1994-12-01 09:00   -1.010894
1995-03-01 09:00    1.383310
1995-06-01 09:00   -1.326757
1995-09-01 09:00    0.299336
1995-12-01 09:00    0.972671
1996-03-01 09:00    0.237288
1996-06-01 09:00   -0.854098
1996-09-01 09:00   -0.634873
1996-12-01 09:00    0.752862
1997-03-01 09:00    0.856730
1997-06-01 09:00    0.237006
1997-09-01 09:00   -0.096163
1997-12-01 09:00   -0.757962
1998-03-01 09:00   -0.995173
1998-06-01 09:00    0.927104
1998-09-01 09:

In [81]:
df = pd.DataFrame(
    {"id":[1, 2, 3, 4, 5, 6], "raw_grade":["a", "b", "b", "a", "a", "e"]}
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [82]:
df["grade"] = df["raw_grade"].astype("category")

In [83]:
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [84]:
df["grade"].cat.categories = ["very good", "good", "very bad"]

In [85]:
df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']

In [86]:
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [88]:
df["grade"] = df["grade"].cat.set_categories(
        ["very bad", "bad", "medium", "good", "very godd"]
)
df["grade"]

0         NaN
1        good
2        good
3         NaN
4         NaN
5    very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very godd']

In [90]:
df.sort_values(by = "grade")

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,
3,4,a,
4,5,a,


In [91]:
df.groupby("grade").size()

grade
very bad     1
bad          0
medium       0
good         2
very godd    0
dtype: int64

In [94]:
ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000))
ts = ts.cumsum()
df = pd.DataFrame(
    np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"])

In [95]:
df

Unnamed: 0,A,B,C,D
2000-01-01,1.505938,1.297101,-1.799368,-0.787923
2000-01-02,1.403883,1.047527,0.556933,0.988298
2000-01-03,1.536611,-1.204021,0.046499,0.478297
2000-01-04,-0.055356,0.064605,0.483896,1.124936
2000-01-05,1.827185,-0.017855,-0.557296,-1.288858
...,...,...,...,...
2002-09-22,2.527250,0.524636,-2.012743,0.048905
2002-09-23,0.842552,-2.117459,0.794136,1.097257
2002-09-24,0.649829,1.097682,0.400777,-0.579290
2002-09-25,0.071439,0.783035,-0.024798,1.080846


In [96]:
df = df.cumsum()
df

Unnamed: 0,A,B,C,D
2000-01-01,1.505938,1.297101,-1.799368,-0.787923
2000-01-02,2.909821,2.344628,-1.242435,0.200375
2000-01-03,4.446432,1.140608,-1.195935,0.678672
2000-01-04,4.391076,1.205213,-0.712040,1.803608
2000-01-05,6.218261,1.187357,-1.269336,0.514750
...,...,...,...,...
2002-09-22,-18.132549,-55.012413,-52.945863,-20.105767
2002-09-23,-17.289997,-57.129872,-52.151727,-19.008509
2002-09-24,-16.640168,-56.032190,-51.750950,-19.587799
2002-09-25,-16.568729,-55.249155,-51.775747,-18.506953


In [97]:
df.to_csv("foo.csv")
pd.read_csv("foo.csv")

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,2000-01-01,1.505938,1.297101,-1.799368,-0.787923
1,2000-01-02,2.909821,2.344628,-1.242435,0.200375
2,2000-01-03,4.446432,1.140608,-1.195935,0.678672
3,2000-01-04,4.391076,1.205213,-0.712040,1.803608
4,2000-01-05,6.218261,1.187357,-1.269336,0.514750
...,...,...,...,...,...
995,2002-09-22,-18.132549,-55.012413,-52.945863,-20.105767
996,2002-09-23,-17.289997,-57.129872,-52.151727,-19.008509
997,2002-09-24,-16.640168,-56.032190,-51.750950,-19.587799
998,2002-09-25,-16.568729,-55.249155,-51.775747,-18.506953


In [98]:
df.to_hdf("foo.h5", "df")
pd.read_hdf("foo.h5", "df")

Unnamed: 0,A,B,C,D
2000-01-01,1.505938,1.297101,-1.799368,-0.787923
2000-01-02,2.909821,2.344628,-1.242435,0.200375
2000-01-03,4.446432,1.140608,-1.195935,0.678672
2000-01-04,4.391076,1.205213,-0.712040,1.803608
2000-01-05,6.218261,1.187357,-1.269336,0.514750
...,...,...,...,...
2002-09-22,-18.132549,-55.012413,-52.945863,-20.105767
2002-09-23,-17.289997,-57.129872,-52.151727,-19.008509
2002-09-24,-16.640168,-56.032190,-51.750950,-19.587799
2002-09-25,-16.568729,-55.249155,-51.775747,-18.506953


In [99]:
df.to_excel("foo.xlsx", sheet_name="sheet1")

In [100]:
pd.read_excel("foo.xlsx", "sheet1", index_col=None, na_values=["NA"])

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,2000-01-01,1.505938,1.297101,-1.799368,-0.787923
1,2000-01-02,2.909821,2.344628,-1.242435,0.200375
2,2000-01-03,4.446432,1.140608,-1.195935,0.678672
3,2000-01-04,4.391076,1.205213,-0.712040,1.803608
4,2000-01-05,6.218261,1.187357,-1.269336,0.514750
...,...,...,...,...,...
995,2002-09-22,-18.132549,-55.012413,-52.945863,-20.105767
996,2002-09-23,-17.289997,-57.129872,-52.151727,-19.008509
997,2002-09-24,-16.640168,-56.032190,-51.750950,-19.587799
998,2002-09-25,-16.568729,-55.249155,-51.775747,-18.506953


In [6]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    2.391602
b   -1.915499
c    0.898386
d   -0.807245
e    0.223235
dtype: float64

In [7]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [103]:
pd.Series(np.random.randn(5))

0   -0.200483
1    0.997108
2   -0.312030
3    0.344857
4   -1.054807
dtype: float64

In [2]:
d = {"b":1, "a":0, "c":2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [3]:
d = {"a":0.0, "b":1.0, "c":2.0}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [4]:
pd.Series(d, index=["b", "c", "d", "a"])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [5]:
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [8]:
s[0]

2.3916020590109497

In [9]:
s[:3]

a    2.391602
b   -1.915499
c    0.898386
dtype: float64

In [10]:
s[s > s.median()]

a    2.391602
c    0.898386
dtype: float64

In [11]:
s[[4, 3, 1]]

e    0.223235
d   -0.807245
b   -1.915499
dtype: float64

In [12]:
np.exp(s)

a    10.930992
b     0.147268
c     2.455638
d     0.446085
e     1.250115
dtype: float64

In [14]:
s.dtype

dtype('float64')

In [15]:
s.array

<PandasArray>
[ 2.3916020590109497, -1.9154988874598218,  0.8983864917059566,
  -0.807245317582255, 0.22323533087162073]
Length: 5, dtype: float64

In [16]:
s.to_numpy()

array([ 2.39160206, -1.91549889,  0.89838649, -0.80724532,  0.22323533])