In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list. np.nan marks the missing entry and
# makes the whole Series float64 even though the other values are integers.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# A bare last expression lets the notebook display the value itself.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [4]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range("20130101", periods=6)
# A bare last expression lets the notebook display the index itself.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [6]:
# 6x4 frame of standard-normal draws: one row per date, columns A-D.
# NOTE(review): the generator is not seeded, so the numbers differ on every
# run; seed it if the displayed values need to be reproducible.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
# A bare last expression gives the rich table display instead of plain text.
df

                   A         B         C         D
2013-01-01 -1.649703 -1.071464  0.381759  1.171527
2013-01-02 -0.312421 -0.382980  0.277513 -1.126659
2013-01-03 -0.442370 -0.165818  0.043613  1.135830
2013-01-04  0.142984  0.573742  0.127119  0.654577
2013-01-05 -2.122763  0.478408 -2.424376 -0.607679
2013-01-06 -0.357333  0.711387  0.699599 -1.000434


In [8]:
# Build a frame from a dict, one column per dtype. The scalar entries
# ("A", "B", "F") are repeated for every row; the length-4 Series, array
# and Categorical supply the four rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
# A bare last expression gives the rich table display instead of plain text.
df2

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [9]:
# Each column keeps its own dtype, so a single DataFrame can mix several types.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [16]:
# Row labels: the DatetimeIndex that was passed as `index=dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Column labels: the four single-letter names built from list("ABCD").
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# Cheap here: every column is float64, so the values already share one dtype.
# The result is a bare ndarray; the index and column labels are not included.
df.to_numpy()

array([[-1.64970277, -1.07146383,  0.3817586 ,  1.17152711],
       [-0.31242085, -0.38297954,  0.27751348, -1.12665899],
       [-0.44237004, -0.16581848,  0.04361334,  1.13583004],
       [ 0.14298401,  0.57374192,  0.12711946,  0.65457684],
       [-2.12276349,  0.47840782, -2.42437626, -0.60767945],
       [-0.35733316,  0.71138666,  0.6995995 , -1.00043371]])

In [19]:
# Expensive here: the columns have different dtypes, so every value has to be
# cast to a generic Python object (note dtype=object in the result).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [20]:
# Quick per-column summary: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.790268,0.023879,-0.149129,0.03786
std,0.885656,0.690349,1.13793,1.069762
min,-2.122763,-1.071464,-2.424376,-1.126659
25%,-1.34787,-0.328689,0.06449,-0.902245
50%,-0.399852,0.156295,0.202316,0.023449
75%,-0.323649,0.549908,0.355697,1.015517
max,0.142984,0.711387,0.699599,1.171527


In [21]:
# Transpose: the columns A-D become rows and the dates become column labels.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.649703,-0.312421,-0.44237,0.142984,-2.122763,-0.357333
B,-1.071464,-0.38298,-0.165818,0.573742,0.478408,0.711387
C,0.381759,0.277513,0.043613,0.127119,-2.424376,0.699599
D,1.171527,-1.126659,1.13583,0.654577,-0.607679,-1.000434


In [22]:
# Sort along the column axis by label, in descending order (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.171527,0.381759,-1.071464,-1.649703
2013-01-02,-1.126659,0.277513,-0.38298,-0.312421
2013-01-03,1.13583,0.043613,-0.165818,-0.44237
2013-01-04,0.654577,0.127119,0.573742,0.142984
2013-01-05,-0.607679,-2.424376,0.478408,-2.122763
2013-01-06,-1.000434,0.699599,0.711387,-0.357333


In [23]:
# Reorder the rows by the values held in column "B" (ascending by default).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-01,-1.649703,-1.071464,0.381759,1.171527
2013-01-02,-0.312421,-0.38298,0.277513,-1.126659
2013-01-03,-0.44237,-0.165818,0.043613,1.13583
2013-01-05,-2.122763,0.478408,-2.424376,-0.607679
2013-01-04,0.142984,0.573742,0.127119,0.654577
2013-01-06,-0.357333,0.711387,0.699599,-1.000434


## Indexing

In [24]:
# Selecting a single column by its label returns a Series.
df["A"]

2013-01-01   -1.649703
2013-01-02   -0.312421
2013-01-03   -0.442370
2013-01-04    0.142984
2013-01-05   -2.122763
2013-01-06   -0.357333
Freq: D, Name: A, dtype: float64

In [25]:
# Attribute access gives the same Series as df["A"]. It only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method.
df.A

2013-01-01   -1.649703
2013-01-02   -0.312421
2013-01-03   -0.442370
2013-01-04    0.142984
2013-01-05   -2.122763
2013-01-06   -0.357333
Freq: D, Name: A, dtype: float64

In [26]:
# A slice inside [] selects rows; 0:3 returns the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.649703,-1.071464,0.381759,1.171527
2013-01-02,-0.312421,-0.38298,0.277513,-1.126659
2013-01-03,-0.44237,-0.165818,0.043613,1.13583


### Indexing by label

In [27]:
# Look up one row by its label; the result is a Series indexed by column name.
df.loc[dates[0]]

A   -1.649703
B   -1.071464
C    0.381759
D    1.171527
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
# ":" keeps every row; the list selects columns "A" and "B" by label.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-1.649703,-1.071464
2013-01-02,-0.312421,-0.38298
2013-01-03,-0.44237,-0.165818
2013-01-04,0.142984,0.573742
2013-01-05,-2.122763,0.478408
2013-01-06,-0.357333,0.711387


In [29]:
# Label slices include BOTH endpoints: the 2013-01-04 row is part of the result.
df.loc["20130102":"20130104", ["A","B"]]

Unnamed: 0,A,B
2013-01-02,-0.312421,-0.38298
2013-01-03,-0.44237,-0.165818
2013-01-04,0.142984,0.573742


In [30]:
# A single row label plus a list of columns reduces the result to a Series.
df.loc["20130102", ["A", "B"]]

A   -0.312421
B   -0.382980
Name: 2013-01-02 00:00:00, dtype: float64

In [31]:
# A row label and a column label together return a single scalar value.
df.loc[dates[0], "A"]

-1.6497027677420841

In [32]:
# Fast scalar-only access by label; returns the same value as
# df.loc[dates[0], "A"].
df.at[dates[0], "A"]

-1.6497027677420841

### Indexing by position

In [33]:
# Integer position 3 is the fourth row (positions start at 0).
df.iloc[3]

A    0.142984
B    0.573742
C    0.127119
D    0.654577
Name: 2013-01-04 00:00:00, dtype: float64

In [34]:
# Positional slices exclude the stop value: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.142984,0.573742
2013-01-05,-2.122763,0.478408


In [35]:
# Lists of integer positions pick out arbitrary rows and columns.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.312421,0.277513
2013-01-03,-0.44237,0.043613
2013-01-05,-2.122763,-2.424376


In [36]:
# Slice the rows only; ":" keeps every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.312421,-0.38298,0.277513,-1.126659
2013-01-03,-0.44237,-0.165818,0.043613,1.13583


In [37]:
# Slice the columns only; ":" keeps every row.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-1.071464,0.381759
2013-01-02,-0.38298,0.277513
2013-01-03,-0.165818,0.043613
2013-01-04,0.573742,0.127119
2013-01-05,0.478408,-2.424376
2013-01-06,0.711387,0.699599


In [38]:
# Scalar access by integer position: row 1, column 1 (the "B" value for 2013-01-02).
df.iat[1, 1]

-0.38297953902865245

### Boolean indexing