# 10 Minutes to Pandas

*Working through some of the 10 minutes to pandas examples at:*
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [1]:
import pandas as pd
import numpy as np


# Object Creation

In [3]:
# Build a small frame from equal-length lists: each keyword becomes one column.
df = pd.DataFrame(
    dict(
        Col1=["A", "B", "C"],
        Col2=[1, 2, 3],
        Col3=[2, 4, 6],
        Col4=[True, False, True],
    )
)
df

Unnamed: 0,Col1,Col2,Col3,Col4
0,A,1,2,True
1,B,2,4,False
2,C,3,6,True


In [10]:
# Six consecutive daily timestamps beginning on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# 6x4 block of standard-normal samples, indexed by `dates`, with columns A-D.
# The random generator is not seeded here, so the values differ on every run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.203114,-0.245909,-0.517315,-0.351319
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966
2013-01-04,-0.253256,-0.096927,-1.208043,0.230645
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608
2013-01-06,1.004663,-0.669362,-1.753058,-0.291354


In [13]:
# Each keyword becomes one column; the scalar values (A, B, F) are repeated
# for every row, while C, D and E supply four values each.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [15]:
 df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [18]:
# Row labels first, then column labels, one per line.
print(df.index, df.columns, sep="\n")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [20]:
# Underlying values as a NumPy array; row and column labels are not included.
df.to_numpy()

array([[-0.20311384, -0.24590897, -0.51731479, -0.3513185 ],
       [-0.10238503, -0.64165362, -0.92447611,  1.43585749],
       [-0.86937229,  2.35696867, -0.25911445, -0.12096572],
       [-0.25325603, -0.09692742, -1.20804256,  0.23064532],
       [ 0.98851171, -1.71946275, -0.2461405 , -0.57860812],
       [ 1.00466323, -0.66936227, -1.7530578 , -0.29135435]])

In [23]:
# df2's columns have different dtypes, so the array comes back as dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [38]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.094175,-0.169391,-0.818024,0.054043
std,0.748996,1.361567,0.594574,0.72871
min,-0.869372,-1.719463,-1.753058,-0.578608
25%,-0.24072,-0.662435,-1.137151,-0.336327
50%,-0.152749,-0.443781,-0.720895,-0.20616
75%,0.715788,-0.134173,-0.323665,0.142743
max,1.004663,2.356969,-0.246141,1.435857


In [43]:
# Swap rows and columns: the dates become column labels, A-D become the index.
df.transpose()

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.203114,-0.102385,-0.869372,-0.253256,0.988512,1.004663
B,-0.245909,-0.641654,2.356969,-0.096927,-1.719463,-0.669362
C,-0.517315,-0.924476,-0.259114,-1.208043,-0.246141,-1.753058
D,-0.351319,1.435857,-0.120966,0.230645,-0.578608,-0.291354


In [47]:
# Order the columns by their labels, descending (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.351319,-0.517315,-0.245909,-0.203114
2013-01-02,1.435857,-0.924476,-0.641654,-0.102385
2013-01-03,-0.120966,-0.259114,2.356969,-0.869372
2013-01-04,0.230645,-1.208043,-0.096927,-0.253256
2013-01-05,-0.578608,-0.246141,-1.719463,0.988512
2013-01-06,-0.291354,-1.753058,-0.669362,1.004663


In [52]:
# Order the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608
2013-01-06,1.004663,-0.669362,-1.753058,-0.291354
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857
2013-01-01,-0.203114,-0.245909,-0.517315,-0.351319
2013-01-04,-0.253256,-0.096927,-1.208043,0.230645
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966


# Selection

**While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized pandas data access methods:** `.at`, `.iat`, `.loc` and `.iloc`.


## Getting

In [53]:
# Single-column selection by name; the result is a Series named "A".
df["A"]

2013-01-01   -0.203114
2013-01-02   -0.102385
2013-01-03   -0.869372
2013-01-04   -0.253256
2013-01-05    0.988512
2013-01-06    1.004663
Freq: D, Name: A, dtype: float64

In [54]:
# Positional row slice: the first three rows.
df[:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.203114,-0.245909,-0.517315,-0.351319
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966


In [55]:
# Slicing with date strings selects rows by label; both endpoints are included.
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966
2013-01-04,-0.253256,-0.096927,-1.208043,0.230645


## Selection by label

*df.loc[row_indexer,column_indexer]*

For getting a cross section using a label:

In [81]:
# Show the label being looked up, then fetch the row with that label as a Series.
print(dates[0])

df.loc[dates[0]]

2013-01-01 00:00:00


A   -0.203114
B   -0.245909
C   -0.517315
D   -0.351319
Name: 2013-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [82]:
# All rows (":"), restricted to the columns labelled A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.203114,-0.245909
2013-01-02,-0.102385,-0.641654
2013-01-03,-0.869372,2.356969
2013-01-04,-0.253256,-0.096927
2013-01-05,0.988512,-1.719463
2013-01-06,1.004663,-0.669362


## Selection by Position

*df.iloc[row_indexer, column_indexer]*

In [83]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -0.253256
B   -0.096927
C   -1.208043
D    0.230645
Name: 2013-01-04 00:00:00, dtype: float64

By integer slices, acting similarly to NumPy/Python (lower bound included, upper bound excluded):

In [91]:
# Rows at positions 3 and 4, first two columns.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,-0.253256,-0.096927
2013-01-05,0.988512,-1.719463


## Boolean indexing

In [94]:
# Keep only the rows whose value in column A is positive.
df.loc[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608
2013-01-06,1.004663,-0.669362,-1.753058,-0.291354


Selecting values from a DataFrame where a boolean condition is met.

In [99]:
# Keep positive entries; every other cell becomes NaN (shape is unchanged).
df.where(df > 0)

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,1.435857
2013-01-03,,2.356969,,
2013-01-04,,,,0.230645
2013-01-05,0.988512,,,
2013-01-06,1.004663,,,


Using the isin() method for filtering:

In [100]:
# New frame (df itself is untouched) with an extra string column E to filter on.
# Note: this rebinds the name df2, which earlier held the mixed-dtype frame.
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.203114,-0.245909,-0.517315,-0.351319,one
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857,one
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966,two
2013-01-04,-0.253256,-0.096927,-1.208043,0.230645,three
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608,four
2013-01-06,1.004663,-0.669362,-1.753058,-0.291354,three


In a list:

In [101]:
# Rows whose E value is one of the listed strings.
df2.loc[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966,two
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608,four


Or a series:

In [106]:
# isin also accepts a Series; rows are kept when E equals one of its values.
s1 = pd.Series(["two","four"])
df2[df2["E"].isin(s1)]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966,two
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608,four


## Setting

Setting a new column automatically aligns the data by the indexes.

In [130]:
# s1 is indexed 2013-01-02 .. 2013-01-07, i.e. shifted one day relative to df.
# Note: this rebinds the name s1 used by the isin example above.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
print(s1)
# Assignment aligns on the index: df rows with no matching label in s1 get NaN
# in F, and labels of s1 that are not in df's index are not added.
df["F"] = s1
df

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64


Unnamed: 0,A,B,C,D,F
2013-01-01,-0.203114,-0.245909,-0.517315,-0.351319,
2013-01-02,-0.102385,-0.641654,-0.924476,1.435857,1.0
2013-01-03,-0.869372,2.356969,-0.259114,-0.120966,2.0
2013-01-04,-0.253256,-0.096927,-1.208043,0.230645,3.0
2013-01-05,0.988512,-1.719463,-0.246141,-0.578608,4.0
2013-01-06,1.004663,-0.669362,-1.753058,-0.291354,5.0


Setting values by label:

In [137]:
# Scalar assignment by label: row dates[0], column "A".
df.at[dates[0],"A"] = 0

Setting values by position:

In [138]:
# Scalar assignment by integer position: row 0, column 1 (column B in this frame).
df.iat[0, 1] = 0

Setting by assigning with a NumPy array:

In [139]:
# Overwrite every row of column D with the constant 5.
# NOTE(review): the output recorded further down shows D as integer 5; newer
# pandas releases may write through `.loc[:, col]` in place and keep the
# float64 dtype (displaying 5.0) — confirm against the installed version.
df.loc[:, "D"] = np.array([5] * len(df))

Results:

In [144]:
# Display the frame after the three assignments above.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.517315,5,
2013-01-02,-0.102385,-0.641654,-0.924476,5,1.0
2013-01-03,-0.869372,2.356969,-0.259114,5,2.0
2013-01-04,-0.253256,-0.096927,-1.208043,5,3.0
2013-01-05,0.988512,-1.719463,-0.246141,5,4.0
2013-01-06,1.004663,-0.669362,-1.753058,5,5.0


A `where` operation with setting.

In [145]:
# Work on a copy so df itself is left unchanged.
df2 = df.copy()
# For the cells selected by the mask (values > 0), assign the negated value;
# every non-missing entry of df2 ends up <= 0.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.517315,-5,
2013-01-02,-0.102385,-0.641654,-0.924476,-5,-1.0
2013-01-03,-0.869372,-2.356969,-0.259114,-5,-2.0
2013-01-04,-0.253256,-0.096927,-1.208043,-5,-3.0
2013-01-05,-0.988512,-1.719463,-0.246141,-5,-4.0
2013-01-06,-1.004663,-0.669362,-1.753058,-5,-5.0


## Missing data