In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


# Pandas tutorial (Day 11)
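This notebook explains the basics of pandas: creating Series and DataFrames, inspecting and sorting data, selecting rows and columns by label, position and boolean mask, and adding new columns.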

Install the required packages first:

    pip install pandas
    pip install numpy

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Object creation: build a Series from a plain list.
# np.nan marks a missing entry, which is why the values are stored as floats.
s = pd.Series([1, 3, np.nan, 5, 7, 9])
s

0    1.0
1    3.0
2    NaN
3    5.0
4    7.0
5    9.0
dtype: float64

In [4]:
# 20 consecutive calendar days starting on 2022-01-01 (daily frequency is the default).
dates = pd.date_range(start="20220101", periods=20)
dates

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
               '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
               '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Seed the random generator so the values are identical on every
# "Restart Kernel -> Run All"; without a seed each run produces a new table.
rng = np.random.default_rng(42)

# One row per entry in `dates` and four columns (A-D) of standard-normal samples.
df = pd.DataFrame(
    rng.standard_normal((len(dates), 4)),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


In [6]:
# Build a DataFrame from a dict: every key becomes a column.
# Scalar values ("A", "B", "F") are repeated for each of the four rows whose
# labels come from the Series passed as column "C".
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130111"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "E": np.array([3] * 4, dtype="int32"),
        "F": "females",
    }
)
# Only the last expression of a cell is displayed, so show the column dtypes here.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
E             int32
F            object
dtype: object

In [7]:
# Show the first 4 rows of df.
df.head(4)

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904


In [8]:
# Show the last 2 rows of df2.
df2.tail(2)

Unnamed: 0,A,B,C,E,F
2,1.0,2013-01-11,1.0,3,females
3,1.0,2013-01-11,1.0,3,females


In [9]:
# The row labels (index) of df2.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [10]:
# Convert df to a NumPy array: only the values are kept, the index and column labels are not.
df.to_numpy()

array([[-3.18618571,  0.60122272,  2.22683965,  0.44362161],
       [ 1.01777848, -0.43660311, -0.66979275,  0.77394228],
       [-0.52120332, -0.0741593 ,  0.11421125,  0.42302823],
       [-0.65229231, -0.48999298, -2.123533  ,  1.25904037],
       [ 1.08870962,  0.04784108, -2.27586801, -0.88321761],
       [ 0.19832679, -0.22082366,  0.52293598, -1.25930618],
       [ 1.54751414,  1.22905852,  0.23672945,  0.10487749],
       [ 0.66853559,  0.44118291, -0.27494705, -1.56585349],
       [ 0.50897807, -0.74913738, -0.64371403, -1.50161287],
       [ 0.79861499,  0.25927564, -0.58190166,  2.80924894],
       [ 0.02975419, -0.78339958,  0.76742249, -0.82890189],
       [-1.13302084, -1.35152824,  1.47500809,  0.09817935],
       [ 0.7439884 ,  2.06599647, -0.18457252, -0.31562511],
       [-0.1143743 , -1.05916222,  0.89519666,  0.36475463],
       [-1.7972071 ,  1.36893825,  0.74620354,  0.26544022],
       [-0.87066097,  0.55681007, -0.53425488, -0.41341299],
       [ 0.99662001, -0.

In [11]:
# df2 mixes floats, timestamps, ints and strings, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-11 00:00:00'), 1.0, 3, 'females'],
       [1.0, Timestamp('2013-01-11 00:00:00'), 1.0, 3, 'females'],
       [1.0, Timestamp('2013-01-11 00:00:00'), 1.0, 3, 'females'],
       [1.0, Timestamp('2013-01-11 00:00:00'), 1.0, 3, 'females']],
      dtype=object)

In [12]:
# Summary statistics for each column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,-0.108395,-0.051263,-0.117974,0.052904
std,1.14185,1.109928,1.091074,1.026684
min,-3.186186,-2.304899,-2.275868,-1.565853
25%,-0.706884,-0.757703,-0.650234,-0.558569
50%,0.034803,-0.147491,-0.22976,0.185159
75%,0.757645,0.567913,0.578753,0.441728
max,1.547514,2.065996,2.22684,2.809249


In [13]:
df2.T # Transpose: rows become columns and columns become rows

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2013-01-11 00:00:00,2013-01-11 00:00:00,2013-01-11 00:00:00,2013-01-11 00:00:00
C,1.0,1.0,1.0,1.0
E,3,3,3,3
F,females,females,females,females


In [14]:
# Sort the columns (axis=1) by their labels in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-01-01,0.443622,2.22684,0.601223,-3.186186
2022-01-02,0.773942,-0.669793,-0.436603,1.017778
2022-01-03,0.423028,0.114211,-0.074159,-0.521203
2022-01-04,1.25904,-2.123533,-0.489993,-0.652292
2022-01-05,-0.883218,-2.275868,0.047841,1.08871
2022-01-06,-1.259306,0.522936,-0.220824,0.198327
2022-01-07,0.104877,0.236729,1.229059,1.547514
2022-01-08,-1.565853,-0.274947,0.441183,0.668536
2022-01-09,-1.501613,-0.643714,-0.749137,0.508978
2022-01-10,2.809249,-0.581902,0.259276,0.798615


In [15]:
# Sort the columns (axis=1) by their labels in ascending order: A, B, C, D.
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


In [16]:
# Sort the rows (axis=0) by their index labels, latest date first.
# Pass ascending=True instead to get the earliest date first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-01-20,0.039852,-1.158289,-0.946224,0.883195
2022-01-19,-1.042034,1.756233,-0.603572,0.441097
2022-01-18,-0.489586,-2.304899,-0.885075,0.428048
2022-01-17,0.99662,-0.723834,0.379426,-0.468458
2022-01-16,-0.870661,0.55681,-0.534255,-0.413413
2022-01-15,-1.797207,1.368938,0.746204,0.26544
2022-01-14,-0.114374,-1.059162,0.895197,0.364755
2022-01-13,0.743988,2.065996,-0.184573,-0.315625
2022-01-12,-1.133021,-1.351528,1.475008,0.098179
2022-01-11,0.029754,-0.7834,0.767422,-0.828902


In [17]:
# Sort the rows by the values in column B, largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2022-01-13,0.743988,2.065996,-0.184573,-0.315625
2022-01-19,-1.042034,1.756233,-0.603572,0.441097
2022-01-15,-1.797207,1.368938,0.746204,0.26544
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-16,-0.870661,0.55681,-0.534255,-0.413413
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-10,0.798615,0.259276,-0.581902,2.809249
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-03,-0.521203,-0.074159,0.114211,0.423028


In [18]:
# Select a single column by its label; the result is a Series named "A".
df["A"]

2022-01-01   -3.186186
2022-01-02    1.017778
2022-01-03   -0.521203
2022-01-04   -0.652292
2022-01-05    1.088710
2022-01-06    0.198327
2022-01-07    1.547514
2022-01-08    0.668536
2022-01-09    0.508978
2022-01-10    0.798615
2022-01-11    0.029754
2022-01-12   -1.133021
2022-01-13    0.743988
2022-01-14   -0.114374
2022-01-15   -1.797207
2022-01-16   -0.870661
2022-01-17    0.996620
2022-01-18   -0.489586
2022-01-19   -1.042034
2022-01-20    0.039852
Freq: D, Name: A, dtype: float64

In [19]:
# Same single-column selection, this time for column "B".
df["B"]

2022-01-01    0.601223
2022-01-02   -0.436603
2022-01-03   -0.074159
2022-01-04   -0.489993
2022-01-05    0.047841
2022-01-06   -0.220824
2022-01-07    1.229059
2022-01-08    0.441183
2022-01-09   -0.749137
2022-01-10    0.259276
2022-01-11   -0.783400
2022-01-12   -1.351528
2022-01-13    2.065996
2022-01-14   -1.059162
2022-01-15    1.368938
2022-01-16    0.556810
2022-01-17   -0.723834
2022-01-18   -2.304899
2022-01-19    1.756233
2022-01-20   -1.158289
Freq: D, Name: B, dtype: float64

In [20]:
df[0:2] # row-wise selection: a slice inside [] picks rows by position (here rows 0 and 1)

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942


In [21]:
# An open-ended slice starting at position 0 selects every row.
df[0:]

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


In [22]:
# Label-based selection: the row whose index label is the first date, returned as a Series.
df.loc[dates[0]]

A   -3.186186
B    0.601223
C    2.226840
D    0.443622
Name: 2022-01-01 00:00:00, dtype: float64

In [23]:
# All rows (:), but only the columns labelled A and B.
df.loc[:,["A", "B"]]

Unnamed: 0,A,B
2022-01-01,-3.186186,0.601223
2022-01-02,1.017778,-0.436603
2022-01-03,-0.521203,-0.074159
2022-01-04,-0.652292,-0.489993
2022-01-05,1.08871,0.047841
2022-01-06,0.198327,-0.220824
2022-01-07,1.547514,1.229059
2022-01-08,0.668536,0.441183
2022-01-09,0.508978,-0.749137
2022-01-10,0.798615,0.259276


In [24]:
# A label slice with .loc includes both endpoints: rows 2022-01-02 through 2022-01-04, columns A, B and C.
df.loc["20220102":"20220104", ["A", "B", "C"]]

Unnamed: 0,A,B,C
2022-01-02,1.017778,-0.436603,-0.669793
2022-01-03,-0.521203,-0.074159,0.114211
2022-01-04,-0.652292,-0.489993,-2.123533


In [25]:
# Same inclusive date range as before, restricted to columns A and B.
df.loc["20220102":"20220104", ["A", "B"]]

Unnamed: 0,A,B
2022-01-02,1.017778,-0.436603
2022-01-03,-0.521203,-0.074159
2022-01-04,-0.652292,-0.489993


In [26]:
# A list of labels picks exactly those rows (2022-01-02 and 2022-01-04), not the range between them.
df.loc[["20220102","20220104"], ["A", "B", "C"]]

Unnamed: 0,A,B,C
2022-01-02,1.017778,-0.436603,-0.669793
2022-01-04,-0.652292,-0.489993,-2.123533


In [27]:
# A single row label with a list of columns returns that row's A and B values as a Series.
df.loc["20220111", ["A", "B"]]

A    0.029754
B   -0.783400
Name: 2022-01-11 00:00:00, dtype: float64

In [28]:
# .at reads one scalar value addressed by a row label and a column label.
df.at[dates[0], "A"]

-3.186185710965406

In [29]:
# Position-based selection: the row at position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -0.652292
B   -0.489993
C   -2.123533
D    1.259040
Name: 2022-01-04 00:00:00, dtype: float64

In [30]:
# Rows at positions 3 through 9; unlike .loc, the end of an .iloc slice is excluded.
df.iloc[3:10]

Unnamed: 0,A,B,C,D
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


In [31]:
df.iloc[0:5, 0:3] # row slice before the comma, column slice after it: rows 0-4, columns 0-2

Unnamed: 0,A,B,C
2022-01-01,-3.186186,0.601223,2.22684
2022-01-02,1.017778,-0.436603,-0.669793
2022-01-03,-0.521203,-0.074159,0.114211
2022-01-04,-0.652292,-0.489993,-2.123533
2022-01-05,1.08871,0.047841,-2.275868


In [32]:
df.iloc[0:5,] # first five rows; nothing follows the comma, so all columns are kept

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218


In [40]:
# All rows, only the first two columns by position (A and B).
df.iloc[:, 0:2]

Unnamed: 0,A,B
2022-01-01,-3.186186,0.601223
2022-01-02,1.017778,-0.436603
2022-01-03,-0.521203,-0.074159
2022-01-04,-0.652292,-0.489993
2022-01-05,1.08871,0.047841
2022-01-06,0.198327,-0.220824
2022-01-07,1.547514,1.229059
2022-01-08,0.668536,0.441183
2022-01-09,0.508978,-0.749137
2022-01-10,0.798615,0.259276


In [34]:
# Boolean mask on one column: keep only the rows where A is greater than 0.
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249
2022-01-11,0.029754,-0.7834,0.767422,-0.828902
2022-01-13,0.743988,2.065996,-0.184573,-0.315625
2022-01-17,0.99662,-0.723834,0.379426,-0.468458


In [35]:
# Element-wise mask over the whole frame: values that are not greater than 0 become NaN.
df[df>0]

Unnamed: 0,A,B,C,D
2022-01-01,,0.601223,2.22684,0.443622
2022-01-02,1.017778,,,0.773942
2022-01-03,,,0.114211,0.423028
2022-01-04,,,,1.25904
2022-01-05,1.08871,0.047841,,
2022-01-06,0.198327,,0.522936,
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,,
2022-01-09,0.508978,,,
2022-01-10,0.798615,0.259276,,2.809249


In [36]:
# Work on an independent copy so that the following edits leave df unchanged.
df3 = df.copy()
df3

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


In [37]:
# Add a text column to df3; the list holds 20 labels, one for each row.
df3["Adding another colunm"] = [
    "one", "one", "two", "three", "four", "three",
    "one", "one", "two", "three", "four", "three",
    "one", "one", "two", "three", "four", "three",
    "four", "three",
]
df3

Unnamed: 0,A,B,C,D,Adding another colunm
2022-01-01,-3.186186,0.601223,2.22684,0.443622,one
2022-01-02,1.017778,-0.436603,-0.669793,0.773942,one
2022-01-03,-0.521203,-0.074159,0.114211,0.423028,two
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904,three
2022-01-05,1.08871,0.047841,-2.275868,-0.883218,four
2022-01-06,0.198327,-0.220824,0.522936,-1.259306,three
2022-01-07,1.547514,1.229059,0.236729,0.104877,one
2022-01-08,0.668536,0.441183,-0.274947,-1.565853,one
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613,two
2022-01-10,0.798615,0.259276,-0.581902,2.809249,three


In [38]:
# Add a numeric column: the pattern 1, 3, 5, 6, 7 repeated four times gives 20 values.
df3["new"] = [1, 3, 5, 6, 7] * 4

# Keep only the first four columns (A-D) again, which drops the columns added above.
df3 = df3.iloc[:, 0:4]
df3

Unnamed: 0,A,B,C,D
2022-01-01,-3.186186,0.601223,2.22684,0.443622
2022-01-02,1.017778,-0.436603,-0.669793,0.773942
2022-01-03,-0.521203,-0.074159,0.114211,0.423028
2022-01-04,-0.652292,-0.489993,-2.123533,1.25904
2022-01-05,1.08871,0.047841,-2.275868,-0.883218
2022-01-06,0.198327,-0.220824,0.522936,-1.259306
2022-01-07,1.547514,1.229059,0.236729,0.104877
2022-01-08,0.668536,0.441183,-0.274947,-1.565853
2022-01-09,0.508978,-0.749137,-0.643714,-1.501613
2022-01-10,0.798615,0.259276,-0.581902,2.809249


# Assignment
- Add the mean value to `df3` as a new column (for example `df3["mean"] = ...`).