# Pandas

## Features missing in NumPy
<br>
-- There is no way to attach labels to data. Data['10th marks']
<br>
-> No pre built method to fill missing values.
<br>
-> No way to group data.
<br>
-> No way to pivot data.

Pandas is built on top of NumPy to make data processing on relational data easier.
<br>
Ingesting, Storing, Pre-Processing, Summarising and visulalising data can be all done efficiently with Pandas.
<br><br>
Pandas Object: Series and Dataframes

In [2]:
import pandas as pd
import numpy as np

## Creating Series Object

In [8]:
s = pd.Series([0,1,1,2,3,5,8]) # all values must be of same data type
s  # Series with auto-created index
s2 = pd.Series([0.1,1,2,3,5,8])
print(s)
print(s2)

0    0
1    1
2    1
3    2
4    3
5    5
6    8
dtype: int64
0    0.1
1    1.0
2    2.0
3    3.0
4    5.0
5    8.0
dtype: float64


In [10]:
s.values

array([0, 1, 1, 2, 3, 5, 8], dtype=int64)

In [11]:
s.index

RangeIndex(start=0, stop=7, step=1)

In [12]:
for i in s.values:
    print(i)
for i in s.index:
    print(i)

0
1
1
2
3
5
8
0
1
2
3
4
5
6


In [13]:
for item in zip(s.index,s.values):
    print(item)

(0, 0)
(1, 1)
(2, 1)
(3, 2)
(4, 3)
(5, 5)
(6, 8)


In [14]:
s[2]

1

In [17]:
mars = pd.Series([0.33,57.9,4222.6], index=['Mass','Diameter','Day length'])
mars

Mass             0.33
Diameter        57.90
Day length    4222.60
dtype: float64

In [19]:
mars['Mass']

0.33

In [21]:
mars.Mass # Not recommended

0.33

In [22]:
arr = np.random.randint(0,10,10)

In [23]:
arr

array([9, 9, 0, 6, 6, 9, 1, 0, 0, 2])

In [26]:
rand_series = pd.Series(arr, index = np.arange(10,20)) # Series from a numpy array
rand_series

10    9
11    9
12    0
13    6
14    6
15    9
16    1
17    0
18    0
19    2
dtype: int32

In [27]:
d = {}
d['mass'] = 0.33
d['diameter'] = 57.9
d['daylength'] = 4222.60
d

{'mass': 0.33, 'diameter': 57.9, 'daylength': 4222.6}

In [28]:
mars = pd.Series(d)

In [30]:
mars # series from a dictionary

mass            0.33
diameter       57.90
daylength    4222.60
dtype: float64

In [31]:
mars = pd.Series(d, index = ['mass','diameter'])

In [33]:
mars # custom indexing with dictionaries

mass         0.33
diameter    57.90
dtype: float64

## iloc & loc

In [38]:
s = pd.Series([0.1,1,1,2,3,5,8], index=[1,2,3,4,5,6,7]) # Explicit indexing
s

1    0.1
2    1.0
3    1.0
4    2.0
5    3.0
6    5.0
7    8.0
dtype: float64

In [39]:
s.loc[4] # Value at Explicit location 4

2.0

In [40]:
s.iloc[4] # Value at implicit location 4 (index value)

3.0

In [43]:
mars.loc['mass']

0.33

In [44]:
mars.iloc[0]

0.33

In [45]:
mars.iloc[-1] # Last value

57.9

In [46]:
mars.iloc[0:2]

mass         0.33
diameter    57.90
dtype: float64

In [49]:
mars.loc['mass':'diameter']

mass         0.33
diameter    57.90
dtype: float64

## operations on series

In [51]:
mass = pd.Series([0.33,4.87,5.97,0.642,1898,568,86.2,102,0.0146], index = ['Mercury','Venus','Earth','Mars','Jupiter','Saturn','Uranus','Neptune','Pluto'])

In [52]:
mass

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.2000
Neptune     102.0000
Pluto         0.0146
dtype: float64

In [53]:
mass > 100

Mercury    False
Venus      False
Earth      False
Mars       False
Jupiter     True
Saturn      True
Uranus     False
Neptune     True
Pluto      False
dtype: bool

In [54]:
mass[mass > 100]

Jupiter    1898.0
Saturn      568.0
Neptune     102.0
dtype: float64

In [55]:
mass[(mass > 100) & (mass < 600)]

Saturn     568.0
Neptune    102.0
dtype: float64

In [56]:
mass * 2

Mercury       0.6600
Venus         9.7400
Earth        11.9400
Mars          1.2840
Jupiter    3796.0000
Saturn     1136.0000
Uranus      172.4000
Neptune     204.0000
Pluto         0.0292
dtype: float64

In [57]:
np.mean(mass)

296.22517777777773

In [58]:
np.amin(mass)

0.0146

In [59]:
np.median(mass)

5.97

In [61]:
big_mass = mass[mass>100]
big_mass

Jupiter    1898.0
Saturn      568.0
Neptune     102.0
dtype: float64

In [63]:
new_mass = mass + big_mass
new_mass

Earth         NaN
Jupiter    3796.0
Mars          NaN
Mercury       NaN
Neptune     204.0
Pluto         NaN
Saturn     1136.0
Uranus        NaN
Venus         NaN
dtype: float64

In [65]:
pd.isnull(new_mass) # Check for missing values

Earth       True
Jupiter    False
Mars        True
Mercury     True
Neptune    False
Pluto       True
Saturn     False
Uranus      True
Venus       True
dtype: bool

In [66]:
mass['moon'] = 0.7346 # adding new values
mass

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.2000
Neptune     102.0000
Pluto         0.0146
moon          0.7346
dtype: float64

In [70]:
mass.drop(['Pluto']) # dropping values

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.2000
Neptune     102.0000
moon          0.7346
dtype: float64

In [71]:
diameter = pd.Series([4879,12104,12756,3475,6792,142984,120536,51118,49528], index = ['Mercury','Venus','Earth','Mars','Jupiter','Saturn','Uranus','Neptune','Pluto'])

In [72]:
mass = pd.Series([0.33,4.87,5.97,0.642,1898,568,86.2,102,0.0146], index = ['Mercury','Venus','Earth','Mars','Jupiter','Saturn','Uranus','Neptune','Pluto'])

In [73]:
density = pd.Series([])

In [75]:
for planet in mass.index:
    density[planet] = mass[planet] / ((np.pi * (diameter[planet])**3)/6)

In [76]:
density

Mercury    5.426538e-12
Venus      5.244977e-12
Earth      5.493286e-12
Mars       2.921945e-11
Jupiter    1.156923e-08
Saturn     3.710970e-13
Uranus     9.400659e-14
Neptune    1.458411e-12
Pluto      2.295101e-16
dtype: float64

In [79]:
density = mass / (np.pi * np.power(diameter,3)/6)
density

Mercury    5.426538e-12
Venus      5.244977e-12
Earth      5.493286e-12
Mars       2.921945e-11
Jupiter    1.156923e-08
Saturn     3.710970e-13
Uranus     9.400659e-14
Neptune    1.458411e-12
Pluto      2.295101e-16
dtype: float64

In [80]:
mass['Planetx'] = 6

In [81]:
density = mass / (np.pi * np.power(diameter,3)/6)
density

Earth      5.493286e-12
Jupiter    1.156923e-08
Mars       2.921945e-11
Mercury    5.426538e-12
Neptune    1.458411e-12
Planetx             NaN
Pluto      2.295101e-16
Saturn     3.710970e-13
Uranus     9.400659e-14
Venus      5.244977e-12
dtype: float64

In [85]:
pd.isnull(density)

Earth      False
Jupiter    False
Mars       False
Mercury    False
Neptune    False
Planetx     True
Pluto      False
Saturn     False
Uranus     False
Venus      False
dtype: bool

In [90]:
density.fillna(np.mean(density)) # fill missing values

Earth      5.493286e-12
Jupiter    1.156923e-08
Mars       2.921945e-11
Mercury    5.426538e-12
Neptune    1.458411e-12
Planetx    1.290727e-09
Pluto      2.295101e-16
Saturn     3.710970e-13
Uranus     9.400659e-14
Venus      5.244977e-12
dtype: float64

In [93]:
density[pd.isnull(density)] = np.mean(density) # different method

In [92]:
density

Earth      5.493286e-12
Jupiter    1.156923e-08
Mars       2.921945e-11
Mercury    5.426538e-12
Neptune    1.458411e-12
Planetx    1.290727e-09
Pluto      2.295101e-16
Saturn     3.710970e-13
Uranus     9.400659e-14
Venus      5.244977e-12
dtype: float64

## Case Study


In [155]:
nifty = pd.read_csv("nifty-200623-235920.csv", index_col = 0).iloc[:,0]

In [156]:
nifty.head(25)

Date
01-Jan-2019    10910.10
02-Jan-2019    10792.50
03-Jan-2019    10672.25
04-Jan-2019    10727.35
07-Jan-2019    10771.80
08-Jan-2019    10802.15
09-Jan-2019    10855.15
10-Jan-2019    10821.60
11-Jan-2019    10794.95
14-Jan-2019    10737.60
15-Jan-2019    10886.80
16-Jan-2019    10890.30
17-Jan-2019    10905.20
18-Jan-2019    10906.95
21-Jan-2019    10961.85
22-Jan-2019    10922.75
23-Jan-2019    10831.50
24-Jan-2019    10849.80
25-Jan-2019    10780.55
28-Jan-2019    10661.55
29-Jan-2019    10652.20
30-Jan-2019    10651.80
31-Jan-2019    10830.95
01-Feb-2019    10893.65
04-Feb-2019    10912.25
Name: Close, dtype: float64

In [157]:
nifty.tail(10)

Date
17-Dec-2019    12165.00
18-Dec-2019    12221.65
19-Dec-2019    12259.70
20-Dec-2019    12271.80
23-Dec-2019    12262.75
24-Dec-2019    12214.55
26-Dec-2019    12126.55
27-Dec-2019    12245.80
30-Dec-2019    12255.85
31-Dec-2019    12168.45
Name: Close, dtype: float64

In [158]:
np.mean(nifty)

11432.632244897959

What fraction of days did the market close higher than the previous day close?

In [159]:
nifty[1:]-nifty[:-1] # not the correct answer

Date
01-Apr-2019    0.0
01-Aug-2019    0.0
01-Feb-2019    0.0
01-Jan-2019    NaN
01-Jul-2019    0.0
01-Mar-2019    0.0
01-Nov-2019    0.0
01-Oct-2019    0.0
02-Apr-2019    0.0
02-Aug-2019    0.0
02-Dec-2019    0.0
02-Jan-2019    0.0
02-Jul-2019    0.0
02-May-2019    0.0
03-Apr-2019    0.0
03-Dec-2019    0.0
03-Jan-2019    0.0
03-Jul-2019    0.0
03-Jun-2019    0.0
03-May-2019    0.0
03-Oct-2019    0.0
03-Sep-2019    0.0
04-Apr-2019    0.0
04-Dec-2019    0.0
04-Feb-2019    0.0
04-Jan-2019    0.0
04-Jul-2019    0.0
04-Jun-2019    0.0
04-Nov-2019    0.0
04-Oct-2019    0.0
              ... 
27-Nov-2019    0.0
27-Oct-2019    0.0
27-Sep-2019    0.0
28-Aug-2019    0.0
28-Feb-2019    0.0
28-Jan-2019    0.0
28-Jun-2019    0.0
28-Mar-2019    0.0
28-May-2019    0.0
28-Nov-2019    0.0
29-Aug-2019    0.0
29-Jan-2019    0.0
29-Jul-2019    0.0
29-Mar-2019    0.0
29-May-2019    0.0
29-Nov-2019    0.0
29-Oct-2019    0.0
30-Apr-2019    0.0
30-Aug-2019    0.0
30-Dec-2019    0.0
30-Jan-2019    0.0
30-Jul-

In [160]:
nifty.values[1:]-nifty.values[:-1]

array([-1.1760e+02, -1.2025e+02,  5.5100e+01,  4.4450e+01,  3.0350e+01,
        5.3000e+01, -3.3550e+01, -2.6650e+01, -5.7350e+01,  1.4920e+02,
        3.5000e+00,  1.4900e+01,  1.7500e+00,  5.4900e+01, -3.9100e+01,
       -9.1250e+01,  1.8300e+01, -6.9250e+01, -1.1900e+02, -9.3500e+00,
       -4.0000e-01,  1.7915e+02,  6.2700e+01,  1.8600e+01,  2.2100e+01,
        1.2810e+02,  6.9500e+00, -1.2580e+02, -5.4800e+01, -5.7400e+01,
       -3.7750e+01, -4.7600e+01, -2.1650e+01, -8.3450e+01, -3.6600e+01,
        1.3110e+02,  5.4400e+01,  1.8000e+00,  8.8450e+01, -4.4800e+01,
       -2.8650e+01, -1.4150e+01,  7.1000e+01,  1.2395e+02,  6.5550e+01,
        5.2000e+00, -2.2800e+01,  1.3265e+02,  1.3315e+02,  4.0500e+01,
        1.5500e+00,  8.3600e+01,  3.5350e+01,  7.0200e+01, -1.1350e+01,
       -6.4150e+01, -1.0265e+02,  1.2900e+02, -3.8200e+01,  1.2495e+02,
        5.3900e+01,  4.5250e+01,  4.4050e+01, -6.9250e+01, -4.5950e+01,
        6.7950e+01, -6.1450e+01,  6.7450e+01, -8.7650e+01,  1.24

In [161]:
np.sum((nifty.values[1:]-nifty.values[:-1])>0) / len(nifty)

0.5265306122448979

Moving average of last five days?

In [162]:
nifty.index[0]

'01-Jan-2019'

In [163]:
pd.Timestamp(nifty.index[0])

Timestamp('2019-01-01 00:00:00')

In [164]:
d = pd.Timestamp(nifty.index[0])
d.dayofweek

1

In [165]:
new_index = map(pd.Timestamp, nifty.index)
new_index

<map at 0x2b1be7c7748>

In [166]:
new_nifty = pd.Series(nifty.values, index = new_index) 
new_nifty.head(5)

2019-01-01    10910.10
2019-01-02    10792.50
2019-01-03    10672.25
2019-01-04    10727.35
2019-01-07    10771.80
dtype: float64

In [168]:
new_nifty.rolling(5).mean()

2019-01-01         NaN
2019-01-02         NaN
2019-01-03         NaN
2019-01-04         NaN
2019-01-07    10774.80
2019-01-08    10753.21
2019-01-09    10765.74
2019-01-10    10795.61
2019-01-11    10809.13
2019-01-14    10802.29
2019-01-15    10819.22
2019-01-16    10826.25
2019-01-17    10842.97
2019-01-18    10865.37
2019-01-21    10910.22
2019-01-22    10917.41
2019-01-23    10905.65
2019-01-24    10894.57
2019-01-25    10869.29
2019-01-28    10809.23
2019-01-29    10755.12
2019-01-30    10719.18
2019-01-31    10715.41
2019-02-01    10738.03
2019-02-04    10788.17
2019-02-05    10844.60
2019-02-06    10926.73
2019-02-07    10974.42
2019-02-08    10984.41
2019-02-11    10979.72
                ...   
2019-11-19    11886.52
2019-11-20    11918.25
2019-11-21    11937.51
2019-11-22    11941.30
2019-11-25    11979.15
2019-11-26    11998.67
2019-11-27    12018.99
2019-11-28    12055.54
2019-11-29    12083.87
2019-12-02    12078.76
2019-12-03    12070.06
2019-12-04    12058.56
2019-12-05 

Subset the data to include only data for Fridays

In [146]:
dow = new_nifty.copy()
for i in dow.index:
    dow[i]=i.dayofweek
dow.head(5)

2019-01-01    1.0
2019-01-02    2.0
2019-01-03    3.0
2019-01-04    4.0
2019-01-07    0.0
dtype: float64

In [147]:
new_nifty[dow == 4]

2019-01-04    10727.35
2019-01-11    10794.95
2019-01-18    10906.95
2019-01-25    10780.55
2019-02-01    10893.65
2019-02-08    10943.60
2019-02-15    10724.40
2019-02-22    10791.65
2019-03-01    10863.50
2019-03-08    11035.40
2019-03-15    11426.85
2019-03-22    11456.90
2019-03-29    11623.90
2019-04-05    11665.95
2019-04-12    11643.45
2019-04-26    11754.65
2019-05-03    11712.25
2019-05-10    11278.90
2019-05-17    11407.15
2019-05-24    11844.10
2019-05-31    11922.80
2019-06-07    11870.65
2019-06-14    11823.30
2019-06-21    11724.10
2019-06-28    11788.85
2019-07-05    11811.15
2019-07-12    11552.50
2019-07-19    11419.25
2019-07-26    11284.30
2019-08-02    10997.35
2019-08-09    11109.65
2019-08-16    11047.80
2019-08-23    10829.35
2019-08-30    11023.25
2019-09-06    10946.20
2019-09-13    11075.90
2019-09-20    11274.20
2019-09-27    11512.40
2019-10-04    11174.75
2019-10-11    11305.05
2019-10-18    11661.85
2019-10-25    11583.90
2019-11-01    11890.60
2019-11-08 

In [148]:
p = pd.Series([0,1,1,2,3,4,5,8,46,15],index=[1,2,3,4,5,6,7,8,9,10])

In [154]:
p.diff() # Series.diff() subtracts the previous element (n-1) from current element n; (first element will be ‘nan’). 

1      NaN
2      1.0
3      0.0
4      1.0
5      1.0
6      1.0
7      1.0
8      3.0
9     38.0
10   -31.0
dtype: float64

In [153]:
(p>2).any() #Series.any() checks if any of the values in series are True.

True

# Pandas series don't accept set values as they can't be indexed