### Pandas
Pandas is a opensource library providing data structures for analysis


#### Series
* A Series can be described as a single column of a 2D array or a matrix. 
* It has specific index values attached to each row for identification.
* Index values are automatically generated. It can also be explicitly defined.

In [1]:
#import pandas
import pandas as pd
import numpy as np

In [2]:
srs = pd.Series([10,20,30,40])

In [3]:
#printing series values
print("The series values are\n")
print(srs.values)

The series values are

[10 20 30 40]


In [4]:
#printing series index
print("The series index values are\n")
print(srs.index.values)

The series index values are

[0 1 2 3]


In [5]:
grate = pd.Series([11.2,36.0,16.6,21.8,34.2],index=['China','India','USA','Brazil','Pakistan'])

In [6]:
grate.name = 'Growth Rate' #set series name
grate.index.name = 'Country' # set series index name

In [7]:
print("The indexed series values are\n")
print(grate)

The indexed series values are

Country
China       11.2
India       36.0
USA         16.6
Brazil      21.8
Pakistan    34.2
Name: Growth Rate, dtype: float64


In [10]:
print(grate.values)
print(grate.name)

[11.2 36.  16.6 21.8 34.2]
Growth Rate


In [11]:
print(grate.index.values)
print(grate.index.name)

['China' 'India' 'USA' 'Brazil' 'Pakistan']
Country


##### If there are 2 series objects with same index names, some operations can be performed on them, and its done based on the values with same index names

In [16]:
grate1 = pd.Series([11.2,36.0,16.6,21.8,34.2], index = ['China','India','Brazil','USA','Pakistan'])
grate2 = pd.Series([20.3,11.2,36.0,16.6,21.8,8.7], index = ['Africa','China','India','Brazil','USA','Nigeria'])

In [17]:
result = grate1/grate2

In [18]:
result.name = "Result"
result.index.name = 'Country'

In [19]:
print(result)

Country
Africa      NaN
Brazil      1.0
China       1.0
India       1.0
Nigeria     NaN
Pakistan    NaN
USA         1.0
Name: Result, dtype: float64


In [20]:
print(pd.isnull(result))

Country
Africa       True
Brazil      False
China       False
India       False
Nigeria      True
Pakistan     True
USA         False
Name: Result, dtype: bool


In [22]:
print(pd.notnull(result))

Country
Africa      False
Brazil       True
China        True
India        True
Nigeria     False
Pakistan    False
USA          True
Name: Result, dtype: bool


In [23]:
print(result[result == 1.0]) #print indexes that satisfy the condition

Country
Brazil    1.0
China     1.0
India     1.0
USA       1.0
Name: Result, dtype: float64


In [24]:
print(result[result != 1.0])

Country
Africa     NaN
Nigeria    NaN
Pakistan   NaN
Name: Result, dtype: float64


#### Dataframe
* A dataframe can be described as a 2D array or matrix.
* It has data stored in it just like an excel sheet with multiple rows and columns
* Each dataframe object is associated with both a row index and column index.

In [37]:
#creating a dataframe
df=pd.DataFrame(np.arange(0,12).reshape(4,3),index=['Row1','Row2','Row3','Row4'],columns=['Col1','Col2','Col3'])

In [4]:
df.head()

Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,3,4,5
Row3,6,7,8
Row4,9,10,11


In [7]:
df.to_csv('Test.csv')

In [5]:
##Accessing elements of the dataframe
df.loc['Row1']

Col1    0
Col2    1
Col3    2
Name: Row1, dtype: int32

In [8]:
type(df.loc['Row2'])

pandas.core.series.Series

In [12]:
df.iloc[:,:]

Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,3,4,5
Row3,6,7,8
Row4,9,10,11


In [13]:
df.iloc[0:1,0:2]

Unnamed: 0,Col1,Col2
Row1,0,1


In [14]:
type(df.iloc[0:1,0:2])

pandas.core.frame.DataFrame

In [15]:
df.iloc[:2,:3]

Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,3,4,5


In [16]:
df.iloc[1:,:3]

Unnamed: 0,Col1,Col2,Col3
Row2,3,4,5
Row3,6,7,8
Row4,9,10,11


In [29]:
#To index by column
df['Col1']

Row1    0
Row2    3
Row3    6
Row4    9
Name: Col1, dtype: int32

In [30]:
type(df['Col1'])

pandas.core.series.Series

In [31]:
df[['Col1','Col2']]

Unnamed: 0,Col1,Col2
Row1,0,1
Row2,3,4
Row3,6,7
Row4,9,10


In [19]:
##Converting dataframe to array
df.iloc[1:,:3].values

array([[ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [20]:
type(df.iloc[1:,:3].values)

numpy.ndarray

In [21]:
#see shape of array
df.iloc[1:,:3].values.shape

(3, 3)

In [22]:
df.isnull()

Unnamed: 0,Col1,Col2,Col3
Row1,False,False,False
Row2,False,False,False
Row3,False,False,False
Row4,False,False,False


In [23]:
df.isnull().sum()

Col1    0
Col2    0
Col3    0
dtype: int64

In [27]:
df['Col1'].value_counts()

6    1
3    1
9    1
0    1
Name: Col1, dtype: int64

In [28]:
df['Col1'].unique()

array([0, 3, 6, 9])

### Reading CSV

In [4]:
df1=pd.read_csv('E:\Coursera\Python\Dataset\MercedesBenz.csv')

In [5]:
df1.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
test_df=pd.read_csv('Test1.csv',sep=';')

In [8]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,Col1,Col2,Col3
0,Row1,0,1,2
1,Row2,3,4,5
2,Row3,6,7,8
3,Row4,9,10,11


### Accessing file

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


In [11]:
df1.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4211.039202,0.019007,0.000238,0.074364,0.06106,0.427893,0.000713,0.002613,0.008791,0.010216,...,0.325968,0.049656,0.311951,0.019244,0.011879,0.008078,0.008791,0.000475,0.000713,0.001663
std,2423.078926,0.136565,0.015414,0.262394,0.239468,0.494832,0.026691,0.051061,0.093357,0.10057,...,0.468791,0.217258,0.463345,0.137399,0.108356,0.089524,0.093357,0.021796,0.026691,0.040752
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4202.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6310.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8416.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
df1['X0'].value_counts()

ak    432
y     348
z     335
x     302
ay    299
t     293
o     246
f     213
w     198
j     171
n     167
aj    162
az    161
s     116
ap    108
al     88
h      64
d      61
e      48
v      40
ai     38
af     34
m      34
am     28
i      25
at     21
u      20
ba     19
a      18
b      13
k      12
ad     12
aq     11
aw     11
r      10
ax      8
as      6
c       6
l       6
bc      6
au      5
ao      5
g       3
ae      1
bb      1
ag      1
an      1
av      1
p       1
Name: X0, dtype: int64

#### Learning about CSV

In [6]:
from io import StringIO, BytesIO

In [18]:
data=('col1,col2,col3,col4\n'
     'x,1,2,3\n'
     'y,4,5,6\n'
     'z,7,8,9')

In [19]:
type(data)

str

In [27]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3,col4
0,x,1,2,3
1,y,4,5,6
2,z,7,8,9


In [26]:
#### To read from specific columns
df2=pd.read_csv(StringIO(data), usecols=['col1','col3','col4'])

In [28]:
df2.to_csv('Test3')

In [29]:
print(data)

col1,col2,col3,col4
x,1,2,3
y,4,5,6
z,7,8,9


In [30]:
df2_obj=pd.read_csv(StringIO(data),dtype=object)

In [31]:
type(df2_obj)

pandas.core.frame.DataFrame

In [32]:
df2_obj['col1']

0    x
1    y
2    z
Name: col1, dtype: object

In [33]:
df2_obj['col1'][1]

'y'

In [8]:
data1=('a,b,c,d\n'
      '1,2,3,4\n'
      '5,6,7,8\n'
      '9,10,11,12')

In [36]:
df2_int=pd.read_csv(StringIO(data1),dtype=int)

In [37]:
df2_int['b']

0     2
1     6
2    10
Name: b, dtype: int32

In [38]:
df2_int['b'][2]

10

In [39]:
df2_float=pd.read_csv(StringIO(data1),dtype=float)

In [40]:
df2_float['b']

0     2.0
1     6.0
2    10.0
Name: b, dtype: float64

In [41]:
print(df2_float)

     a     b     c     d
0  1.0   2.0   3.0   4.0
1  5.0   6.0   7.0   8.0
2  9.0  10.0  11.0  12.0


### assigning different data types for columns

In [9]:
df3=pd.read_csv(StringIO(data1),dtype={'a':int,'b':float,'c':str,'d':'Int64'})

In [10]:
df3

Unnamed: 0,a,b,c,d
0,1,2.0,3,4
1,5,6.0,7,8
2,9,10.0,11,12


In [11]:
type(df3['a'])

pandas.core.series.Series

In [12]:
type(df3['a'][1])

numpy.int32

In [13]:
type(df3['d'][2])

numpy.int64

In [14]:
type(df3['c'][0])

str

In [15]:
##Check the data type of columns
df3.dtypes

a      int32
b    float64
c     object
d      Int64
dtype: object

In [16]:
phonics_data= ('index,a,e,i\n'
              '1,pat,pet,pit\n'
              '2,sat,set,sit\n'
               '3,bat,bet,bit')

In [17]:
phonics_data

'index,a,e,i\n1,pat,pet,pit\n2,sat,set,sit\n3,bat,bet,bit'

In [18]:
pd.read_csv(StringIO(phonics_data))

Unnamed: 0,index,a,e,i
0,1,pat,pet,pit
1,2,sat,set,sit
2,3,bat,bet,bit


In [20]:
phonics=pd.read_csv(StringIO(phonics_data),index_col=0)

In [21]:
phonics

Unnamed: 0_level_0,a,e,i
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,pat,pet,pit
2,sat,set,sit
3,bat,bet,bit


In [22]:
phonics['index']

KeyError: 'index'

In [23]:
phonics['a']

index
1    pat
2    sat
3    bat
Name: a, dtype: object

In [24]:
phonics['a'][3]

'bat'

In [25]:
phonics

Unnamed: 0_level_0,a,e,i
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,pat,pet,pit
2,sat,set,sit
3,bat,bet,bit


In [26]:
phonics['a'][1]

'pat'

In [32]:
phonics=pd.read_csv(StringIO(phonics_data),usecols=['e','i'],index_col=False)

In [33]:
phonics

Unnamed: 0,e,i
0,pet,pit
1,set,sit
2,bet,bit


## Read JSON to CSV

In [36]:
Data='{"name":"Adheesh", "email":"adheesh@email.com", "job":[{"Title1":"Team Lead", "Title2":"Sr.Developer"}]}'

In [37]:
pd.read_json(Data)

Unnamed: 0,name,email,job
0,Adheesh,adheesh@email.com,"{'Title1': 'Team Lead', 'Title2': 'Sr.Developer'}"


### Reindexing
* This method allows for adding new indexes and columns in series and dataframe without disturbing the initial setting of objects
* the function used is reindex().
* It is called by a series or dataframe object and a list of indexes is passed as a parameter.

In [25]:
print(grate1)
print(grate2)

China       11.2
India       36.0
Brazil      16.6
USA         21.8
Pakistan    34.2
dtype: float64
Africa     20.3
China      11.2
India      36.0
Brazil     16.6
USA        21.8
Nigeria     8.7
dtype: float64


In [28]:
grate3 = grate2.reindex(['China','India','Malaysia','USA','Brazil','Pakistan','England'])
print(grate3)

China       11.2
India       36.0
Malaysia     NaN
USA         21.8
Brazil      16.6
Pakistan     NaN
England      NaN
dtype: float64


In [29]:
grate4 = grate2.reindex(['China','India','Malaysia','USA','Brazil','Pakistan','England'],fill_value=0)
print(grate4)

China       11.2
India       36.0
Malaysia     0.0
USA         21.8
Brazil      16.6
Pakistan     0.0
England      0.0
dtype: float64


#### reindexing in dataframe

In [31]:
arr=np.arange(16).reshape(4,4)
print(arr)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]


In [39]:
dframe1 = pd.DataFrame(arr, index=['Row1','Row3','Row4','Row5'], columns=['Col1','Col2','Col3','Col4'])
print(dframe1)

      Col1  Col2  Col3  Col4
Row1     0     1     2     3
Row3     4     5     6     7
Row4     8     9    10    11
Row5    12    13    14    15


In [44]:
dframe2 = dframe1.reindex(['Row1','Row2','Row3','Row4','Row5'])
print(dframe2)

      Col1  Col2  Col3  Col4
Row1   0.0   1.0   2.0   3.0
Row2   NaN   NaN   NaN   NaN
Row3   4.0   5.0   6.0   7.0
Row4   8.0   9.0  10.0  11.0
Row5  12.0  13.0  14.0  15.0


In [45]:
dframe2 = dframe2.reindex(columns=['Col0','Col1','Col2','Col4'])
print(dframe2)

      Col0  Col1  Col2  Col4
Row1   NaN   0.0   1.0   3.0
Row2   NaN   NaN   NaN   NaN
Row3   NaN   4.0   5.0   7.0
Row4   NaN   8.0   9.0  11.0
Row5   NaN  12.0  13.0  15.0


#### Forward filling

In [46]:
#forward filling in series
print(grate3)

China       11.2
India       36.0
Malaysia     NaN
USA         21.8
Brazil      16.6
Pakistan     NaN
England      NaN
dtype: float64


In [48]:
print(grate3.ffill())

China       11.2
India       36.0
Malaysia    36.0
USA         21.8
Brazil      16.6
Pakistan    16.6
England     16.6
dtype: float64


In [49]:
#forward filling in dataframes
print(dframe2)

      Col0  Col1  Col2  Col4
Row1   NaN   0.0   1.0   3.0
Row2   NaN   NaN   NaN   NaN
Row3   NaN   4.0   5.0   7.0
Row4   NaN   8.0   9.0  11.0
Row5   NaN  12.0  13.0  15.0


In [50]:
#Fill values rowwise (axis=0 specifies its rowwise filling)
print(dframe2.ffill(axis=0))

      Col0  Col1  Col2  Col4
Row1   NaN   0.0   1.0   3.0
Row2   NaN   0.0   1.0   3.0
Row3   NaN   4.0   5.0   7.0
Row4   NaN   8.0   9.0  11.0
Row5   NaN  12.0  13.0  15.0


In [52]:
dframe2=dframe2.reindex(columns=['Col1','Col2','Col3','Col4'])
print(dframe2)

      Col1  Col2  Col3  Col4
Row1   0.0   1.0   NaN   3.0
Row2   NaN   NaN   NaN   NaN
Row3   4.0   5.0   NaN   7.0
Row4   8.0   9.0   NaN  11.0
Row5  12.0  13.0   NaN  15.0


In [53]:
print(dframe2.ffill(axis=1))

      Col1  Col2  Col3  Col4
Row1   0.0   1.0   1.0   3.0
Row2   NaN   NaN   NaN   NaN
Row3   4.0   5.0   5.0   7.0
Row4   8.0   9.0   9.0  11.0
Row5  12.0  13.0  13.0  15.0


#### Selecting and Dropping indexes in series and dataframe

In [59]:
print(grate)

Country
China       11.2
India       36.0
USA         16.6
Brazil      21.8
Pakistan    34.2
Name: Growth Rate, dtype: float64


In [60]:
grate['India']

36.0

In [61]:
grate[2]

16.6

In [62]:
grate[['India','USA']]

Country
India    36.0
USA      16.6
Name: Growth Rate, dtype: float64

In [64]:
s1=grate[['India','USA']]
type(s1)

pandas.core.series.Series

In [65]:
grate.drop('China')

Country
India       36.0
USA         16.6
Brazil      21.8
Pakistan    34.2
Name: Growth Rate, dtype: float64

#### an index can only be dropped by an index name and not by index number
grate.drop(grate[2]) ---> doesn't work

In [67]:
print(dframe1)

      Col1  Col2  Col3  Col4
Row1     0     1     2     3
Row3     4     5     6     7
Row4     8     9    10    11
Row5    12    13    14    15


In [68]:
dframe1['Col3']

Row1     2
Row3     6
Row4    10
Row5    14
Name: Col3, dtype: int32

In [69]:
dframe1[dframe1['Col3']>5]

Unnamed: 0,Col1,Col2,Col3,Col4
Row3,4,5,6,7
Row4,8,9,10,11
Row5,12,13,14,15


In [71]:
dframe1.drop('Row5')

Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,2,3
Row3,4,5,6,7
Row4,8,9,10,11


In [73]:
dframe1.drop('Col2',axis=1)
#axis=1 needs to be specified to let know that its column index

Unnamed: 0,Col1,Col3,Col4
Row1,0,2,3
Row3,4,6,7
Row4,8,10,11
Row5,12,14,15


#### Sorting in Pandas
 * Sorts a series in ascending order.
 * Both the indexes and values can be sorted.
 * sort_index() ---> sorts the index in ascending order
 * sort_values() ---> sorts the values in ascending order

In [7]:
print("Sorting numeric index")
num_series = pd.Series([1,4,5,8,3],index=[3,1,2,4,5])
print(num_series.sort_index())

Sorting numeric index
1    4
2    5
3    1
4    8
5    3
dtype: int64


In [8]:
print("Sorting alphabetic index")
alpha_series = pd.Series([1,4,5,8,3],index=['C','B','D','A','E'])
print(alpha_series.sort_index())

Sorting alphabetic index
A    8
B    4
C    1
D    5
E    3
dtype: int64


In [10]:
print("Sorting alphanumeric index")
alphanum_series = pd.Series([1,4,5,8,3],index=['1','5','A','C','B'])
print(alphanum_series.sort_index())

Sorting alphanumeric index
1    1
5    4
A    5
B    3
C    8
dtype: int64


In [11]:
print("Sorting numeric values")
num_series1 = pd.Series(np.random.randn(5))
print(num_series1.sort_values())

Sorting numeric values
0   -2.036619
2   -0.963085
1    0.079905
4    0.267093
3    0.953537
dtype: float64


In [12]:
print("Sorting alphabetic values")
alpha_series1 = pd.Series(['E','X','S','A','M'])
print(alpha_series1.sort_values())

Sorting alphabetic values
3    A
0    E
4    M
2    S
1    X
dtype: object


##### Rank
* Rank is basically the positioning of indexes according to the sorted values of a series.
* Rank is assigned to indexes from 1 to n based on the value corresponding to the index.

In [15]:
srs1 = pd.Series(np.random.randn(5))
print("Series before ranking\n",srs1)

Series before ranking
 0   -0.548893
1   -0.640953
2   -0.654400
3    0.206535
4    0.610313
dtype: float64


In [17]:
print("Rank before sorting")
print(srs1.rank())

Rank before sorting
0    3.0
1    2.0
2    1.0
3    4.0
4    5.0
dtype: float64


In [18]:
srs1 = srs1.sort_values()

In [20]:
print("Rank after sorting\n")
print(srs1.rank())

Rank after sorting

2    1.0
1    2.0
0    3.0
3    4.0
4    5.0
dtype: float64


#### Functions to deal with missing data
* isnull() - returns an object where all instances of NaN marked as TRUE and the rest as FALSE.

In [22]:
null_val = np.nan
srs = pd.Series(['A','B','C',null_val]) #Declaring series with null values
print("The series with null values\n",srs)

The series with null values
 0      A
1      B
2      C
3    NaN
dtype: object


In [23]:
#Checking null
print(srs.isnull())

0    False
1    False
2    False
3     True
dtype: bool


In [25]:
## Declaring dataframe with null values
df = pd.DataFrame([['A','B','c',null_val],[null_val,1,5,3],['D',null_val,'F','G']])
print("Dataframe with NaN values")
print(df)

Dataframe with NaN values
     0    1  2    3
0    A    B  c  NaN
1  NaN    1  5    3
2    D  NaN  F    G


In [26]:
#checking dataframe for null
print(df.isnull())

       0      1      2      3
0  False  False  False   True
1   True  False  False  False
2  False   True  False  False


#### Dropping NaN values
* This function drops the NaN values based on the different parameters that we define.
* dropna() - deletes all rows containing atleast one NaN value
* dropna(how='all') --> deletes rows where all values are  NaN
* dropna(axis=1) --> deletes columns where all values are NaN
* dropna(thresh=n) --> deletes rows that contains less than 'n' number of non NaN values

In [27]:
df1=pd.DataFrame([[1,2,3,4,5],[6,null_val,7,null_val,9],[null_val,'D','C',null_val,'E'],[null_val,null_val,null_val,null_val,null_val]])
print("Original dataframe")
print(df1)

Original dataframe
     0    1    2    3    4
0  1.0    2    3  4.0    5
1  6.0  NaN    7  NaN    9
2  NaN    D    C  NaN    E
3  NaN  NaN  NaN  NaN  NaN


In [28]:
#Dropping all null values
print(df1.dropna())

     0  1  2    3  4
0  1.0  2  3  4.0  5


In [29]:
#Dropping rows where all values are nan
print(df1.dropna(how='all'))

     0    1  2    3  4
0  1.0    2  3  4.0  5
1  6.0  NaN  7  NaN  9
2  NaN    D  C  NaN  E


In [30]:
#Dropping columns where all values are nan
print(df1.dropna(axis=1))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


In [31]:
#Dropping rows that contain less than 3 non nan values
print(df1.dropna(thresh=3))

     0    1  2    3  4
0  1.0    2  3  4.0  5
1  6.0  NaN  7  NaN  9
2  NaN    D  C  NaN  E


#### fillna(n) ---> replaces all NaN values with defined values passed as parameter
* Any alphanumeric value can be assigned

In [32]:
print("Original dataframe")
print(df1)

Original dataframe
     0    1    2    3    4
0  1.0    2    3  4.0    5
1  6.0  NaN    7  NaN    9
2  NaN    D    C  NaN    E
3  NaN  NaN  NaN  NaN  NaN


In [33]:
#Replacing nan with M
print(df1.fillna('M'))

   0  1  2  3  4
0  1  2  3  4  5
1  6  M  7  M  9
2  M  D  C  M  E
3  M  M  M  M  M


#### sum() function
* sum(axis=0) ---> calculates sum of each column of dataframe
* sum(axis=1) ---> calculates sum of each row of dataframe


In [34]:
df2 = pd.DataFrame(np.arange(0,9).reshape(3,3),index=['A','B','C'],columns=['A','B','C'])
print("The dataframe")
print(df2)

The dataframe
   A  B  C
A  0  1  2
B  3  4  5
C  6  7  8


In [35]:
#sum of each column 
print(df2.sum(axis=0))

A     9
B    12
C    15
dtype: int64


In [36]:
#sum of each row
print(df2.sum(axis=1))

A     3
B    12
C    21
dtype: int64


#### Finding minimum values
* min(axis=0) --> returns mininum value from each column
* min(axis=1) --> returns minimum value from each row

In [38]:
print("The dataframe")
print(df2)

The dataframe
   A  B  C
A  0  1  2
B  3  4  5
C  6  7  8


In [39]:
print(df2.min(axis=0))

A    0
B    1
C    2
dtype: int32


In [40]:
print(df2.min(axis=1))

A    0
B    3
C    6
dtype: int32


* idxmin(axis=0) --> returns the index with minimum value from each column
* idxmin(axis=1) --> returns the column with minimum value from each index

In [41]:
print("The dataframe")
df3=pd.DataFrame(np.arange(0,9).reshape(3,3),index=['A','B','C'],columns=['Col1','Col2','Col3'])
print(df3)

The dataframe
   Col1  Col2  Col3
A     0     1     2
B     3     4     5
C     6     7     8


In [42]:
#find index with minimum value from each column
print(df3.idxmin(axis=0))

Col1    A
Col2    A
Col3    A
dtype: object


In [43]:
#find column with minimum value from each index
print(df3.idxmin(axis=1))

A    Col1
B    Col1
C    Col1
dtype: object


### Multilevel indexing
* here one index refers to one or more indexes, and those indexes further refer to values

##### Multilevel indexing in series

In [45]:
srs2=pd.Series(np.arange(5),index=[['A','A','B','C','C'],[1,2,3,4,5]])
print(srs2)

A  1    0
   2    1
B  3    2
C  4    3
   5    4
dtype: int32


In [46]:
print("The A index") #Fetching elements at A index
print(srs2['A'])

The A index
1    0
2    1
dtype: int32


In [47]:
print("The B index")
print(srs2['B'])

The B index
3    2
dtype: int32


In [48]:
print(srs2['B'][3])

2


#### Multilevel indexing in dataframe

In [49]:
df4=pd.DataFrame(np.arange(25).reshape(5,5),index=[['A','A','A','B','B'],[1,2,3,4,5]],columns=[['USA','Pak','Pak','UK','Ind'],['Day','Day','Night','Night','Night']])
print("Multilevel index in dataframe")
print(df4)

Multilevel index in dataframe
    USA Pak          UK   Ind
    Day Day Night Night Night
A 1   0   1     2     3     4
  2   5   6     7     8     9
  3  10  11    12    13    14
B 4  15  16    17    18    19
  5  20  21    22    23    24


In [52]:
print(df4['USA'])

     Day
A 1    0
  2    5
  3   10
B 4   15
  5   20


In [55]:
print(df4['Pak']['Day'])

A  1     1
   2     6
   3    11
B  4    16
   5    21
Name: Day, dtype: int32


In [56]:
print(df4['Pak']['Day']['A'])

1     1
2     6
3    11
Name: Day, dtype: int32


In [57]:
print(df4['USA']['Day']['A'])

1     0
2     5
3    10
Name: Day, dtype: int32


In [58]:
print(df4['USA']['Day']['A'][3])

10
