In [1]:
import pandas as pd
import numpy as np

## Statistical Methods
When we load data into a DataFrame, we want to be able to get a quick overview of what it contains.

In [3]:
# Lay the integers 0..15 out as a 4x4 grid (filled row by row) and wrap it in a DataFrame
grid = np.arange(16).reshape((4, 4))
df = pd.DataFrame(grid)
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [10]:
# Column-wise totals: with no axis given, sum() adds down each column
column_totals = df.sum()
print(f'Sum: {column_totals}')

# Row-wise totals: axis=1 adds across the columns of each row
row_totals = df.sum(axis=1)
print(f'Sum of row: {row_totals}')

# Average of each column
column_means = df.mean()
print(f'Mean: {column_means}')

# Smallest and largest value found in each column
column_minima = df.min()
column_maxima = df.max()
print(f'Min: {column_minima}')
print(f'Max: {column_maxima}')

# Number of non-missing entries in each column
non_null_counts = df.count()
print(f'Numbers: {non_null_counts}')

# Summary table per column: count, mean, std, min, quartiles and max
df.describe()





Sum: 0    24
1    28
2    32
3    36
dtype: int64
Sum of row: 0     6
1    22
2    38
3    54
dtype: int64
Mean: 0    6.0
1    7.0
2    8.0
3    9.0
dtype: float64
Min: 0    0
1    1
2    2
3    3
dtype: int32
Max: 0    12
1    13
2    14
3    15
dtype: int32
Numbers: 0    4
1    4
2    4
3    4
dtype: int64


Unnamed: 0,0,1,2,3
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


## Handling NaN values
Missing data are the NaN values present in your dataset.
There is a simple method called `dropna` which drops NaN values from a dataset.

In [11]:
# Seven values with gaps at positions 3 and 5; the NaNs make pandas store the series as float64
raw_values = [1, 2, 3, np.nan, 4, np.nan, 5]
se = pd.Series(raw_values)
se

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    NaN
6    5.0
dtype: float64

In [12]:
# dropna() hands back a new Series without the NaN entries;
# the surviving values keep their original index labels
se_without_nan = se.dropna()
se_without_nan

0    1.0
1    2.0
2    3.0
4    4.0
6    5.0
dtype: float64

In [14]:
# 4x3 frame mixing complete, empty and partially filled rows
rows = [
    [1, 2, 3],                 # fully populated
    [np.nan, np.nan, np.nan],  # entirely missing
    [2, np.nan, 4],            # one gap
    [np.nan, 3, np.nan],       # two gaps
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,,
2,2.0,,4.0
3,,3.0,


In [18]:
# how='all' drops a row only when every value in it is NaN;
# rows that still hold at least one real value are kept
df.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
2,2.0,,4.0
3,,3.0,


In [19]:
# Add a fourth column (label 3) made up entirely of missing values.
# np.nan is the canonical spelling: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,,,,
2,2.0,,4.0,
3,,3.0,,


In [21]:
# Discard every column that consists only of NaN values
df = df.dropna(axis='columns', how='all')

In [24]:
# Discard every row that consists only of NaN values
df = df.dropna(axis='index', how='all')

In [25]:
# Show the frame with each remaining NaN replaced by 0 (the result is not assigned back to df)
df.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
2,2.0,0.0,4.0
3,0.0,3.0,0.0


## Hierarchical Indexing
**Why multi-indexing or hierarchical indexing is required?**
* Reason 1:
  * It allows us to work with higher-dimensional data
  * While working with single dimensional data like a series, a single level of index is sufficient
  * However, as we move to multi-dimensional data structures, we need to use multi-level indexing
* Reason 2:
  * It enables us to store and manipulate data with an arbitrary number of dimensions in lower-dimensional data structures like a series and a dataframe
  * Technically a series can only be used to store 1 dimensional data, but with multi-indexing we can also store 2d data in a series
  * Similarly we can only have 2 dimensional data in a dataframe, but with multi-indexing we can also store 3 dimensional data in a dataframe


In [27]:
# Two parallel label lists form a two-level (hierarchical) index:
# the first list is the outer level, the second the inner level.
outer_labels = ['a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'e', 'e']
inner_labels = [1, 2, 1, 2, 3, 1, 2, 1, 1, 2]

# NOTE: this object is a Series; the name `df` is kept because the following cells refer to it
df = pd.Series(np.arange(10), index=[outer_labels, inner_labels])
df

a  1    0
   2    1
b  1    2
   2    3
   3    4
c  1    5
   2    6
d  1    7
e  1    8
   2    9
dtype: int32

In [28]:
# Inspect the index object: each entry is an (outer, inner) label pair
label_pairs = df.index
label_pairs

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 1),
            ('e', 1),
            ('e', 2)],
           )

In [29]:
# Pivot the innermost index level (1, 2, 3) into columns, turning the two-level
# Series into a DataFrame; (outer, inner) pairs that do not exist become NaN
df.unstack(level=-1)

Unnamed: 0,1,2,3
a,0.0,1.0,
b,2.0,3.0,4.0
c,5.0,6.0,
d,7.0,,
e,8.0,9.0,
