In [1]:
import pandas as pd
from numpy import nan as NA
# Float Series with missing entries at positions 1 and 3
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

# Drop nan values from a series

In [2]:
# dropna() gives back the Series without its NaN entries (index 1 and 3 here)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [3]:
# Same result via a boolean mask: keep only the entries that are not NaN
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

# Drop nan values from dataframe

In [4]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is entirely NaN
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [5]:
# Show the frame: row 2 is entirely NaN, rows 1 and 3 are partly NaN
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Dropping all rows that contain at least one NaN value

In [6]:
# Defaults spelled out: work row-wise and drop a row as soon as it contains any NaN
cleaned = data.dropna(axis=0, how='any')

In [7]:
# Only row 0 has no NaN, so it is the only row left
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Dropping only the rows in which all values are NaN

In [9]:
# Row-wise: drop a row only when every one of its values is NaN (row 2 here)
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
# Add a new column labelled 4 in which every value is NaN (this modifies data in place)
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Dropping the columns in which all values are NaN

In [11]:
# Column-wise: drop a column only when all of its values are NaN (column 4 here)
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


# Insert NaN values into a DataFrame

In [12]:
import numpy as np
# 7x3 frame of standard-normal draws; no seed is set in this cell, so the values change on every run
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df

Unnamed: 0,0,1,2
0,1.413365,1.582045,-0.331536
1,0.655298,1.440612,1.152545
2,-1.644646,-1.195284,-0.559372
3,1.022671,1.149835,-0.707896
4,-0.645987,-0.38661,0.55107
5,0.857132,0.829503,0.253734
6,-0.042766,-0.683423,-1.391467


In [13]:
# Blank out column 1 in the first four rows (positions 0-3); this modifies df in place
df.iloc[:4,1] = NA
df

Unnamed: 0,0,1,2
0,1.413365,,-0.331536
1,0.655298,,1.152545
2,-1.644646,,-0.559372
3,1.022671,,-0.707896
4,-0.645987,-0.38661,0.55107
5,0.857132,0.829503,0.253734
6,-0.042766,-0.683423,-1.391467


In [14]:
# Also blank out column 2 in the first two rows, so rows 0 and 1 now hold two NaNs each
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,1.413365,,
1,0.655298,,
2,-1.644646,,-0.559372
3,1.022671,,-0.707896
4,-0.645987,-0.38661,0.55107
5,0.857132,0.829503,0.253734
6,-0.042766,-0.683423,-1.391467


In [15]:
# Drop every row that contains any NaN; only rows 4-6 are complete
df.dropna(how='any')

Unnamed: 0,0,1,2
4,-0.645987,-0.38661,0.55107
5,0.857132,0.829503,0.253734
6,-0.042766,-0.683423,-1.391467


# Dropna with thresh

In [16]:
df.dropna(thresh=2)    # keep only rows with at least 2 non-NaN values (rows 0 and 1 have just one, so they go)

Unnamed: 0,0,1,2
2,-1.644646,,-0.559372
3,1.022671,,-0.707896
4,-0.645987,-0.38661,0.55107
5,0.857132,0.829503,0.253734
6,-0.042766,-0.683423,-1.391467


# Forward fill (ffill)

In [17]:
# Fresh 6x3 random frame with NaN tails: column 1 from row 2 onward, column 2 from row 4 onward
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-1.015841,-0.427256,0.044093
1,-0.579705,-0.93276,-0.515771
2,0.089524,,1.012632
3,2.404323,,-0.198186
4,2.281084,,
5,-0.395055,,


In [18]:
# Forward fill: each NaN takes the last non-NaN value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-1.015841,-0.427256,0.044093
1,-0.579705,-0.93276,-0.515771
2,0.089524,-0.93276,1.012632
3,2.404323,-0.93276,-0.198186
4,2.281084,-0.93276,-0.198186
5,-0.395055,-0.93276,-0.198186


In [19]:
# Forward fill at most 2 consecutive NaN values per column; the rest of a longer run stays NaN
# (column 1 keeps NaN in rows 4 and 5). Uses DataFrame.ffill() because fillna(method=...) is
# deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.015841,-0.427256,0.044093
1,-0.579705,-0.93276,-0.515771
2,0.089524,-0.93276,1.012632
3,2.404323,-0.93276,-0.198186
4,2.281084,,-0.198186
5,-0.395055,,-0.198186


# Fill the data based on the mean

In [20]:
# Float Series with two missing entries, to be filled with the mean of the remaining values
data = pd.Series(data=[1.0, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [21]:
# Replace each NaN with the mean of the non-missing values (mean() skips NaN by default)
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Mapping

In [22]:
# City names paired with a numeric "ounces" column
data = pd.DataFrame(
    {
        'city': ['madrid', 'roma', 'viena', 'praga', 'bucuresti', 'sofia'],
        'ounces': [4, 3, 12, 6, 7.5, 8],
    }
)
data

Unnamed: 0,city,ounces
0,madrid,4.0
1,roma,3.0
2,viena,12.0
3,praga,6.0
4,bucuresti,7.5
5,sofia,8.0


In [23]:
# Lookup table keyed by lower-case city name
citytogod = {
    'madrid': 'shiv',
    'roma': 'shiv',
    'viena': 'shiv',
    'praga': 'vishnu',
    'bucuresti': 'vishnu',
    'sofia': 'vishnu',
}
# Lower-case the city names so they match the lookup keys
lowercased = data['city'].str.lower()

In [24]:
# Inspect the lower-cased city names that are used as lookup keys
lowercased

0       madrid
1         roma
2        viena
3        praga
4    bucuresti
5        sofia
Name: city, dtype: object

In [25]:
# Overwrite the city column in place with the citytogod value for each lower-cased name
data['city'] = lowercased.map(citytogod)

In [26]:
# The city column now holds the mapped values instead of the city names
data

Unnamed: 0,city,ounces
0,shiv,4.0
1,shiv,3.0
2,shiv,12.0
3,vishnu,6.0
4,vishnu,7.5
5,vishnu,8.0


# Mapping with lambda

In [27]:
# Rebuild the frame with the original city names for the lambda-based mapping
data = pd.DataFrame(
    {
        'city': ['madrid', 'roma', 'viena', 'praga', 'bucuresti', 'sofia'],
        'ounces': [4, 3, 12, 6, 7.5, 8],
    }
)
data

Unnamed: 0,city,ounces
0,madrid,4.0
1,roma,3.0
2,viena,12.0
3,praga,6.0
4,bucuresti,7.5
5,sofia,8.0


In [28]:
# One-step mapping: lower-case each name inside the lambda, then look it up in citytogod
data['city'].map(lambda name: citytogod[name.lower()])

0      shiv
1      shiv
2      shiv
3    vishnu
4    vishnu
5    vishnu
Name: city, dtype: object