In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the UFO reports CSV from a shortened URL (needs network access to run)
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [3]:
ufo.tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


NaN ("Not a Number") -> it means the data is missing

## df.isnull()   - method to detect NA/NaN

In [6]:
ufo.isnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False
18240,False,True,False,False,False


#### Inverse method to detect values that are not null -> df.notnull()

In [7]:
ufo.notnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,True,False,True,True,True
18237,True,False,True,True,True
18238,True,False,False,True,True
18239,True,True,True,True,True
18240,True,False,True,True,True


## To get number of missing values in each column of df

In [8]:
# Very important, as it shows where the missing values are (count per column)
ufo.isnull().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [9]:
ufo.isnull().sum(axis=0)   # axis = 0 ->down the columns

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

__isnull() :__ is also a Series method, so it can be used to build a boolean mask for filtering a DataFrame

In [11]:
ufo[ufo['City'].isnull()].head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00


In [13]:
ufo.loc[ufo['City'].isnull(),:].head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00


### How to drop a row from a dataframe if any of its columns values are missing?

In [14]:
ufo.shape

(18241, 5)

In [17]:
ufo.dropna(how = 'any').shape

(2486, 5)

`how = 'any'` is the default for dropna()

In [18]:
ufo.dropna().shape

(2486, 5)

#### How to drop rows of a df based on NA in one or more specific columns

In [19]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [22]:
# Drop the rows that have a missing value in one specific column
ufo.dropna(subset = ['Colors Reported']).head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
12,Belton,RED,SPHERE,SC,6/30/1939 20:00
19,Bering Sea,RED,OTHER,AK,4/30/1943 23:00
36,Portsmouth,RED,FORMATION,VA,7/10/1945 1:30
44,Blairsden,GREEN,SPHERE,CA,6/30/1946 19:00
66,Wexford,BLUE,,PA,7/1/1947 20:00


In [23]:
# Drop the rows that have a missing value in either of the two listed columns
# (how='any' is the default)
ufo.dropna(subset = ['City','Colors Reported']).head(10)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
12,Belton,RED,SPHERE,SC,6/30/1939 20:00
19,Bering Sea,RED,OTHER,AK,4/30/1943 23:00
36,Portsmouth,RED,FORMATION,VA,7/10/1945 1:30
44,Blairsden,GREEN,SPHERE,CA,6/30/1946 19:00
66,Wexford,BLUE,,PA,7/1/1947 20:00
82,San Jose,BLUE,CHEVRON,CA,7/15/1947 21:00
84,Modesto,BLUE,DISK,CA,8/8/1947 22:00
91,Scipio,RED,SPHERE,IN,5/10/1948 19:00
111,Tarrant City,ORANGE,CIRCLE,AL,8/15/1949 22:00
120,Roswell,RED,,NM,3/22/1950 0:00


In [26]:
ufo.dropna(subset = ['City','Colors Reported'], how='any').shape  # If any of the column values are NA

(2877, 5)

In [25]:
ufo.dropna(subset = ['City','Colors Reported'], how='all').shape   # drop if both have NA

(18221, 5)

In [27]:
ufo.dropna(how='all').shape  # drop a row only if all of its values are NA

(18241, 5)

#### How to replace NA value in a column with any other

In [29]:
# Count the values after replacing NaN with a placeholder; the filled Series
# is not assigned back, so ufo itself is left unchanged
ufo['Colors Reported'].fillna(value = 'WHATEVER').value_counts()

WHATEVER                  15359
RED                         780
GREEN                       531
ORANGE                      528
BLUE                        450
YELLOW                      169
RED GREEN                    89
RED BLUE                     78
RED ORANGE                   44
GREEN BLUE                   34
RED GREEN BLUE               33
ORANGE YELLOW                26
RED YELLOW                   25
ORANGE GREEN                 23
YELLOW GREEN                 17
ORANGE BLUE                  10
RED YELLOW GREEN              9
YELLOW BLUE                   6
ORANGE GREEN BLUE             5
YELLOW GREEN BLUE             5
RED YELLOW GREEN BLUE         4
RED ORANGE YELLOW             4
RED ORANGE GREEN              3
RED ORANGE BLUE               3
RED YELLOW BLUE               3
ORANGE YELLOW GREEN           1
RED ORANGE YELLOW BLUE        1
ORANGE YELLOW BLUE            1
Name: Colors Reported, dtype: int64

### How can I replace the nans with averages of columns where they are?

In [7]:
# Build a 10x4 demo frame of random integers in [0, 5) with named columns.
# NOTE: no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(5, size = (10,4)), columns = ['A','B','C','D'])
# Cast the columns that will receive NaN to float up front: writing NaN into an
# integer column relies on an implicit upcast that newer pandas deprecates.
# Column D gets no NaN and stays integer.
df = df.astype({'A': float, 'B': float, 'C': float})
df.iloc[3:5,0] = np.nan   # rows 3-4 of column A
df.iloc[4:6,1] = np.nan   # rows 4-5 of column B
df.iloc[5:8,2] = np.nan   # rows 5-7 of column C
df

Unnamed: 0,A,B,C,D
0,2.0,0.0,1.0,3
1,2.0,2.0,4.0,0
2,0.0,2.0,2.0,1
3,,1.0,2.0,0
4,,,3.0,3
5,0.0,,,1
6,1.0,0.0,,0
7,0.0,3.0,,1
8,1.0,3.0,3.0,3
9,0.0,2.0,4.0,2


In [8]:
df.mean()   # nan values are skipped

A    0.750000
B    1.625000
C    2.714286
D    1.400000
dtype: float64

Fill the NaNs in each column with the mean of that same column

In [9]:
# Fill each column's NaNs with that column's own mean. df.mean() skips NaN and
# returns one value per column; fillna aligns those values on the column
# labels, so no per-column apply/lambda is needed.
df = df.fillna(df.mean())

In [10]:
df

Unnamed: 0,A,B,C,D
0,2.0,0.0,1.0,3
1,2.0,2.0,4.0,0
2,0.0,2.0,2.0,1
3,0.75,1.0,2.0,0
4,0.75,1.625,3.0,3
5,0.0,1.625,2.714286,1
6,1.0,0.0,2.714286,0
7,0.0,3.0,2.714286,1
8,1.0,3.0,3.0,3
9,0.0,2.0,4.0,2


If you want to impute missing values with the mean one column at a time, fill each column explicitly with its own mean, as shown below. Only the column you assign to is changed, and this form may be a little more readable.

In [11]:
# Rebuild a fresh 10x4 demo frame of random integers in [0, 5) with named columns.
# NOTE: no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(5, size = (10,4)), columns = ['A','B','C','D'])
# Cast the columns that will receive NaN to float up front: writing NaN into an
# integer column relies on an implicit upcast that newer pandas deprecates.
# Column D gets no NaN and stays integer.
df = df.astype({'A': float, 'B': float, 'C': float})
df.iloc[3:5,0] = np.nan   # rows 3-4 of column A
df.iloc[4:6,1] = np.nan   # rows 4-5 of column B
df.iloc[5:8,2] = np.nan   # rows 5-7 of column C
df

Unnamed: 0,A,B,C,D
0,0.0,2.0,1.0,0
1,2.0,4.0,3.0,4
2,2.0,1.0,0.0,3
3,,4.0,3.0,3
4,,,0.0,1
5,3.0,,,1
6,4.0,3.0,,1
7,0.0,0.0,,1
8,0.0,4.0,2.0,0
9,2.0,3.0,1.0,1


In [12]:
# Impute column A only: overwrite its missing entries with the mean of A
# (the mean is computed first and ignores NaN)
df.loc[df['A'].isnull(), 'A'] = df['A'].mean()
df

Unnamed: 0,A,B,C,D
0,0.0,2.0,1.0,0
1,2.0,4.0,3.0,4
2,2.0,1.0,0.0,3
3,1.625,4.0,3.0,3
4,1.625,,0.0,1
5,3.0,,,1
6,4.0,3.0,,1
7,0.0,0.0,,1
8,0.0,4.0,2.0,0
9,2.0,3.0,1.0,1


In [13]:
# Same idea for B and C: each column is filled with its own mean, because
# fillna aligns the per-column means on the column labels
df[['B', 'C']] = df[['B', 'C']].fillna(df[['B', 'C']].mean())
df

Unnamed: 0,A,B,C,D
0,0.0,2.0,1.0,0
1,2.0,4.0,3.0,4
2,2.0,1.0,0.0,3
3,1.625,4.0,3.0,3
4,1.625,2.625,0.0,1
5,3.0,2.625,1.428571,1
6,4.0,3.0,1.428571,1
7,0.0,0.0,1.428571,1
8,0.0,4.0,2.0,0
9,2.0,3.0,1.0,1
