# Missing Data

In [2]:
# pandas displays missing values as NaN
# the NaN value in pandas comes from numpy
# missing values differ from other types of data in that they do not really equal anything

## Notes:
# NaN is NOT EQUAL to an empty string, 0, True or False
# NaN is NOT EQUAL to nan or NAN (missing values are not equal to other missing values)

import numpy as np, pandas as pd

In [2]:
# NaN never compares equal to True.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
np.nan == True

False

In [3]:
# NaN never compares equal to False either.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
np.nan == False

False

In [4]:
# A missing value does not even equal another missing value.
# The np.NaN and np.NAN aliases were both removed in NumPy 2.0; np.nan is the name that remains.
np.nan == np.nan

False

In [5]:
# pandas has built-in functions to test for missing values
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
pd.isnull(np.nan) # isnull tests whether a value is missing

True

In [6]:
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
pd.notnull(np.nan) # notnull tests whether a value is NOT missing

False

# Loading data with missing values

In [7]:
# in the read_csv function, there are 3 parameters related to reading missing values
#1) na_values
# --> allows you to declare additional missing-value markers, for example na_values = [99]
# --> any cell with the value 99 will then be treated as a missing value

#2) keep_default_na
# --> is a bool that specifies whether the default missing-value markers are kept in addition to na_values
# --> this parameter is True by default
# --> related to number 1
# --> if this is False, only the values specified in na_values will be used to identify missing values

#3) na_filter
# --> is a bool that specifies whether any values will be read as missing at all
# --> this parameter is True by default, meaning that any missing values will be coded as NaN


In [8]:
# Location of the survey "visited" table; the raw string (r"...") keeps the backslashes literal.
# NOTE(review): absolute, machine-specific path — the cells below only run where this file exists.
visited_file = r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\survey_visited.csv"

In [9]:
# Read with the default settings: the empty "dated" cell in row 5 is shown as NaN.
print(pd.read_csv(visited_file))

ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [17]:
# With the default missing-value markers switched off and no na_values given,
# the empty "dated" cell in row 5 is no longer shown as NaN (see output).
print(pd.read_csv(visited_file, keep_default_na = False))

ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3            
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [15]:
# Default markers off, but the empty string is declared as a missing-value marker:
# row 5 of "dated" is NaN again (see output).
print(pd.read_csv(visited_file, na_values = [""], keep_default_na =  False))

ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [19]:
# Same call as the first read (all defaults), repeated for comparison with the two cells above.
print(pd.read_csv(visited_file))

ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


# Merged Data

In [3]:
# Load the two tables that are merged in the next cell.
# NOTE(review): these paths include a "c4" sub-folder, unlike visited_file above — confirm which location is correct.
survey = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\c4\survey_survey.csv")
visited = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\c4\survey_visited.csv")

In [21]:
# Join every visit to its survey readings: visited.ident is matched against survey.taken.
vs = pd.merge(visited, survey, left_on="ident", right_on="taken")
# A whole column of missing values can be created by assigning np.nan.
vs["missing values"] = np.nan
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading,missing values
0,619,DR-1,1927-02-08,619,dyer,rad,9.82,
1,619,DR-1,1927-02-08,619,dyer,sal,0.13,
2,622,DR-1,1927-02-10,622,dyer,rad,7.8,
3,622,DR-1,1927-02-10,622,dyer,sal,0.09,
4,734,DR-3,1939-01-07,734,pb,rad,8.41,
5,734,DR-3,1939-01-07,734,lake,sal,0.05,
6,734,DR-3,1939-01-07,734,pb,temp,-21.5,
7,735,DR-3,1930-01-12,735,pb,rad,7.22,
8,735,DR-3,1930-01-12,735,,sal,0.06,
9,735,DR-3,1930-01-12,735,,temp,-26.0,


# Re-indexing by slicing 

In [22]:
# Load the gapminder table from an Excel workbook.
# NOTE(review): absolute, machine-specific path — only runs where this file exists.
gapminder = pd.read_excel(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\gapminder.xlsx")

In [23]:
# List the column names available in the gapminder table.
gapminder.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [42]:
# Yearly mean of life expectancy, population and GDP per capita.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple of names was deprecated and raises in pandas >= 2.0.
report = gapminder.groupby(["year"])[["lifeExp", "pop", "gdpPercap"]].mean()
report

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,16950400.0,3725.276046
1957,51.507401,18763410.0,4299.408345
1962,53.609249,20421010.0,4725.812342
1967,55.67829,22658300.0,5483.653047
1972,57.647386,25189980.0,6770.082815
1977,59.570157,27676380.0,7313.166421
1982,61.533197,30207300.0,7518.901673
1987,63.212613,33038570.0,7900.920218
1992,64.160338,35990920.0,8158.608521
1997,65.014676,38839470.0,9090.175363


In [47]:
# The index of the grouped result holds the years present in the data
# (every fifth year from 1952 to 2007, per the output).
report.index

Int64Index([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
            2007],
           dtype='int64', name='year')

In [50]:
# Ask for the years 2000-2009 even though only 2002 and 2007 exist in the index.
# .loc raises KeyError when any requested label is missing (pandas >= 1.0), so
# reindex() is used instead: it returns the same rows and fills absent years with NaN.
report1 = report.reindex(range(2000, 2010))
report1

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,,,
2001,,,
2002,65.694923,41457590.0,9917.848365
2003,,,
2004,,,
2005,,,
2006,,,
2007,67.007423,44021220.0,11680.07182
2008,,,
2009,,,


In [51]:
# The new index contains every requested year, 2000 through 2009 (see output).
report1.index

Int64Index([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009], dtype='int64', name='year')

In [53]:
# Keep only the rows whose year label is later than 2000 (boolean mask applied through .loc).
report2 = report.loc[report.index > 2000]
report2

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2002,65.694923,41457590.0,9917.848365
2007,67.007423,44021220.0,11680.07182


In [58]:
# reindex() conforms the frame to the requested labels;
# years that are not present in report2 come back as all-NaN rows
report2.reindex(index=range(2000, 2010))

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,,,
2001,,,
2002,65.694923,41457590.0,9917.848365
2003,,,
2004,,,
2005,,,
2006,,,
2007,67.007423,44021220.0,11680.07182
2008,,,
2009,,,


# Find and count missing data

In [14]:
# Load the Ebola country time-series table used for the rest of the notebook.
# NOTE(review): absolute, machine-specific path — only runs where this file exists.
ebola = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\country_timeseries.csv")

In [15]:
# info() reports each column's non-null count and dtype; any count below the
# 122 total entries marks a column that contains missing values.
ebola.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            17 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.3+ KB


In [16]:
ebola.count() # this counts the non-missing values in each column (a value of 0 is a real value, not a missing one)

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             17
dtype: int64

In [17]:
ebola.shape # this returns a tuple with the number of rows and the number of columns

(122, 18)

In [18]:
# Number of rows (observations) in the ebola table.
number_of_items = len(ebola)
number_of_items

122

In [19]:
ebola.size # this returns the total number of cells in the table (rows * columns)

2196

In [20]:
number_of_items - ebola.count() # total rows minus non-missing values gives the number of missing values in each column

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            105
dtype: int64

In [22]:
np.count_nonzero(ebola.isnull()) # this counts the total number of missing values in the whole dataset

1209

In [23]:
np.count_nonzero(ebola["Deaths_Mali"].isnull()) # this counts the number of missing values within the Deaths_Mali column

105

In [106]:
# value_counts() on a Series returns how often each value occurs;
# dropna=False keeps NaN as its own category so the missing values are counted too

ebola["Cases_Guinea"].value_counts(dropna=False).head()

NaN      29
86.0      3
495.0     2
112.0     2
390.0     2
Name: Cases_Guinea, dtype: int64

# DEALING WITH MISSING VALUES

In [26]:
# Preview the first five rows; the empty cells in the output are missing values.
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,0.0
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,0.0
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,0.0
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,0.0


In [34]:
# Replace every missing value with 0: row 3 of Cases_Guinea becomes 0.0 (see output).
ebola.fillna(0)[["Date","Cases_Guinea"]].head(10)

Unnamed: 0,Date,Cases_Guinea
0,1/5/2015,2776.0
1,1/4/2015,2775.0
2,1/3/2015,2769.0
3,1/2/2015,0.0
4,12/31/2014,2730.0
5,12/28/2014,2706.0
6,12/27/2014,2695.0
7,12/24/2014,2630.0
8,12/21/2014,2597.0
9,12/20/2014,2571.0


In [36]:
# Backward fill: each missing value takes the next valid value below it in the same column
# (row 3 of Cases_Guinea becomes 2730.0, the value of row 4).
# DataFrame.bfill() replaces fillna(method="bfill"), which is deprecated in recent pandas.
ebola.bfill()[["Date","Cases_Guinea"]].head(10)

Unnamed: 0,Date,Cases_Guinea
0,1/5/2015,2776.0
1,1/4/2015,2775.0
2,1/3/2015,2769.0
3,1/2/2015,2730.0
4,12/31/2014,2730.0
5,12/28/2014,2706.0
6,12/27/2014,2695.0
7,12/24/2014,2630.0
8,12/21/2014,2597.0
9,12/20/2014,2571.0


In [39]:
# Backward fill cannot repair gaps at the end of a column: the last rows keep NaN
# wherever no later value exists to copy from (see output).
# DataFrame.bfill() replaces fillna(method="bfill"), which is deprecated in recent pandas.
ebola.bfill().tail(10)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
112,4/4/2014,13,143.0,18.0,2.0,,,,,,86.0,7.0,2.0,,,,,
113,4/1/2014,10,127.0,8.0,2.0,,,,,,83.0,5.0,2.0,,,,,
114,3/31/2014,9,122.0,8.0,2.0,,,,,,80.0,4.0,2.0,,,,,
115,3/29/2014,7,112.0,7.0,2.0,,,,,,70.0,2.0,2.0,,,,,
116,3/28/2014,6,112.0,3.0,2.0,,,,,,70.0,3.0,2.0,,,,,
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,
121,3/22/2014,0,49.0,,,,,,,,29.0,,,,,,,


In [37]:
# interpolate() estimates missing values from their neighbours: in the output, row 3 of
# Cases_Guinea becomes 2749.5, halfway between 2769.0 (row 2) and 2730.0 (row 4).
# NOTE(review): this is called on the whole frame, including the text column Date —
# confirm it still runs without warnings on the pandas version in use.
ebola.interpolate()[["Date","Cases_Guinea"]].head()

Unnamed: 0,Date,Cases_Guinea
0,1/5/2015,2776.0
1,1/4/2015,2775.0
2,1/3/2015,2769.0
3,1/2/2015,2749.5
4,12/31/2014,2730.0


In [42]:
# dropna() with default arguments removes every row that has at least one missing value;
# only row 19 is complete in every column (see output).
ebola.dropna()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
19,11/18/2014,241,2047.0,7082.0,6190.0,20.0,1.0,4.0,1.0,6.0,1214.0,2963.0,1267.0,8.0,0.0,1.0,0.0,6.0


In [43]:
# Drop every COLUMN that contains at least one missing value; only Date and Day are
# fully populated, which is what the recorded output shows. A plain dropna() would
# drop rows instead and repeat the previous cell.
ebola.dropna(axis="columns")

Unnamed: 0,Date,Day
0,1/5/2015,289
1,1/4/2015,288
2,1/3/2015,287
3,1/2/2015,286
4,12/31/2014,284
...,...,...
117,3/27/2014,5
118,3/26/2014,4
119,3/25/2014,3
120,3/24/2014,2


In [47]:
# sum() skips missing values: the column contains NaN, yet the total is a number, not NaN.
ebola["Cases_Guinea"].sum()

84729.0

In [75]:
# Row-wise total of the three case columns; each column has its missing values
# replaced by 0 first so that a single NaN does not turn the whole total into NaN.
ebola["calculated"] = (
    ebola["Cases_Guinea"].fillna(0)
    + ebola["Cases_Liberia"].fillna(0)
    + ebola["Cases_SierraLeone"].fillna(0)
)

In [62]:
# Row-wise total of the three case columns. sum() skips NaN by default, so this gives the
# same totals as adding the fillna(0) columns in the previous cell.
# axis=1 is required: without it sum() collapses each column to one number indexed by
# column name, which does not align with the row index and fills "calculated" with NaN.
ebola["calculated"] = ebola[["Cases_Guinea", "Cases_Liberia", "Cases_SierraLeone"]].sum(axis=1)

In [76]:
# Show the three input columns next to the row total.
# NOTE(review): the recorded output shows 0.0 in Cases_Liberia where ebola.head() earlier
# showed missing values, so it was presumably produced from a differently-filled frame — confirm on re-run.
ebola[["Cases_Guinea", "Cases_Liberia", "Cases_SierraLeone","calculated"]]

Unnamed: 0,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,calculated
0,2776.0,0.0,10030.0,12806.0
1,2775.0,0.0,9780.0,12555.0
2,2769.0,8166.0,9722.0,20657.0
3,,8157.0,,8157.0
4,2730.0,8115.0,9633.0,20478.0
...,...,...,...,...
117,103.0,8.0,6.0,117.0
118,86.0,0.0,,86.0
119,86.0,0.0,,86.0
120,86.0,0.0,,86.0
