In [18]:
# AIG150 Winter 2024
# Week 4 Sample Code 
# Asma M Paracha
# Working with missing data

In [19]:
# pandas supplies the CSV reader and DataFrame tools used throughout;
# numpy's nan is imported so later cells can build missing values by hand.
import pandas as pd
from numpy import nan
# Read the visit records with read_csv's default missing-value handling.
visited = pd.read_csv('survey_visited.csv')

In [20]:
# Row 5 has no 'dated' entry; with the default settings read_csv
# loads that field as NaN.
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [21]:
# Positional lookup of the missing cell: row 5, column 2 ('dated')
print(visited.iloc[5,2])

nan


In [22]:
# Test for a missing value with pandas' own checks.
# Avoid `value is nan` and `value == nan`: the identity test only succeeds when
# the object happens to be numpy's nan singleton, and NaN never compares equal
# to anything, including itself.
missing_date = visited.iloc[5, 2]
print(pd.isna(missing_date))
print(pd.isnull(missing_date))  # isnull is an alias of isna

True
True


In [23]:
# keep_default_na=False turns off pandas' built-in list of NaN markers, so the
# blank 'dated' field is kept as an empty string instead of becoming NaN
visited = pd.read_csv(
    'survey_visited.csv',
    keep_default_na=False,
)
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3            
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [24]:
# Disable the default NaN markers and declare the empty string as the only
# marker, so just the blank 'dated' field is loaded as NaN
visited = pd.read_csv(
    'survey_visited.csv',
    keep_default_na=False,
    na_values=[""],
)
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [25]:
# na_filter=False skips missing-value detection altogether, so the blank
# 'dated' field stays an empty string
visited = pd.read_csv(
    'survey_visited.csv',
    na_filter=False,
)
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3            
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [26]:
# Missing values can also appear when two tables are merged.
# Load both tables here so they can be joined afterwards.
# NOTE(review): the saved output for this cell is a FileNotFoundError for
# 'survey_survey.csv' — confirm the file sits next to the notebook and re-run.
survey = pd.read_csv('survey_survey.csv')
visited = pd.read_csv('survey_visited.csv')
print(survey)
print(visited)

FileNotFoundError: [Errno 2] No such file or directory: 'survey_survey.csv'

In [16]:
# Merge the two DataFrames, pairing visited.ident with survey.taken.
# The merged result contains missing values (see the 'dated' and 'person' columns).
vs = pd.merge(visited, survey, left_on='ident', right_on='taken')
print(vs)

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR

In [4]:
# Missing values due to reindexing.
# Start from the mean life expectancy for each year in the gapminder data.
gapminder = pd.read_csv('gapminder.tsv', sep='\t')
life_exp = (
    gapminder
    .groupby('year')['lifeExp']
    .mean()
)
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [None]:
# Take the subset of the data for years after 2000 and then re-index it

In [5]:
# Keep only the entries whose year (the index) is greater than 2000
y2000 = life_exp.loc[life_exp.index > 2000]
print(y2000)

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [6]:
# Reindex onto every year from 2000 through 2009 (range's upper bound is exclusive);
# years that are absent from y2000 are introduced as missing values.
print(y2000.reindex(range(2000, 2010)))

year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64


In [19]:
# Missing values can also be entered by hand:
# build a Series in which one of the entries is nan
from numpy import nan
num_legs = pd.Series([4, nan], index=['goat', 'amoeba'])
print(num_legs)

goat      4.0
amoeba    NaN
dtype: float64


In [8]:
# Load the ebola time series and report, column by column,
# how many entries are not missing
ebola = pd.read_csv('country_timeseries.csv')
non_missing_per_column = ebola.count(axis=0)
print(non_missing_per_column)

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64


In [8]:
# The number of missing values per column is the total number of rows minus
# the per-column count of non-missing entries.
num_rows = ebola.shape[0]
print(num_rows)  # total rows; the saved output for this cell starts with this value
num_missing = num_rows - ebola.count()
print(num_missing)

122
Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64


In [25]:
# Another way to count the number of missing values:
# isnull() flags every missing cell as True, and count_nonzero tallies the
# True entries across the whole DataFrame.
import numpy as np
#print(ebola.isnull())

print(np.count_nonzero(ebola.isnull()))

1214


In [11]:
# Number of missing values in a single column: summing the boolean mask
# counts its True entries
print(ebola['Cases_Guinea'].isnull().sum())

29


In [25]:
# Clean missing values:
# replace every NaN with 0, then show only the first 5 columns
print(ebola.fillna(value=0).iloc[:, :5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            0.0            10030.0
1      1/4/2015  288        2775.0            0.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286           0.0         8157.0                0.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            0.0                0.0
119   3/25/2014    3          86.0            0.0                0.0
120   3/24/2014    2          86.0            0.0                0.0
121   3/22/2014    0          49.0            0.0                0.0

[122 rows x 5 columns]


In [9]:
# Forward fill: each missing value is replaced by the last known value from
# the rows above it. Use .bfill() for a backward fill instead.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# pandas 2.1 and later.
# NOTE(review): the saved output under this cell looks like a backward fill
# (row 3 takes row 4's values) — re-run the cell to refresh it.
print(ebola.ffill().iloc[:, 0:5])


# Check the interpolate() method also

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0         8166.0            10030.0
1      1/4/2015  288        2775.0         8166.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2730.0         8157.0             9633.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            NaN                NaN
119   3/25/2014    3          86.0            NaN                NaN
120   3/24/2014    2          86.0            NaN                NaN
121   3/22/2014    0          49.0            NaN                NaN

[122 rows x 5 columns]


In [27]:
# Drop every row that contains at least one missing value and compare the
# shape before and after.
# See the dropna documentation for the other values "how" accepts.
print(ebola.shape)
ebola_dropna = ebola.dropna(axis=0, how='any')
print(ebola_dropna.shape)

(122, 18)
(1, 18)


In [28]:
# Show what is left after dropna(): only rows with a value in every column
print(ebola_dropna)

          Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone  \
19  11/18/2014  241        2047.0         7082.0             6190.0   

    Cases_Nigeria  Cases_Senegal  Cases_UnitedStates  Cases_Spain  Cases_Mali  \
19           20.0            1.0                 4.0          1.0         6.0   

    Deaths_Guinea  Deaths_Liberia  Deaths_SierraLeone  Deaths_Nigeria  \
19         1214.0          2963.0              1267.0             8.0   

    Deaths_Senegal  Deaths_UnitedStates  Deaths_Spain  Deaths_Mali  
19             0.0                  1.0           0.0          6.0  


In [None]:
# Calculations involving missing values result in NaN
# The built-in sum and mean methods ignore NaN by default
# Use the skipna parameter (skipna=True is the default) to control this in your calculations

In [27]:
# Element-wise arithmetic propagates missing values: any position where df1
# holds NaN is NaN in the product as well.
# NOTE(review): the saved output under this cell is a 3-entry Series, not a
# 2x3 product — it looks like it came from a different statement; re-run to refresh.
dset = [
    (1, nan, 2),
    (3, nan, nan),
]
df1 = pd.DataFrame(dset)
dset = [
    (4, 5, 6),
    (8, 9, 10),
]
df2 = pd.DataFrame(dset)
print(df1 * df2)



0    4.0
1    NaN
2    NaN
dtype: float64


In [None]:
# Column totals of df1; sum() skips NaN entries by default
print(df1.sum())