# Cleaning data for analysis
Converting data types

In [17]:
import pandas as pd
import numpy as np

In [10]:
tips = pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [11]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null object
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB


In [12]:
tips.sex = tips.sex.astype('category')
tips.smoker = tips.smoker.astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 10.3+ KB


Working with numeric data

In [15]:
tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce')
tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce')

### Using functions to clean data
Custom functions to clean data  
- define function
- apply the function to the sex column

In [18]:
def recode_sex(sex_value):
    if sex_value == 'Male':
        return 1
    elif sex_value == 'Female':
        return 0
    else:
        return np.NaN

In [19]:
tips['sex_recode'] = tips.sex.apply(recode_sex)

In [20]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_recode
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0


### Lambda functions  
Prepare data

In [27]:
tips['total_dollar'] = '$' + tips.total_bill.astype(str)

In [28]:
import re

In [29]:
tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', ''))

In [30]:
tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall('\d+\.\d+', x)[0])

In [31]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_recode,total_dollar,total_dollar_replace,total_dollar_re
0,16.99,1.01,Female,No,Sun,Dinner,2,0,$16.99,16.99,16.99
1,10.34,1.66,Male,No,Sun,Dinner,3,1,$10.34,10.34,10.34
2,21.01,3.5,Male,No,Sun,Dinner,3,1,$21.01,21.01,21.01
3,23.68,3.31,Male,No,Sun,Dinner,2,1,$23.68,23.68,23.68
4,24.59,3.61,Female,No,Sun,Dinner,4,0,$24.59,24.59,24.59


### Duplicate and missing data
Dropping duplicate data

```python
tracks_no_dup = tracks.drop_duplicates()
```

Filling missing data
- calculate the mean of ozone column
- replace all missing with the mean

In [34]:
airquality = pd.read_csv('airquality.csv')

In [35]:
airquality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      116 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB


In [36]:
oz_mean = airquality['Ozone'].mean()

In [37]:
airquality['Ozone'] = airquality['Ozone'].fillna(oz_mean)

In [38]:
airquality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      153 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB


### Testing with asserts


In [40]:
ebola = pd.read_csv('ebola.csv')
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [43]:
pd.notnull(ebola)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,True,True,True,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False
1,True,True,True,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False
2,True,True,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False
3,True,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False
4,True,True,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False
5,True,True,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False
6,True,True,True,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False
7,True,True,True,True,True,False,False,False,False,False,False,True,True,False,False,False,False,False
8,True,True,True,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False
9,True,True,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False


In [44]:
pd.notnull(ebola).all()

Date                    True
Day                     True
Cases_Guinea           False
Cases_Liberia          False
Cases_SierraLeone      False
Cases_Nigeria          False
Cases_Senegal          False
Cases_UnitedStates     False
Cases_Spain            False
Cases_Mali             False
Deaths_Guinea          False
Deaths_Liberia         False
Deaths_SierraLeone     False
Deaths_Nigeria         False
Deaths_Senegal         False
Deaths_UnitedStates    False
Deaths_Spain           False
Deaths_Mali            False
dtype: bool

In [45]:
pd.notnull(ebola).all().all()

False

In [48]:
# No errors only if True
assert pd.notnull(ebola).all().all()

AssertionError: 

In [49]:
assert (ebola >=0).all().all()

AssertionError: 