In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Analysis with Python III - Validating and Cleaning Data

<!-- PELICAN_BEGIN_SUMMARY -->
A favorite old saying of mine with respect to data quality is "Garbage In and Garbage Out".
Your analysis is only ever as good as your dataset.

Pandas is a popular Python library used for data science and analysis. Used in conjunction with other data science toolsets like SciPy, NumPy, and Matplotlib,
a modeler can create end-to-end analytic workflows to solve business problems.

<br>My objective here is to go through some examples for how to perform basic data validation.
<!-- PELICAN_END_SUMMARY -->

**Goals: How to get a clean and valid dataset?**
<br>Sample file is a sales records with more than 1M rows
<br>I plan to demonstrate how to validate data using a "dirty" dataset.
<br>Specifically, how to 
- detect/eliminate outlier data
- deal with missing data
- add default values
- detect/eliminate duplicate data
- remove incomplete data/rows
- deal with error-prone columns
- normalize data types
- make bulk corrections to string/number values where appropriate
- change casing
- rename columns
- save clean data to CSV or Excel file

**Detect/eliminate outlier data**

In [2]:
## The file has 1M rows and 14 columns
## Checkout the basic structure of the data
df = pd.read_csv('data/analysis3/SalesRecords.csv')
df.head(4)
df.shape

## Look at the some basic stats for the ‘ItemType’ column
df.OrderPriority.describe()

## Select total profit greater than $1.8M
df[df['TotalProfit'] >1800000]

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,,,,,,,,,,,,
2,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
3,940995585.0,Australia and Oceania,Papua New Guinea,Meat,Offline,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


(1048575, 14)

count     1048574
unique          4
top             C
freq       262366
Name: OrderPriority, dtype: object

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
1045918,928084933.0,Europe,Cyprus,Cosmetics,Online,L,3/4/2013,3/8/2013,9811.0,437.2,263.33,4289369.2,2583530.63,1855838.57


**Normalized data type**
- When reading in a CSV with a bunch of numbers, some of the numbers will read in as strings instead of numeric values, or vice versa

In [16]:
## This tells Pandas that the column ‘TotalProfit’ needs to be a float (real numbers and are written with a decimal point) value.
df = pd.read_csv('data/analysis3/SalesRecords.csv', dtype={'TotalProfit': float})                                                          
df.TotalProfit.describe()


## This tells Pandas that the column ‘OrderID’ needs to be a string and not a number

df = pd.read_csv('data/analysis3/SalesRecords.csv', dtype={'OrderID': str})
df.OrderID.describe()

count    1.048572e+06
mean     3.923310e+05
std      3.788092e+05
min      2.410000e+00
25%      9.515282e+04
50%      2.811283e+05
75%      5.653050e+05
max      1.855839e+06
Name: TotalProfit, dtype: float64

count       1048573
unique       948568
top       910656154
freq              2
Name: OrderID, dtype: object

**Add in a default value for the missing data**

In [5]:
## Add in a default value for the missing data
## Detect missing data in Country columns and replaced "NaN" with "Missing"
df.Country = df.Country.fillna('Missing')
df.head(4)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,
2,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
3,940995585.0,Australia and Oceania,Papua New Guinea,Meat,Offline,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


**Delete the rows that have missing data**

In [6]:
## Drop all rows with any NA values
df.dropna().head(2)

## Drop rows that have all NA values
## Since we replace NA with "Missing", otherwise 2nd row should be dropped
df.dropna(how='all').head(2)

## Put a limitation on how many non-null values need to be in a row in order to keep it 
df.dropna(thresh=5).head(2)

## Origianal dataframe unchanged
df.shape

## Drop rows in SalesChannel column with NaN
df.dropna(subset=['SalesChannel']).head(2)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
4,174590194.0,Europe,Slovakia,Beverages,Offline,L,10/26/2016,12/4/2016,3973.0,47.45,31.79,188518.85,126301.67,62217.18
6,425793445.0,Sub-Saharan Africa,Seychelles,Beverages,Online,M,1/18/2013,2/16/2013,597.0,47.45,31.79,28327.65,18978.63,9349.02


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
2,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


(1048575, 14)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
3,940995585.0,Australia and Oceania,Papua New Guinea,Meat,Offline,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
4,174590194.0,Europe,Slovakia,Beverages,Offline,L,10/26/2016,12/4/2016,3973.0,47.45,31.79,188518.85,126301.67,62217.18


**Delete the columns that have a high incidence of missing data**

In [7]:
## Drop the columns with that are all NA values
df.dropna(axis=1, how='all').head(2)

## Drop all columns with any NA values - "Country" is the only column without any NA values
df.dropna(axis=1, how='any').head(2)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,


Unnamed: 0,Country
0,South Africa
1,Missing


**Detect/eliminate duplicate data**

In [8]:
## Delete duplicated data

df_dupes=df[df.duplicated()]
print("There are %s duplicate order IDs" % (df_dupes.shape[0]))  
df_dupes.head()

There are 50010 duplicate order IDs


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
9,601245963.0,Sub-Saharan Africa,Ghana,Office Supplies,Online,L,3/23/2017,4/15/2017,896.0,651.21,524.96,583484.16,470364.16,113120.0
747904,370419776.0,Central America and the Caribbean,Trinidad and Tobago,Personal Care,Offline,M,4/20/2015,5/4/2015,4298.0,81.73,56.67,351275.54,243567.66,107707.88
747907,667237317.0,Sub-Saharan Africa,Djibouti,Clothes,Offline,H,5/27/2018,6/27/2018,9204.0,109.28,35.84,1005813.12,329871.36,675941.76
747908,214140975.0,Sub-Saharan Africa,Tanzania,Cosmetics,Online,L,9/16/2016,10/19/2016,7928.0,437.2,263.33,3466121.6,2087680.24,1378441.36
747909,222868669.0,Australia and Oceania,Australia,Household,Offline,L,1/12/2011,2/23/2011,647.0,668.27,502.54,432370.69,325143.38,107227.31


In [9]:
## Drop duplicated data
df_clean=df_dupes.drop_duplicates()

df_clean.to_csv('data/analysis3/SalesRecordsClean.csv')

In [10]:
## The new clean dataset without duplicated ID is with 50010 rows
df = pd.read_csv('data/analysis3/SalesRecordsClean.csv')
df.shape

(50010, 15)

**Change column to uppercase and remove trailing whitespce**

In [135]:
##change all our movie titles to uppercase:
df['Region'].str.upper().tail(3)


##get rid of trailing whitespace
df['Country'].str.strip().tail(3)

1048572    SUB-SAHARAN AFRICA
1048573                EUROPE
1048574                EUROPE
Name: Region, dtype: object

1048572    South Africa
1048573         Denmark
1048574      San Marino
Name: Country, dtype: object

**Rename column**

In [139]:
df_rename = df.rename(columns = {'Region':'Continent', 'TotalProfit':'Net Income'})

In [143]:
df_rename.to_csv('data/analysis3/SalesRecordsRename.csv')