In [12]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Analysis with Python III - Validating and Cleaning Data

<!-- PELICAN_BEGIN_SUMMARY -->
A favorite old saying of mine about data quality is "Garbage In, Garbage Out".
Your analysis is only ever as good as your dataset.

Pandas is a popular Python library used for data science and analysis. Used in conjunction with other data science toolsets like SciPy, NumPy, and Matplotlib,
a modeler can create end-to-end analytic workflows to solve business problems.

<br>My objective here is to go through some examples of how to perform basic data validation.
<!-- PELICAN_END_SUMMARY -->

**Goals: How to get a clean and valid dataset?**
<br>The sample file contains sales records, with more than 300,000 rows.
<br>I plan to demonstrate how to validate data using a "dirty" dataset.
<br>Specifically, how to 
- detect/eliminate outlier data
- deal with missing data
- add default values
- detect/eliminate duplicate data
- remove incomplete data/rows
- deal with error-prone columns
- normalize data types
- make bulk corrections to string/number values where appropriate
- change casing
- rename columns
- save clean data to CSV or Excel file

**Detect/eliminate outlier data**

In [13]:
## Load the raw sales records and check out the basic structure of the data
## (371,707 rows and 14 columns in the run recorded below)
df = pd.read_csv('data/analysis3/SalesRecords.csv')
df.head(4)
df.shape

## Look at some basic stats for the 'OrderPriority' column
df['OrderPriority'].describe()

## Outlier check: select the rows whose total profit is greater than $1.8M
df.loc[df['TotalProfit'] > 1800000]

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,,,,,,,,,,,,
2,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
3,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


(371707, 14)

count     371706
unique         4
top            C
freq       93091
Name: OrderPriority, dtype: object

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit


**Normalize data types**
- When reading in a CSV with a bunch of numbers, some of the numbers will read in as strings instead of numeric values, or vice versa

In [14]:
## Tell pandas to parse the 'TotalProfit' column as float
## (real numbers, written with a decimal point).
df = pd.read_csv('data/analysis3/SalesRecords.csv', dtype={'TotalProfit': float})
df['TotalProfit'].describe()

## Tell pandas to parse the 'OrderID' column as a string rather than a number.
## This second read replaces df, so only the OrderID override applies from here on.
df = pd.read_csv('data/analysis3/SalesRecords.csv', dtype={'OrderID': str})
df['OrderID'].describe()

count    3.717050e+05
mean     3.924652e+05
std      3.785990e+05
min      2.410000e+00
25%      9.544770e+04
50%      2.819860e+05
75%      5.657079e+05
max      1.738700e+06
Name: TotalProfit, dtype: float64

count        371706
unique       371699
top       198927056
freq              3
Name: OrderID, dtype: object

**Add in a default value for the missing data**

In [15]:
## Add a default value for missing data:
## replace every NaN in the 'Country' column with the string "Missing".
## The result is assigned back, so df itself is modified.
df['Country'] = df['Country'].fillna('Missing')
df.head(4)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,
2,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
3,940995585.0,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


**Delete the rows that have missing data**

In [16]:
## Drop every row that contains at least one NA value
df.dropna(how='any').head(2)

## Drop only the rows in which every value is NA.
## Row 1 is kept here because its Country now holds "Missing" instead of NaN;
## without that replacement it would have been dropped.
df.dropna(how='all').head(2)

## Require a minimum number of non-null values (here 5) for a row to be kept
df.dropna(thresh=5).head(2)

## None of the calls above is assigned back, so the original dataframe is unchanged
df.shape

## Drop the rows whose 'SalesChannel' value is NaN
df.dropna(subset=['SalesChannel']).head(2)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
4,174590194,Europe,Slovakia,Beverages,Offline,L,10/26/2016,12/4/2016,3973.0,47.45,31.79,188518.85,126301.67,62217.18
6,425793445,Sub-Saharan Africa,Seychelles,Beverages,Online,M,1/18/2013,2/16/2013,597.0,47.45,31.79,28327.65,18978.63,9349.02


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
2,940995585,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0


(371707, 14)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
4,174590194,Europe,Slovakia,Beverages,Offline,L,10/26/2016,12/4/2016,3973.0,47.45,31.79,188518.85,126301.67,62217.18
5,174590194,Asia,Sri Lanka,,Online,L,11/7/2011,12/18/2011,1379.0,9.33,6.92,12866.07,9542.68,3323.39


**Delete the columns that have a high incidence of missing data**

In [17]:
## Drop the columns in which every value is NA
df.dropna(axis='columns', how='all').head(2)

## Drop every column that contains at least one NA value -
## "Country" is the only column without any NA values
df.dropna(axis='columns', how='any').head(2)

Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
0,443368995.0,Sub-Saharan Africa,South Africa,,,M,7/27/2012,7/28/2012,1593.0,9.33,6.92,14862.69,11023.56,3839.13
1,,,Missing,,,,,,,,,,,


Unnamed: 0,Country
0,South Africa
1,Missing


**Detect/eliminate duplicate data**

In [18]:
## Detect duplicated data (nothing is deleted in this cell).
## duplicated() with no arguments compares all columns and flags each row that
## repeats an earlier row, so df_dup holds the second and later occurrences.
df_dup = df[df.duplicated()]
## The count is of fully duplicated rows, not of repeated OrderID values:
## two rows can share an OrderID and still differ in other columns.
print("There are %s duplicate rows" % (df_dup.shape[0]))
df_dup.head()

There are 5 duplicate order IDs


Unnamed: 0,OrderID,Region,Country,ItemType,SalesChannel,OrderPriority,OrderDate,Ship Date,UnitsSold,UnitPrice,UnitCost,TotalRevenue,TotalCost,TotalProfit
3,940995585,Australia and Oceania,Papua New Guinea,Meat,,M,5/15/2015,6/4/2015,360.0,,364.69,151880.4,131288.4,20592.0
9,601245963,Sub-Saharan Africa,Ghana,Office Supplies,Online,L,3/23/2017,4/15/2017,896.0,651.21,524.96,583484.16,470364.16,113120.0
18,807785928,Sub-Saharan Africa,Ethiopia,Cosmetics,Online,M,7/7/2011,7/25/2011,662.0,437.2,263.33,289426.4,174324.46,115101.94
26,198927056,Asia,China,Office Supplies,Online,M,2/10/2016,3/29/2016,5791.0,651.21,524.96,3771157.11,3040043.36,731113.75
27,198927056,Asia,China,Office Supplies,Online,M,2/10/2016,3/29/2016,5791.0,651.21,524.96,3771157.11,3040043.36,731113.75


In [19]:
## De-duplicate the flagged rows so that each repeated record is listed only once.
## NOTE(review): this operates on df_dup only; df itself still contains its duplicates.
df_dup_list = df_dup.drop_duplicates(keep='first')

**Change column to uppercase and remove trailing whitespace**

In [20]:
## Convert the 'Region' values to uppercase (displayed only; df is not modified)
df['Region'].str.upper().tail(3)


## Strip leading and trailing whitespace from 'Country' (displayed only; df is not modified)
df['Country'].str.strip().tail(3)

371704         MIDDLE EAST AND NORTH AFRICA
371705    CENTRAL AMERICA AND THE CARIBBEAN
371706    CENTRAL AMERICA AND THE CARIBBEAN
Name: Region, dtype: object

371704     Algeria
371705    Dominica
371706      Panama
Name: Country, dtype: object

**Rename column**

In [21]:
## Rename 'Region' to 'Continent' and 'TotalProfit' to 'Net Income'.
## The renamed frame is stored in df_rename; df keeps its original column names.
df_rename = df.rename(columns={'Region': 'Continent', 'TotalProfit': 'Net Income'})

In [11]:
# Save the renamed data to a CSV file.
# NOTE(review): the export is commented out, so running the notebook writes no file;
# uncomment the line below to produce it. to_csv also writes the row index as the
# first column unless index=False is passed - confirm which is wanted.
# df_rename.to_csv('data/analysis3/SalesRecordsCleanRename.csv')