### While exploring the data in the previous notebook (see extract-explore-data.ipynb), the following issues were identified:

- 5270 rows of duplicated data - remove duplicated rows and keep just one instance
- missing customer id
- invoice_date field has a wrong data type
- 22 stock codes without a customer_id do not have a description (may need further investigation)
- 10k invoices with a negative quantity - consider removing these rows
- invoices with negative price and missing customer id
- 47 examples of the description being a ?, which will be replaced with Unknown

In [1]:
# import libraries

import os
import warnings
from io import StringIO

import boto3
import pandas as pd
import psycopg2

warnings.filterwarnings("ignore")

In [2]:
# Connect to Redshift
#
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook. Export REDSHIFT_DBNAME,
# REDSHIFT_HOST, REDSHIFT_USER and REDSHIFT_PASSWORD (and optionally
# REDSHIFT_PORT) before starting the kernel; a missing required variable
# raises KeyError here rather than failing later inside the driver.

dbname = os.environ["REDSHIFT_DBNAME"]
host = os.environ["REDSHIFT_HOST"]
port = os.environ.get("REDSHIFT_PORT", "5439")  # 5439 is the default Redshift port
user = os.environ["REDSHIFT_USER"]
password = os.environ["REDSHIFT_PASSWORD"]

connect = psycopg2.connect(dbname=dbname, host=host, port=port, user=user, password=password)
cursor = connect.cursor()

In [3]:
# Build the extraction query:
# - Select every column from online_transactions plus the description from
#   stock_description (left join on stock_code).
# - Keep only rows where customer_id is not the empty string ''.
# - Exclude the stock codes BANK CHARGES, POSTAGE, D, M and CRUK.
# - Keep only rows with a positive quantity (this drops negative and zero quantities).
# - Replace a description that is '?' or NULL with 'Unknown'.

query = '''select o.*,
                  case when s.description = '?' or s.description is null then 'Unknown' else s.description end as description
            from bootcamp1.online_transactions o
            left join bootcamp1.stock_description s 
            on o.stock_code = s.stock_code
            where o.customer_id <> ''
            and o.quantity > 0
            and o.stock_code not in ('BANK CHARGES', 'POSTAGE', 'D', 'M', 'CRUK')
           '''

In [4]:
# Run the query over the open connection and load the result set into a DataFrame
online_trans_clean = pd.read_sql(sql=query, con=connect)

In [5]:
# Display the loaded frame (the rendering below is truncated to the first and last rows)
online_trans_clean

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom,HAND WARMER UNION JACK
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,YELLOW COAT RACK PARIS FASHION
3,536367,22748,6,2010-12-01 08:34:00,2.10,u13047,United Kingdom,POPPY'S PLAYHOUSE KITCHEN
4,536367,22623,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,BOX OF VINTAGE JIGSAW BLOCKS
...,...,...,...,...,...,...,...,...
405130,581586,21217,24,2011-12-09 12:49:00,8.95,u13113,United Kingdom,RED RETROSPOT ROUND CAKE TINS
405131,581587,22555,12,2011-12-09 12:50:00,1.65,u1268,France,PLASTERS IN TIN STRONGMAN
405132,581587,22730,4,2011-12-09 12:50:00,3.75,u1268,France,ALARM CLOCK BAKELIKE IVORY
405133,581587,22613,12,2011-12-09 12:50:00,0.85,u1268,France,PACK OF 20 SPACEBOY NAPKINS


In [7]:
# Check for duplicates: count rows that repeat an earlier row exactly
# (duplicated() flags every occurrence after the first)

online_trans_clean.duplicated().sum()

5268

In [8]:
# Variant #1
# Drop duplicated rows, keeping only the first appearance of each duplicated row

online_trans_clean = online_trans_clean[~online_trans_clean.duplicated(keep='first')]

In [9]:
# Check for duplicates #2: re-count after dropping; the expected result is 0

online_trans_clean.duplicated().sum()

0

In [17]:
# Variant #2
# A reusable function can also be used to drop the duplicated data

def removing_duplicate_data(df):
    """Return ``df`` without fully duplicated rows, keeping the first occurrence.

    Prints how many duplicated rows were found, or that none were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A de-duplicated frame when duplicates exist, otherwise ``df`` itself.
    """
    # Count once instead of scanning the frame twice.
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        print('Number of duplicates found is ', n_duplicates)
        return df.drop_duplicates(keep = 'first')

    print('No duplicates found')
    # Always hand a frame back: falling through here would return None and
    # wipe out the caller's data on `df = removing_duplicate_data(df)`.
    return df

In [18]:
# Variant #2 in use: rebind the working frame to whatever the helper returns
online_trans_clean = removing_duplicate_data(online_trans_clean)

No duplicates found


In [10]:
# Check the data types and non-null counts of every column

online_trans_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399867 entries, 0 to 405134
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       399867 non-null  object 
 1   stock_code    399867 non-null  object 
 2   quantity      399867 non-null  int64  
 3   invoice_date  399867 non-null  object 
 4   price         399867 non-null  float64
 5   customer_id   399867 non-null  object 
 6   country       399867 non-null  object 
 7   description   399867 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 27.5+ MB


In [11]:
# Convert invoice_date from object to datetime.
# The values carry a time component (e.g. 2010-12-01 08:26:00), so the format
# has to include it: a date-only '%Y-%m-%d' leaves ' 08:26:00' unparsed, which
# pandas 2.0+ rejects with "unconverted data remains".

online_trans_clean["invoice_date"] = pd.to_datetime(online_trans_clean["invoice_date"], format = '%Y-%m-%d %H:%M:%S')

In [12]:
# Check the data types #2: invoice_date should now be reported as a datetime column

online_trans_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399867 entries, 0 to 405134
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice       399867 non-null  object        
 1   stock_code    399867 non-null  object        
 2   quantity      399867 non-null  int64         
 3   invoice_date  399867 non-null  datetime64[ns]
 4   price         399867 non-null  float64       
 5   customer_id   399867 non-null  object        
 6   country       399867 non-null  object        
 7   description   399867 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 27.5+ MB


In [14]:
# Summarise the description column: count, distinct values, most frequent value and its frequency
online_trans_clean["description"].describe()

count      399867
unique       3628
top       Unknown
freq         8585
Name: description, dtype: object

In [16]:
# Report the non-null count and dtype of the stock_code column
online_trans_clean["stock_code"].info()

<class 'pandas.core.series.Series'>
Int64Index: 399867 entries, 0 to 405134
Series name: stock_code
Non-Null Count   Dtype 
--------------   ----- 
399867 non-null  object
dtypes: object(1)
memory usage: 6.1+ MB
