# Pandas Data Cleaning Pipeline for Eniac's raw data

## Import Libraries, Read Data and Basic functions for all files/dataframes
- start_pipeline: start the pipeline and make a copy of the dataframe
- drop_column: drops a specified column by column_name
- safe_df: finally save dataframe to a new specified csv file as basis for further explorations


In [1]:
import pandas as pd
import numpy as np

path = 'raw_data/'
orderlines = pd.read_csv(path + 'orderlines.csv')
orders = pd.read_csv(path + 'orders.csv')
products = pd.read_csv(path + 'products.csv')
brands = pd.read_csv(path + 'brands.csv')

def start_pipeline(data): 
    return data.copy()

def drop_column(data, column_name):
    return data.drop(column_name, axis = 1)

def save_df(data, filename):
    data.to_csv('raw_data/clean/' + filename, index = False)

## Functions for orders
- set_types_orders: set dtypes for the orders dataframe - change create_date to type datetime
- common_orders: check order_id/id_orders in orders and orderlines and write orders that are in both dataframes into a list
- select_common_orders: use the list of common_orders to filter the orders dataframe and return a new dataframe and some info on filtered rows

In [2]:
def set_types_orders(data): 
    return data.assign(created_date = pd.to_datetime(data['created_date']))

def common_orders(orders, orderlines): 
    return (orders
            .assign(order_check = orders['order_id'].isin(orderlines['id_order']))
            .query('order_check == True')
            ['order_id'].tolist())

def select_common_orders(data, orders_list: list): 
    if 'order_id' in data.columns: 
        data.rename(columns={'order_id':'id_order'}, inplace=True)
    data_filtered = data.query('id_order == @orders_list')
    print(
        f"""
        Total {data_filtered.shape[0]} rows included over the original {data.shape[0]} rows.\n
        Kept the {round((data_filtered.shape[0] / data.shape[0])*100, 2)}% of the data.
        """)
    return data_filtered    

In [3]:
orders_cl = (
orders
    .pipe(start_pipeline)
    .pipe(set_types_orders)
    .dropna()
    .pipe(select_common_orders, orders_list=common_orders(orders, orderlines))
)


        Total 204691 rows included over the original 226904 rows.

        Kept the 90.21% of the data.
        


In [4]:
orders_cl

Unnamed: 0,id_order,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled
...,...,...,...,...
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket


## Functions for orderlines
- set_types_orderlines: set dtpe datetime for column date
- clean_unit_price: cleans the column unit_price from . as thousand separetor and changes dytpe to float (numeric)


In [5]:
def set_types_orderlines(data):
    return data.assign(date = pd.to_datetime(data['date']))

def clean_unit_price(data):
    data[['price_num', 'price_dec']] = data.unit_price.str.rsplit('.', n=1, expand = True)
    data['price_num_nd'] = data['price_num'].str.replace('.', '', regex=True)
    data['unit_price_new'] = data['price_num_nd'] + '.' + data['price_dec']
    data['unit_price'] = pd.to_numeric(data['unit_price_new'])
    data.drop(['price_num', 'price_dec', 'unit_price_new', 'price_num_nd'], axis = 1, inplace = True)
    return data

def column_total_price(data):
    data['total_price'] = data['product_quantity'] * data['unit_price']
    return data
                

In [6]:
orderlines_cl = (
orderlines
    .pipe(start_pipeline)
    .pipe(set_types_orderlines)
    .pipe(clean_unit_price)
    .pipe(column_total_price)
    .pipe(drop_column, ['product_id'])
)

In [7]:
orderlines_cl.sample(5)

Unnamed: 0,id,id_order,product_quantity,sku,unit_price,date,total_price
144932,1403135,424684,4,MAT0007,78.26,2017-11-10 15:35:26,313.04
26425,1177501,322797,1,TUC0308,24.99,2017-02-12 09:34:03,24.99
173007,1455813,447229,1,WAC0054,27.19,2017-11-27 09:36:47,27.19
7113,1138108,306079,2,TRK0007,29.99,2017-01-09 21:37:20,59.98
29740,1183809,325542,1,GRT0429,16.99,2017-02-17 09:15:27,16.99


## Functions for products

- copy_desc: there are 5 missing values in the description column, as 4 of them are also "Completed" I want to keep them and just copied the name to the desc to fill it
- drop the duplicates with drop_duplicates function based only on 'sku' column - sku should be unique in the table
- drop_column: drop columns 'in_stock', 'type' and 'promo_price' for convenience as we don't need them and there are also missing values in the type column and promo_price is messed up
- clean_price: clean the price column, convert it to float and fix missing values
    - extract corrupted prices (those have 3 decimals and 1 or more dots, hence 2 or more parts when splittet by dots
    - replace corrupted prices by missing values and convert column to numeric (float)
- replace_missing: we already had 46 missing values in the price column at the beginning and now added some more during cleaning the price column
    - strategy for replacing missing values is to replace them by the maximum unit_price for this sku


In [15]:
def copy_desc(data):
    sku_list = data.loc[data['desc'].isna()].sku.to_list()
    for s in sku_list:
        data['desc'] = data['name']
    return data    

def clean_price(data):
    #replace all missing values in price column with 000.000 to not run into problems with str. methods
    data = data.assign(price = lambda x: x['price'].replace(np.nan, '000.000'))
    #split price on dot, count dots and add a new column for each processing step
    data = data.assign(price_split = lambda x: x['price'].str.split('\.'), dots_count = lambda x: x['price'].str.count('\.'))
    #add columns need_check and len_split to filter corrupted prices
    need_check = []
    len_split = []
    for val in data['price_split']:    
        len_split.append(len(val))
        if len(val[-1]) > 2: 
            need_check.append(True)
        else:
            need_check.append(False)
    data = data.assign(need_check = need_check, len_split = len_split)  
    #assign nan (missing values) to corrupted prices (three decimals and more than 1 item in splitted price and 1 dot or more)
    #and convert to numeric
    data = data.assign(
        price = lambda x: pd.to_numeric(
            np.where((x['need_check']==True) & (x['len_split'] > 1) & (x['dots_count'] != 0), np.nan, x['price'])))
    #drop columns that are not needed anymore
    data.drop(['price_split','need_check','len_split','dots_count'], axis=1, inplace=True)
    return data

def replace_missing(data):
    #save sku with missing price values to a list
    sku_na_price = data.loc[data['price'].isna(), 'sku'].tolist()
    #find sku on orderlines and aggregate max unit_price for this sku
    sku_ol_max_price = (
    orderlines_cl
        .query("sku ==@sku_na_price")
        .groupby('sku')
        .agg({'unit_price':'max'})
        .rename(columns={'unit_price':'max_price_ol'})
        .reset_index()
    )
    data = data.merge(sku_ol_max_price, how = 'left', on = 'sku')
    return data
   

In [16]:
products_cl= (
products
    .pipe(start_pipeline)
    .pipe(copy_desc)
    .drop_duplicates('sku')
    .pipe(clean_price)
    .pipe(replace_missing)
    .pipe(drop_column, ['in_stock', 'type', 'promo_price'])
)

In [17]:
products_cl.head()

Unnamed: 0,sku,name,desc,price,max_price_ol
0,RAI0007,Silver Rain Design mStand Support,Silver Rain Design mStand Support,59.99,
1,APP0023,Apple Mac Keyboard Keypad Spanish,Apple Mac Keyboard Keypad Spanish,59.0,
2,APP0025,Mighty Mouse Apple Mouse for Mac,Mighty Mouse Apple Mouse for Mac,59.0,
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,Apple Dock to USB Cable iPhone and iPod white,25.0,
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,34.99,


## Brands data is ok and does not need cleaning

In [11]:
brands_cl = brands
brands_cl.head()

Unnamed: 0,short,long
0,8MO,8Mobility
1,ACM,Acme
2,ADN,Adonit
3,AII,Aiino
4,AKI,Akitio


## Save all clean files

In [19]:
save_df(orders_cl, 'orders_cl.csv')
save_df(orderlines_cl, 'orderlines_cl.csv')
save_df(products_cl, 'products_cl.csv')
save_df(brands_cl, 'brands_cl.csv')

In [18]:
products_cl.iloc[3472,:]

sku                                                       PAC1653
name            QNAP TS-253A | 16GB | 20TB (2x10TB) Seagate Ir...
desc            QNAP TS-253A | 16GB | 20TB (2x10TB) Seagate Ir...
price                                                         NaN
max_price_ol                                              1399.99
Name: 3472, dtype: object