# Pandas Data Cleaning Pipeline for Eniac's raw data

## Import Libraries, Read Data and Basic functions for all files/dataframes
- start_pipeline: start the pipeline and make a copy of the dataframe


In [188]:
import pandas as pd

path = 'raw_data/'
orderlines = pd.read_csv(path + 'orderlines.csv')
orders = pd.read_csv(path + 'orders.csv')
products = pd.read_csv(path + 'products.csv')
brands = pd.read_csv(path + 'brands.csv')

def start_pipeline(data): 
    return data.copy()

def drop_column(data, column_name):
    return data.drop(column_name, axis = 1)

## Functions for orders
- set_types_orders: set dtypes for the orders dataframe - change create_date to type datetime
- common_orders: check order_id/id_orders in orders and orderlines and write orders that are in both dataframes into a list
- select_common_orders: use the list of common_orders to filter the orders dataframe and return a new dataframe and some info on filtered rows

In [124]:
def set_types_orders(data): 
    return data.assign(created_date = pd.to_datetime(data['created_date']))

def common_orders(orders, orderlines): 
    return (orders
            .assign(order_check = orders['order_id'].isin(orderlines['id_order']))
            .query('order_check == True')
            ['order_id'].tolist())

def select_common_orders(data, orders_list: list): 
    if 'order_id' in data.columns: 
        data.rename(columns={'order_id':'id_order'}, inplace=True)
    data_filtered = data.query('id_order == @orders_list')
    print(
        f"""
        Total {data_filtered.shape[0]} rows included over the original {data.shape[0]} rows.\n
        Kept the {round((data_filtered.shape[0] / data.shape[0])*100, 2)}% of the data.
        """)
    return data_filtered    

In [125]:
orders_filtered = (
orders
    .pipe(start_pipeline)
    .pipe(set_types_orders)
    .dropna()
    .pipe(select_common_orders, orders_list=common_orders(orders, orderlines))
)


        Total 204691 rows included over the original 226904 rows.

        Kept the 90.21% of the data.
        


In [126]:
orders_filtered

Unnamed: 0,id_order,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled
...,...,...,...,...
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket


## Functions for orderlines
- set_types_orderlines: set dtpe datetime for column date
- clean_unit_price: cleans the column unit_price from . as thousand separetor and changes dytpe to float (numeric)


In [98]:
def set_types_orderlines(data):
    return data.assign(date = pd.to_datetime(data['date']))

def clean_unit_price(data):
    data[['price_num', 'price_dec']] = data.unit_price.str.rsplit('.', n=1, expand = True)
    data['price_num_nd'] = data['price_num'].str.replace('.', '', regex=True)
    data['unit_price_new'] = data['price_num_nd'] + '.' + data['price_dec']
    data['unit_price'] = pd.to_numeric(data['unit_price_new'])
    data.drop(['price_num', 'price_dec', 'unit_price_new', 'price_num_nd'], axis = 1, inplace = True)
    return data
                

In [99]:
orderlines_filtered = (
orderlines
    .pipe(start_pipeline)
    .pipe(set_types_orderlines)
    .pipe(clean_unit_price)
    .pipe(drop_column, ['product_id'])
)

In [100]:
orderlines_filtered.head()

Unnamed: 0,id,id_order,product_quantity,sku,unit_price,date
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38


## Remove Outliers from Orders and Orderlines based on price differences

- Uses filtered dataframes orderlines_filtered and orders_filtered so process those pipelines first

- Function filter_outliers: filter the orders with a price difference in total_price and total_paid that are not plausible and returns a list of orded ids that can then be removed from both orders and orderlines in the next step
    - Price difference are likely to come from shipping costs
    - Shipping costs e.g. in Spain for individuals with GLS are at most 105€ to the Balearics
    - Some of the worst outliers are in state "Shopping Basket" and not relevant as we are only interested in the "Completed" orders
    - Price differences between 0 and 105€ are plausible and would contain 98,7% of the completed orders
    - Therefore this range in price_differences is taken to remove outliers
    
- remove_outliers: filter dataframe based on ids in list and only keep those rows

In [170]:
#def filter_outliers(data_o, data_ol, diff_min=0, diff_max=105):
def filter_outliers(data_o, data_ol):
    data_ol['total_price'] = data_ol['unit_price'] * data_ol['product_quantity']
    price_info = (
    data_ol
        .groupby('id_order')
        .agg({'total_price':'sum'})
        .merge(data_o, how='inner', on='id_order')
        .copy()
    )
    price_info= (
    price_info
        .assign(price_difference = round(price_info['total_paid'] - price_info['total_price'], 2)) 
    )
    ids = (
    price_info
        .query('price_difference >= 0 & price_difference <= 105')
        ['id_order'].tolist()
    )
    return ids

def remove_outliers(data, id_list):
    return data.query('id_order == @id_list')

In [171]:
orders_clean = (
orders_filtered
    .pipe(start_pipeline)
    #.pipe(filter_outliers, orderlines_filtered)
    .pipe(remove_outliers, id_list = filter_outliers(orders_filtered, orderlines_filtered))
)

In [172]:
orderlines_clean = (
orderlines_filtered
    .pipe(start_pipeline)
    #.pipe(filter_outliers, orderlines_filtered)
    .pipe(remove_outliers, id_list = filter_outliers(orders_filtered, orderlines_filtered))
)

Following cells are just to check if outliers were removed

In [173]:
orders_clean.head()

Unnamed: 0,id_order,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled


In [174]:
orderlines_clean.head()

Unnamed: 0,id,id_order,product_quantity,sku,unit_price,date,total_price
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19,18.99
1,1119110,299540,1,LGE0043,399.0,2017-01-01 00:19:45,399.0
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57,474.05
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40,68.39
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38,23.74


In [177]:
orderlines_clean['id_order'].isin(filter_outliers(orders_filtered, orderlines_filtered)).sum()

288102

In [179]:
orderlines_clean.shape

(288102, 7)

In [180]:
orders_clean['id_order'].isin(filter_outliers(orders_filtered, orderlines_filtered)).sum()

202574

In [181]:
orders_clean.shape

(202574, 4)

## Functions for products

- copy_desc: there are 5 missing values in the description column, as 4 of them are also "Completed" I want to keep them and just copied the name to the desc to fill it
- drop the duplicates with drop_duplicates function
- drop_column: drop columns 'in_stock' and 'type' for convenience as we don't need them and there are also missing values in the type column
- drop missing values from price
    - there are only 46 missing values, that is really few
    - 43 of them are from one company - "Celly" (selling mainly iphone cases and covers) - there are no more products from this company (sku with CEL)
    - so I decided to drop them because there is probably some problem with this company
    - the other 3 nan prices are just neglectable
    
- clean_price: clean the price column from dots as thousand operators, calculate correct prices for some wrong prices (the ones with 3 decimals after the last dot - it seems that dividing them by 100 could fix it) and convert the column to float


In [210]:
def copy_desc(data):
    sku_list = data.loc[data['desc'].isna()].sku.to_list()
    for s in sku_list:
        data['desc'] = data['name']
    return data    

def clean_price(data):
    #first add decimals for all prices that don't have decimals
    data['price_new'] = data.price.apply(lambda x: x + '.00' if x.count('.') == 0 else x )
    #second rsplit decimals to be able to filter prices with 3 decimals, the others seem to be ok
    data[['price_num', 'price_dec']]=data.price_new.str.rsplit(pat='.', n=1, expand=True)
    #for prices with 3 decimals 
    products_prices = data.merge(orderlines_clean, on='sku', how='left')
    products_wrong = products_prices.loc[products_prices['price_dec'].str.len()==3]
    return products_wrong

In [211]:
products_filtered = (
products
    .pipe(start_pipeline)
    .pipe(copy_desc)
    .drop_duplicates()
    .pipe(drop_column, ['in_stock', 'type'])
    .dropna()
    .pipe(clean_price)
)

In [212]:
products_filtered.sample(5)

Unnamed: 0,sku,name,desc,price,promo_price,price_new,price_num,price_dec,id,id_order,product_quantity,unit_price,date,total_price
256868,APP2490,Apple iPhone 64GB X Silver,Apple iPhone 64GB X Silver,115.900.092,11.590.009,115.900.092,115.9,92,1508215.0,468193.0,1.0,1159.0,2017-12-20 18:23:20,1159.0
258017,APP2491,Apple iPhone X 256GB Space Gray,Apple iPhone X 256GB Space Gray,13.290.011,13.290.011,13.290.011,13.29,11,1639058.0,522738.0,1.0,1329.0,2018-03-07 13:31:50,1329.0
197004,REP0307,Load Connector Repair iPhone 6 Plus,Load Connector Repair iPhone 6 Plus,599.906,599.906,599.906,599.0,906,1244744.0,354298.0,1.0,59.99,2017-05-09 19:32:41,59.99
257112,APP2490,Apple iPhone 64GB X Silver,Apple iPhone 64GB X Silver,115.900.092,11.590.009,115.900.092,115.9,92,1531994.0,479414.0,1.0,1159.0,2018-01-02 11:43:40,1159.0
256676,APP2494,Apple TV 4K 64GB,Apple TV 4K 64GB,21.900.032,2.190.003,21.900.032,21.9,32,1603562.0,507246.0,1.0,219.0,2018-02-07 07:50:19,219.0


## Brands data is ok and does not need cleaning

In [104]:
brands.head()

Unnamed: 0,short,long
0,8MO,8Mobility
1,ACM,Acme
2,ADN,Adonit
3,AII,Aiino
4,AKI,Akitio
