In [None]:
import pandas as pd
import numpy as np

# Directory that holds the cleaned CSV exports. The trailing slash is required
# because the file names below are appended by plain string concatenation;
# without it the resulting path would be 'raw_data/cleanorderlines_cl.csv'.
path = 'raw_data/clean/'
orderlines = pd.read_csv(path + 'orderlines_cl.csv')
orders = pd.read_csv(path + 'orders_cl.csv')
products = pd.read_csv(path + 'products_cl.csv')
brands = pd.read_csv(path + 'brands_cl.csv')

def start_pipeline(data):
    """Return a copy of ``data`` to use as the first step of a ``.pipe`` chain.

    The copy is produced by ``data.copy()``, so later pipeline steps work on
    the returned object instead of the one that was passed in.
    """
    working_copy = data.copy()
    return working_copy

def drop_column(data, column_name):
    """Return ``data`` without the column(s) given by ``column_name``.

    ``column_name`` is handed to ``DataFrame.drop`` as the column label(s) to
    remove; the result of that call is returned and ``data`` itself is not
    modified.
    """
    without_column = data.drop(columns=column_name)
    return without_column

## Remove Outliers from Orders and Orderlines based on price differences

- Uses filtered dataframes orderlines_filtered and orders_filtered so process those pipelines first

- Function filter_outliers: checks, per order, the difference between total_paid and total_price (the sum of its orderlines) and returns a list of the order ids whose price difference is plausible; this list is then used in the next step to filter both orders and orderlines
    - Price difference are likely to come from shipping costs
    - Shipping costs e.g. in Spain for individuals with GLS are at most 105€ to the Balearics
    - Some of the worst outliers are in state "Shopping Basket" and not relevant as we are only interested in the "Completed" orders
    - Price differences between 0 and 105€ are plausible and would contain 98.7% of the completed orders
    - Looking at the distribution of the price difference, 25% (Q1) of the values lie between -0.01 and 0, probably because of rounding problems when subtracting the prices - so it would be good to include those, because their prices only fail to match due to the calculation within Python
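    - The plausible range is therefore -0.01€ (rounding tolerance, see above) to 105€; everything outside of it is treated as an outlier (note: the filter in the code below currently uses 0 to 105€)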
    
- remove_outliers: filter dataframe based on ids in list and only keep those rows
- save final dataframes as new cleaned csv files

In [None]:
def filter_outliers(data_o, data_ol, diff_min=0, diff_max=105):
    """Return the ids of orders whose price difference is within a plausible range.

    For every order the line totals (``unit_price * product_quantity``) of its
    orderlines are summed and compared with the order's ``total_paid``. The
    difference ``total_paid - total_price`` is rounded to 2 decimals and the
    order is kept when it lies in ``[diff_min, diff_max]`` (both inclusive).

    Parameters
    ----------
    data_o : pandas.DataFrame
        Orders; must contain the columns ``id_order`` and ``total_paid``.
    data_ol : pandas.DataFrame
        Orderlines; must contain the columns ``id_order``, ``unit_price`` and
        ``product_quantity``. It is not modified by this function.
    diff_min : float, default 0
        Smallest accepted price difference. A small negative value (e.g.
        ``-0.01``) can be passed to tolerate rounding differences.
    diff_max : float, default 105
        Largest accepted price difference.

    Returns
    -------
    list
        ``id_order`` values of the orders that are present in both inputs and
        whose price difference is inside the accepted range.
    """
    # Compute the line totals as a separate Series so that no helper column
    # is written into the caller's orderlines dataframe.
    line_totals = data_ol['unit_price'] * data_ol['product_quantity']
    order_totals = (
        line_totals
            .groupby(data_ol['id_order'])
            .sum()
            .rename('total_price')
            .reset_index()
    )

    # Only the two needed order columns take part in the merge, so a column
    # name clash with 'total_price' cannot occur.
    price_info = order_totals.merge(
        data_o[['id_order', 'total_paid']], how='inner', on='id_order'
    )

    price_difference = (
        price_info['total_paid'] - price_info['total_price']
    ).round(2)

    # between() is inclusive on both ends and evaluates to False for NaN,
    # which matches a '>= diff_min and <= diff_max' comparison.
    plausible = price_difference.between(diff_min, diff_max)
    return price_info.loc[plausible, 'id_order'].tolist()

def remove_outliers(data, id_list):
    """Keep only the rows of ``data`` whose ``id_order`` is contained in ``id_list``.

    ``data`` is filtered with ``DataFrame.query``; the filtered result is
    returned and ``data`` itself is left as it is.
    """
    # '@id_list' lets query() read the local variable of this function.
    rows_to_keep = data.query('id_order == @id_list')
    return rows_to_keep

In [None]:
# Copy the orders first, then keep only the rows whose id_order is in the
# list returned by filter_outliers.
orders_clean = remove_outliers(
    start_pipeline(orders),
    id_list=filter_outliers(orders, orderlines),
)

In [None]:
# Copy the orderlines first, then keep only the rows whose id_order is in the
# list returned by filter_outliers.
orderlines_clean = remove_outliers(
    start_pipeline(orderlines),
    id_list=filter_outliers(orders, orderlines),
)