# Cleaning Eniac data - orders

In [1]:
import pandas as pd
from functools import wraps
import datetime as dt

In [2]:
# Load the three Eniac tables in one unpacking assignment (same files, same read order).
# NOTE(review): `orders` comes in with an 'Unnamed: 0' column (visible in the
# check_table output further down) -- presumably an index written by an earlier
# to_csv call; confirm and consider reading it with index_col=0.
orderlines, products, orders = (
    pd.read_csv('data/eniac/orderlines_clean.csv'),
    pd.read_csv('data/eniac/products.csv'),
    pd.read_csv('data/eniac/orders_without_outliers.csv'),
)

In [3]:
# Decorator that logs every step of the pipeline
def log_step(func):
    """Wrap a pipeline step so each call prints its name, result shape and duration.

    Parameters
    ----------
    func : callable
        The pipeline step to wrap. It is called with whatever positional and
        keyword arguments the wrapper receives.

    Returns
    -------
    callable
        A wrapper that returns exactly what ``func`` returns. ``functools.wraps``
        keeps the wrapped function's ``__name__`` and docstring.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # Not every result carries a ``shape``; report None instead of raising
        # after the step has already done its work (which would lose the result).
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape={shape} took {time_taken}s")
        return result
    return wrapper

In [4]:
# First pipeline step: work on a copy so the source DataFrame stays untouched
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` (via ``df.copy()``) to start a pipeline from."""
    working_copy = df.copy()
    return working_copy

## Orders info

In [5]:
# Function that prints the number of missing values per column
def check_missing_values(data):
    """Print a 'Missing values:' header followed by the per-column NaN counts of ``data``."""
    na_counts = data.isna().sum()
    print('Missing values:', str(na_counts), sep='\n')

# Function that prints the number of fully duplicated rows
def check_duplicates(data):
    """Print how many rows of ``data`` are flagged by ``data.duplicated()``."""
    duplicate_count = data.duplicated().sum()
    print('Duplicated rows: ', duplicate_count)

def check_table(df):
    """Print a quick health report for ``df``: missing values, duplicated rows, then ``df.info()``."""
    # Run the two printing checks in their original order before the info summary.
    for report in (check_missing_values, check_duplicates):
        report(df)
    df.info()

In [6]:
# Inspect the freshly loaded orders table: missing values, duplicated rows and dtypes.
check_table(orders)

Missing values:
Unnamed: 0      0
order_id        0
created_date    0
total_paid      5
state           0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226429 entries, 0 to 226428
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    226429 non-null  int64  
 1   order_id      226429 non-null  int64  
 2   created_date  226429 non-null  object 
 3   total_paid    226424 non-null  float64
 4   state         226429 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 8.6+ MB


### Changing types 

The `created_date` column was read as `object` (see the `info()` output above), so it has to be converted to a datetime type.

#### change 'created_date' to DateTime

In [7]:
# Function that converts one column of a DataFrame to datetime
@log_step
def toDateTime(df, column):
    """Return a copy of ``df`` in which ``column`` has been parsed with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    column : label
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where ``column`` holds the parsed values.
    """
    # Copy first so the caller's frame is not changed as a side effect; this
    # keeps the step safe to re-run and safe to call outside the pipeline.
    converted = df.copy()
    converted[column] = pd.to_datetime(df[column])
    return converted

In [8]:
# Column passed to toDateTime in the pipeline below.
col_to_datetime = 'created_date'

### Dealing with NaN values

In [9]:
# Show the orders whose total_paid is missing
orders.loc[orders['total_paid'].isna()]

Unnamed: 0.1,Unnamed: 0,order_id,created_date,total_paid,state
127410,127701,427314,2017-11-20 18:54:39,,Pending
131720,132013,431655,2017-11-22 12:15:24,,Pending
147014,147316,447411,2017-11-27 10:32:37,,Pending
148529,148833,448966,2017-11-27 18:54:15,,Pending
149130,149434,449596,2017-11-27 21:52:08,,Pending


All 5 rows with a missing `total_paid` belong to orders in the `Pending` state, so I have decided to drop them.

In [10]:
# Function that returns the rows with non-NaN values in the given column(s)
@log_step
def select_notNan_rows(df, columns):
    """Return the rows of ``df`` that have no missing value in any of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of labels
        Column name(s) that must be non-missing. A single name may be passed
        as a plain string.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index labels. If ``columns`` is
        empty, ``df`` itself is returned unchanged.
    """
    # A bare string would otherwise be iterated character by character and
    # fail with a misleading KeyError on its first letter.
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        return df
    return df.dropna(subset=columns)

In [11]:
# Columns passed to select_notNan_rows in the pipeline below.
columns_where_NaNs_are = ['total_paid']

## Dealing with 'ghosts'

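Check how many rows in `orders` have an `order_id` that does not appear in `orderlines` (22213). Note that the per-state counts shown below, computed on the raw `orders` table, sum to 22218.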

In [12]:
# (p.
#     assign(check_orders = p.order_id.isin(orders.order_id)).
#     query("check_orders==False")
# )

In [13]:
@log_step
def delete_orders_not_in_orderlines(df, orderlines_df=None):
    """Keep only the rows of ``df`` whose ``order_id`` also occurs in the orderlines table.

    Parameters
    ----------
    df : pandas.DataFrame
        Orders frame with an ``order_id`` column; it is not modified.
    orderlines_df : pandas.DataFrame, optional
        Frame with an ``order_id`` column to match against. Defaults to the
        notebook-level ``orderlines`` table.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have at least one matching ``order_id``.
    """
    if orderlines_df is None:
        orderlines_df = orderlines
    # The membership test must run against orderlines. Testing against the
    # orders table itself (as before) is always True for any frame derived
    # from orders, so nothing was ever removed.
    has_orderlines = df['order_id'].isin(orderlines_df['order_id'])
    return df[has_orderlines]

In [14]:
# Per-state count of the orders whose order_id never appears in orderlines
orders.loc[~orders['order_id'].isin(orderlines['order_id']), 'state'].value_counts()

Place Order        12304
Shopping Basket     9810
Completed             45
Cancelled             41
Pending               18
Name: state, dtype: int64

There are many of those, almost all in the `Place Order` and `Shopping Basket` states. **TODO:** expand this analysis.

## Pipeline

In [16]:
# Run the cleaning steps on `orders`; each step is wrapped by log_step and
# prints its name, output shape and duration.
# NOTE(review): delete_orders_not_in_orderlines is defined above but is not
# part of this chain -- confirm whether that is intentional.
p = (orders
     .pipe(start_pipeline)                                # work on a copy of orders
     .pipe(toDateTime,col_to_datetime)                    # parse 'created_date'
     .pipe(select_notNan_rows,columns_where_NaNs_are)     # drop rows with NaN in 'total_paid'
)

just ran step start_pipeline shape=(226429, 5) took 0:00:00.004987s
just ran step toDateTime shape=(226429, 5) took 0:00:00.117716s
just ran step select_notNan_rows shape=(226424, 5) took 0:00:00.009015s


# Creating new clean csv

In [21]:
# index=False: do not write the DataFrame index as an extra unnamed column.
# The input file already carries such an 'Unnamed: 0' column; writing the
# index again would add another one on the next read.
p.to_csv('data/eniac/orders_clean.csv', index=False)

In [17]:
# Re-run the health report on the pipeline result to confirm no NaNs remain.
check_table(p)

Missing values:
Unnamed: 0      0
order_id        0
created_date    0
total_paid      0
state           0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 226424 entries, 0 to 226428
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    226424 non-null  int64         
 1   order_id      226424 non-null  int64         
 2   created_date  226424 non-null  datetime64[ns]
 3   total_paid    226424 non-null  float64       
 4   state         226424 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 10.4+ MB


In [20]:
# Preview the first rows of the cleaned orders table.
p.head()

Unnamed: 0.1,Unnamed: 0,order_id,created_date,total_paid,state
0,0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,1,241423,2017-11-06 13:10:02,136.15,Completed
2,2,242832,2017-12-31 17:40:03,15.76,Completed
3,3,243330,2017-02-16 10:59:38,84.98,Completed
4,4,243784,2017-11-24 13:35:19,157.86,Cancelled
