# Eniac data cleaning

In [1]:
import pandas as pd
from functools import wraps
import datetime as dt

In [2]:
orderlines = pd.read_csv('data/eniac/orderlines.csv')
products = pd.read_csv('data/eniac/products.csv')
orders = pd.read_csv('data/eniac/orders.csv')

In [3]:
# Function that makes logs in the pipeline
def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"just ran step {func.__name__} shape={result.shape} took {time_taken}s")
        return result
    return wrapper

In [4]:
# Function that returns copy of DataFrame
@log_step
def start_pipeline(df):
    return df.copy()

## Products info

In [5]:
#Function that returns number of missing values
def check_missing_values(data): 
    print('Missing values:' + '\n' + str(data.isna().sum()))

#Function that returns number of duplicated rows
def check_duplicates(data): 
    print('Duplicated rows: ', data.duplicated().sum())

def check_table(df):
    check_missing_values(df)
    check_duplicates(df)
    df.info()

In [6]:
check_table(products)

Missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64
Duplicated rows:  8746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


### Dropping columns

I have decided that for my future analisys I don't need columns 'in_stock' and 'type'. I am passing list of colums as an argument

In [7]:
# Function that drops columns from a list of columns
@log_step
def drop_columns(df, columns):
    return df.drop(columns,axis=1)

In [8]:
columns_to_drop = ['in_stock','type']

### Dealing with NaN values

I have decided to exclude all rows that have NaN value in 'price' and 'desc' column since there are very few of them. I have forwarded function called select_nonNan_rows to pipeline.

In [9]:
@log_step
#Function that returns rows with notNaN values in 'column'
def select_notNan_rows(df,columns):
    for col in columns:
        df = df[df[col].notna()]
    return df

In [10]:
columns_where_NaNs_are = ['price','desc']

### Losing duplicates

I have decided to lose all rows that have duplicate 'sku' value.

In [11]:
@log_step
#Function that drops duplicates
def drop_duplicates_custom(df,columns=[]):
    if(len(columns) > 0):
        return df.drop_duplicates(columns)
    else:
        return df.drop_duplicates()

In [12]:
columns_arg_for_duplicates_drop = ['sku']

### Changing types 

I have to change types of 'price' and 'promo_price' from object to float

It seems like price and promo price had some problems with decimals, decimal spaces...so I have created clean_prices function that clean those. After that I was able to change types to float.

In [13]:
#Function for cleaning price and promo_price columns
@log_step
def clean_prices(p,columns):
    for col in columns:
        p[col] = p[col].apply(lambda x : x +'.00' if x.count('.') == 0 else x)
        p[col] = p[col].apply(lambda x: x  + '0' if x[-2]=='.' else x)
        p[col] = p[col].apply(lambda x: str(float(x.replace('.',''))/1000) if ( (x[-4]=='.') & (x.count('.')==2)) else x)
        p[col] = p[col].apply(lambda x: str(float(x.replace('.',''))/10000) if ( (x[-4]=='.') & (x.count('.')==1)) else x)
        p[col] = p[col].astype(float)
        p[col].round(decimals = '2')
    return p

## Pipeline

In [14]:
p = (products
     .pipe(start_pipeline)
     .pipe(drop_columns,columns_to_drop)
     .pipe(select_notNan_rows,columns_where_NaNs_are)
     .pipe(drop_duplicates_custom,columns_arg_for_duplicates_drop)
     .pipe(clean_prices,['price','promo_price'])
)

just ran step start_pipeline shape=(19326, 7) took 0:00:00.001996s
just ran step drop_columns shape=(19326, 5) took 0:00:00.001995s
just ran step select_notNan_rows shape=(19273, 5) took 0:00:00.007982s
just ran step drop_duplicates_custom shape=(10527, 5) took 0:00:00.004983s
just ran step clean_prices shape=(10527, 5) took 0:00:00.052858s


In [15]:
check_table(p)

Missing values:
sku            0
name           0
desc           0
price          0
promo_price    0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10527 entries, 0 to 19325
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sku          10527 non-null  object 
 1   name         10527 non-null  object 
 2   desc         10527 non-null  object 
 3   price        10527 non-null  float64
 4   promo_price  10527 non-null  float64
dtypes: float64(2), object(3)
memory usage: 493.5+ KB


In [16]:
p.sample(20)

Unnamed: 0,sku,name,desc,price,promo_price
772,APP0647,Apple Smart Case iPad Air Case Black,Smart leather case for your iPad Air.,89.0,79.9931
388,REP0073,Repair Touch Screen iPad (1st generation),Repair service including parts and labor for iPad,99.99,99.9896
17107,OWC0227,OWC Mercury Helios PCIe Thunderbolt Box 3,Thunderbolt expansion box 3 for half-length PC...,392.97,289.9898
17872,PAC2363,Synology DS218 + NAS Server | 2GB RAM | 12TB (...,NAS storage server integrated with special foc...,904.97,713.1788
16230,PAC2143,"Apple iMac 27 ""Core i5 3.8GHz Retina 5K | 16GB...",IMac desktop computer 27 inch Retina 5K RAM 16...,2839.0,2599.0045
13955,APP1869,"Apple MacBook Pro 13 ""with Touch Bar GHz Core ...",New MacBook Pro 13 inch Touch Bar 29 GHz Core ...,2679.0,2551.5851
13252,KAN0054,Kanex Multi-Sync Gray Aluminum Bluetooth Keyboard,Bluetooth keyboard with simultaneous connectiv...,129.95,119.9897
16447,APP2283,"Apple MacBook Pro 13 ""Core i5 Touch Bar 31GHz ...",New MacBook Pro 13-inch Core i5 Touch Bar 31 G...,2489.0,2340.0045
10859,MOX0028,Moxie Simulation of Interior iPhone Case SE / ...,Cover for iPhone SE / 5s / 5.,19.99,9.9898
18516,GTE0112,G-Technology 2TB G-DRIVE mobile USB 3.0 v3,External hard drive with aluminum housing 5400...,106.99,94.9947
