# Eniac data cleaning

In [1]:
import pandas as pd
from functools import wraps
import datetime as dt

In [2]:
# Load the three raw CSV tables from data/eniac/ into DataFrames
orderlines = pd.read_csv('data/eniac/orderlines.csv')
products = pd.read_csv('data/eniac/products.csv')
orders = pd.read_csv('data/eniac/orders.csv')

In [3]:
# Function that makes logs in the pipeline
def log_step(func):
    """Decorator that prints a one-line log after each call of ``func``.

    The log line contains the wrapped function's name, the ``shape`` of the
    value it returned and the wall-clock time the call took (the string form
    of a ``datetime.timedelta``).

    Parameters
    ----------
    func : callable
        The pipeline step to wrap.

    Returns
    -------
    callable
        A wrapper with the same signature that returns ``func``'s result
        unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # A step may return something without a ``shape`` attribute (e.g. None);
        # fall back to a placeholder instead of raising AttributeError after
        # the step itself has already run successfully.
        shape = getattr(result, "shape", "n/a")
        print(f"just ran step {func.__name__} shape={shape} took {time_taken}s")
        return result
    return wrapper

In [4]:
# First pipeline step: hand back a copy of the incoming DataFrame
@log_step
def start_pipeline(df):
    """Return the result of ``df.copy()``; the call is logged by ``log_step``."""
    working_copy = df.copy()
    return working_copy

## Products info

In [5]:
#Function that prints the number of missing values per column
def check_missing_values(data):
    """Print a 'Missing values:' header followed by ``data.isna().sum()``.

    Nothing is returned; the counts are only written to stdout.
    """
    na_counts = data.isna().sum()
    print('Missing values:', str(na_counts), sep='\n')

#Function that prints the number of fully duplicated rows
def check_duplicates(data):
    """Print how many rows of ``data`` are flagged by ``data.duplicated()``.

    Nothing is returned; the count is only written to stdout.
    """
    duplicate_count = data.duplicated().sum()
    print('Duplicated rows: ', duplicate_count)

def check_table(df):
    """Print a quick overview of ``df``.

    Runs ``check_missing_values`` and ``check_duplicates`` in that order and
    finishes with ``df.info()``.
    """
    for report in (check_missing_values, check_duplicates):
        report(df)
    df.info()

In [6]:
# Inspect the raw products table: missing values, duplicated rows and dtypes
check_table(products)

Missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64
Duplicated rows:  8746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


### Dropping columns

I have decided that I don't need the columns 'in_stock' and 'type' for my future analysis. The list of columns to drop is passed to the function as an argument.

In [7]:
# Pipeline step: remove the listed columns from the DataFrame
@log_step
def drop_columns(df, columns):
    """Return ``df`` without the columns named in ``columns``.

    ``columns`` is passed straight to ``DataFrame.drop``.
    """
    trimmed = df.drop(columns=columns)
    return trimmed

In [8]:
# Columns handed to drop_columns in the pipeline
columns_to_drop = ['in_stock','type']

### Dealing with NaN values

I have decided to exclude all rows that have a NaN value in the 'price' or 'desc' column, since there are very few of them. I have added a function called select_notNan_rows to the pipeline for this.

In [9]:
#Function that returns rows with notNaN values in 'columns'
@log_step
def select_notNan_rows(df,columns):
    """Return the rows of ``df`` that have no missing value in any of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    columns : iterable of column labels
        Columns that must be non-NaN for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame containing only the rows that pass the check.

    Raises
    ------
    KeyError
        If one of ``columns`` is not a column of ``df``.
    """
    # One dropna call replaces filtering the frame once per column.
    return df.dropna(subset=list(columns))

In [10]:
# Columns handed to select_notNan_rows in the pipeline
columns_where_NaNs_are = ['price','desc']

### Losing duplicates

I have decided to drop rows with a duplicated 'sku' value, keeping only the first occurrence of each 'sku'.

In [11]:
#Function that drops duplicates
@log_step
def drop_duplicates_custom(df,columns=None):
    """Return ``df`` with duplicated rows removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to de-duplicate; it is not modified.
    columns : column label or sequence of labels, optional
        Columns used to decide whether two rows are duplicates. When omitted,
        ``None`` or empty, whole rows are compared.

    Returns
    -------
    pandas.DataFrame
        The result of ``DataFrame.drop_duplicates`` (pandas' default keeps the
        first occurrence of each duplicate group).
    """
    # ``None`` replaces the former mutable ``[]`` default; an empty sequence
    # still means "compare whole rows", exactly as before.
    if columns is not None and len(columns) > 0:
        return df.drop_duplicates(subset=columns)
    return df.drop_duplicates()

In [12]:
# Columns handed to drop_duplicates_custom in the pipeline
columns_arg_for_duplicates_drop = ['sku']

### Changing types 

I have to change the types of 'price' and 'promo_price' from object to float.

It seems that 'price' and 'promo_price' had some problems with their decimals and decimal places, so I have created the clean_prices function to clean them. After that I was able to change the types to float.

In [13]:
#Function for cleaning price and promo_price columns
@log_step
def clean_prices(p,columns):
    """Normalise string price columns and convert them to floats with 2 decimals.

    Every value of every column in ``columns`` is rewritten by the four rules
    below (applied in this order) and then cast to ``float`` and rounded to
    two decimals:

    1. no ``'.'`` at all            -> append ``'.00'``
    2. exactly one digit after the last ``'.'`` -> append ``'0'``
    3. ``'.'`` fourth from the end and two dots in total
       -> remove the dots and divide by 1000
    4. ``'.'`` fourth from the end and one dot in total
       -> remove the dot and divide by 10000

    Parameters
    ----------
    p : pandas.DataFrame
        Table holding the price columns. It is NOT modified; the cleaning is
        done on a copy so the step can be re-run on the same input.
    columns : iterable of column labels
        Columns to clean; their values must be strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``p`` whose ``columns`` are float64 rounded to 2 decimals.

    Notes
    -----
    NOTE(review): rule 4 also fires on the *output* of rule 3 whenever that
    output still has three decimals, e.g. ``'1.639.792'`` -> ``'1639.792'``
    -> ``'163.9792'`` -> ``163.98``, whereas ``'1.639.790'`` ends up as
    ``1639.79``. This mirrors the original step order; confirm that the
    cascade is intended.
    """
    def _normalise(raw):
        # Rule 1: integer-looking price, give it two decimals.
        text = raw + '.00' if raw.count('.') == 0 else raw
        # Rule 2: a single decimal digit, pad to two. Slicing (not indexing)
        # keeps very short strings from raising IndexError.
        if text[-2:-1] == '.':
            text = text + '0'
        # Rule 3: three digits after the last of two dots.
        if text[-4:-3] == '.' and text.count('.') == 2:
            text = str(float(text.replace('.', '')) / 1000)
        # Rule 4: three digits after a single dot (see NOTE(review) above).
        if text[-4:-3] == '.' and text.count('.') == 1:
            text = str(float(text.replace('.', '')) / 10000)
        return text

    # Work on a copy: assigning into the caller's frame would make a second
    # run fail, because the columns would already hold floats.
    cleaned = p.copy()
    for col in columns:
        cleaned[col] = cleaned[col].apply(_normalise).astype(float).round(decimals=2)
    return cleaned

## Pipeline

In [14]:
# Run the products cleaning steps in order; every step prints a log line via log_step
p = (products
     .pipe(start_pipeline)  # start from a copy of the raw table
     .pipe(drop_columns,columns_to_drop)  # remove 'in_stock' and 'type'
     .pipe(select_notNan_rows,columns_where_NaNs_are)  # keep rows with a 'price' and a 'desc'
     .pipe(drop_duplicates_custom,columns_arg_for_duplicates_drop)  # one row per 'sku'
     .pipe(clean_prices,['price','promo_price'])  # price strings -> floats
)

just ran step start_pipeline shape=(19326, 7) took 0:00:00.001000s
just ran step drop_columns shape=(19326, 5) took 0:00:00.002990s
just ran step select_notNan_rows shape=(19273, 5) took 0:00:00.010967s
just ran step drop_duplicates_custom shape=(10527, 5) took 0:00:00.010493s
just ran step clean_prices shape=(10527, 5) took 0:00:00.059844s


In [15]:
# Re-run the overview on the cleaned table to confirm NaNs and duplicates are gone
check_table(p)

Missing values:
sku            0
name           0
desc           0
price          0
promo_price    0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10527 entries, 0 to 19325
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sku          10527 non-null  object 
 1   name         10527 non-null  object 
 2   desc         10527 non-null  object 
 3   price        10527 non-null  float64
 4   promo_price  10527 non-null  float64
dtypes: float64(2), object(3)
memory usage: 493.5+ KB


In [16]:
# Fixed seed so the displayed sample is the same on every Restart & Run All
p.sample(20, random_state=42)

Unnamed: 0,sku,name,desc,price,promo_price
19109,SEA0124,Seagate Backup Plus Hub USB 3.0 Hard Drive 4TB,Seagate desktop hard drive with 4TB compatible...,149.99,125.0
13324,APP1756,Apple iPad Mini 4 Wi-Fi + Cellular 32GB Gold,Apple iPad Mini 4 Wi-Fi + Cellular 32GB.,549.0,552.81
14937,PAC1888,Crucial MX300 275GB SSD expansion kit for Mac ...,SSD upgrade kit for Mac mini 275GB Mid 2011 to...,153.38,128.58
937,LAC0106,LaCie Little Big Disk Thunderbolt SSD 1TB hard...,Portable External Hard Drive 1TB SSD Thunderbo...,1249.0,76.99
19185,RIN0017,Chime Bell Ring,Chime bell with free Wi-Fi to amplify your not...,35.0,35.0
17934,PAC2446,Synology DS918 + NAS Server | 16GB RAM,NAS server of the Plus Series for companies se...,772.71,760.99
18029,SAN0191,SanDisk iXpand Base 32GB flash memory Charger,32GB flash memory with charger function,62.99,59.99
540,SEV0032,Budget Request repair Apple Mac Pro,Diagnosis to repair Mac Pro.,19.99,19.99
13342,APP1659,Apple iPhone 6s Plus 32GB Silver,New iPhone 6S Plus 32GB Free.,639.0,626.0
1407,LEP0016,Lepow USB 6000mAh External Battery Moonstone Red,external battery USB dual mode charge small an...,59.99,30.0
