# Cleaning Eniac data - Products

In [2]:
import pandas as pd
from functools import wraps
import datetime as dt

In [3]:
# Load the three raw Eniac tables from CSV. Of these, only `products` is
# cleaned in the cells shown below.
orderlines = pd.read_csv('data/eniac/orderlines.csv')
products = pd.read_csv('data/eniac/products.csv')
orders = pd.read_csv('data/eniac/orders.csv')

In [4]:
# Function that makes logs in the pipeline
def log_step(func):
    """Decorate a pipeline step so that every call prints a one-line log.

    The log line contains the step's name, the ``shape`` of the value it
    returned and how long the call took.

    Parameters
    ----------
    func : callable
        The pipeline step to wrap.

    Returns
    -------
    callable
        A wrapper that keeps ``func``'s name and docstring (via
        ``functools.wraps``) and returns ``func``'s result unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        # str() of a timedelta looks like 0:00:00.000997
        time_taken = str(dt.datetime.now() - started_at)
        # Logging must never break a step that has already run: results
        # without a .shape attribute are reported as shape=None.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape={shape} took {time_taken}s")
        return result
    return wrapper

In [5]:
# First pipeline step: hand back a copy of the DataFrame
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` so that later steps work on the copy.

    Parameters
    ----------
    df : pandas.DataFrame
        The table the pipeline starts from.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.copy()``.
    """
    snapshot = df.copy()
    return snapshot

## Products info

In [6]:
#Function that prints the number of missing values per column
def check_missing_values(data):
    """Print a 'Missing values:' header followed by the NaN count of each column.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to inspect.

    Returns
    -------
    None
        The counts are printed, not returned.
    """
    missing_per_column = data.isna().sum()
    print('Missing values:', str(missing_per_column), sep='\n')

#Function that prints the number of fully duplicated rows
def check_duplicates(data):
    """Print how many rows of ``data`` repeat an earlier row in every column.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to inspect.

    Returns
    -------
    None
        The count is printed, not returned.
    """
    duplicate_count = data.duplicated().sum()
    print('Duplicated rows: ', duplicate_count)

def check_table(df):
    """Print a quick quality report for ``df``.

    Runs, in this order: the per-column missing-value counts, the count of
    duplicated rows, and ``df.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to inspect.
    """
    for report in (check_missing_values, check_duplicates):
        report(df)
    df.info()

In [7]:
# Inspect the raw products table: missing values, duplicated rows and dtypes.
check_table(products)

Missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64
Duplicated rows:  8746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


### Dropping columns

I have decided that I don't need the columns 'in_stock' and 'type' for my future analysis. The list of columns to drop is passed to the function as an argument.

In [8]:
# Pipeline step: remove the given columns
@log_step
def drop_columns(df, columns):
    """Return ``df`` without the columns named in ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to trim.
    columns : list
        Labels of the columns to remove.

    Returns
    -------
    pandas.DataFrame
        The result of ``DataFrame.drop`` for those column labels.
    """
    trimmed = df.drop(columns=columns)
    return trimmed

In [9]:
# Columns removed by the drop_columns step of the pipeline.
columns_to_drop = ['in_stock','type']

### Dealing with NaN values

I have decided to exclude all rows that have a NaN value in the 'price' or 'desc' column, since there are very few of them. The function select_notNan_rows does this as a step of the pipeline.

In [10]:
#Function that returns rows with notNaN values in 'columns'
@log_step
def select_notNan_rows(df,columns):
    """Return the rows of ``df`` that have no NaN in any of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to filter.
    columns : iterable of column labels
        Columns that must be non-NaN for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.

    Raises
    ------
    KeyError
        If one of ``columns`` is not a column of ``df``.
    """
    # A single dropna over the listed columns replaces filtering the frame
    # once per column; list() keeps "iterate over the labels" semantics.
    return df.dropna(subset=list(columns))

In [11]:
# Columns in which a NaN causes the row to be removed by select_notNan_rows.
columns_where_NaNs_are = ['price','desc']

### Losing duplicates

I have decided to drop the rows that repeat an already seen 'sku' value, keeping only the first row for each 'sku'.

In [12]:
#Function that drops duplicates
@log_step
def drop_duplicates_custom(df,columns=None):
    """Return ``df`` without duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to de-duplicate.
    columns : list of column labels, optional
        Columns used to decide whether two rows are duplicates. When omitted,
        ``None`` or empty, whole rows are compared.

    Returns
    -------
    pandas.DataFrame
        The result of ``DataFrame.drop_duplicates`` with its default
        settings (the first row of each group of duplicates is kept).
    """
    # None replaces the former mutable default ([]); an empty collection is
    # still treated as "compare whole rows", exactly as before.
    subset = columns if columns is not None and len(columns) > 0 else None
    return df.drop_duplicates(subset=subset)

In [13]:
# Column(s) passed to drop_duplicates_custom to decide which rows are duplicates.
columns_arg_for_duplicates_drop = ['sku']

### Changing types 

I have to change the types of 'price' and 'promo_price' from object to float.

It seems that 'price' and 'promo_price' had some problems with their decimal separators and number of decimal places, so I have created a clean_prices function that cleans those. After that I was able to change the types to float.

In [14]:
#Function for cleaning price and promo_price columns
def _normalize_price_text(text):
    """Rewrite one raw price string so that ``float()`` can parse it.

    The rules run in order, each one on the output of the previous one:

    1. no ``'.'`` at all                               -> append ``'.00'``
    2. ``'.'`` is the second-to-last character         -> append ``'0'``
    3. ``'.'`` fourth from the end, two dots in total  -> strip the dots, divide by 1000
    4. ``'.'`` fourth from the end, one dot in total   -> strip the dots, divide by 10000

    Rule 3 normally leaves three decimals, so rule 4 then fires as well:
    ``'4.518.267'`` -> ``'4518.267'`` -> ``'451.8267'``.
    """
    if text.count('.') == 0:
        text = text + '.00'
    # The length guards keep very short strings from raising IndexError.
    if len(text) >= 2 and text[-2] == '.':
        text = text + '0'
    if len(text) >= 4 and text[-4] == '.' and text.count('.') == 2:
        text = str(float(text.replace('.', '')) / 1000)
    # NOTE(review): when rule 3's result ends in a zero digit (e.g.
    # '4.518.260' -> '4518.26') the dot is no longer fourth from the end, so
    # this rule is skipped and the value stays 10x larger than for
    # '4.518.267'. Behaviour kept as-is; confirm which scale is intended.
    if len(text) >= 4 and text[-4] == '.' and text.count('.') == 1:
        text = str(float(text.replace('.', '')) / 10000)
    return text


@log_step
def clean_prices(p,columns):
    """Convert the price strings in ``columns`` to floats rounded to 2 decimals.

    Parameters
    ----------
    p : pandas.DataFrame
        Table whose ``columns`` hold prices as strings (non-string values,
        including NaN, are not supported).
    columns : iterable of column labels
        The price columns to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``p`` with the given columns converted to ``float``;
        ``p`` itself is left unmodified.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after normalisation.
    """
    # Work on a copy so the step does not alter the frame it was handed.
    cleaned = p.copy()
    for col in columns:
        as_text = cleaned[col].apply(_normalize_price_text)
        cleaned[col] = as_text.astype(float).round(decimals=2)
    return cleaned

## Pipeline

In [18]:
# Run the cleaning steps in order. Every step is wrapped in log_step, so each
# one prints its name, the shape of its result and how long it took.
p = (products
     .pipe(start_pipeline)  # copy of the raw table; the steps below work on it
     .pipe(drop_columns,columns_to_drop)  # remove 'in_stock' and 'type'
     .pipe(select_notNan_rows,columns_where_NaNs_are)  # drop rows with NaN in 'price' or 'desc'
     .pipe(drop_duplicates_custom,columns_arg_for_duplicates_drop)  # de-duplicate on 'sku'
     .pipe(clean_prices,['price','promo_price'])  # price strings -> floats rounded to 2 decimals
)

just ran step start_pipeline shape=(19326, 7) took 0:00:00.000997s
just ran step drop_columns shape=(19326, 5) took 0:00:00.002032s
just ran step select_notNan_rows shape=(19273, 5) took 0:00:00.006945s
just ran step drop_duplicates_custom shape=(10527, 5) took 0:00:00.004979s
just ran step clean_prices shape=(10527, 5) took 0:00:00.043914s


# Creating new clean csv

In [19]:
# Save the cleaned table; index=False keeps the row index out of the file.
p.to_csv('data/eniac/products_clean.csv',index=False)

In [215]:
# Run the same checks on the cleaned table.
check_table(p)

Missing values:
sku            0
name           0
desc           0
price          0
promo_price    0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10527 entries, 0 to 19325
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sku          10527 non-null  object 
 1   name         10527 non-null  object 
 2   desc         10527 non-null  object 
 3   price        10527 non-null  float64
 4   promo_price  10527 non-null  float64
dtypes: float64(2), object(3)
memory usage: 493.5+ KB


In [216]:
# Show 20 randomly chosen rows of the raw table (no random_state is set, so
# the rows differ between runs).
products.sample(20)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
18679,WAC0220-A,Open - Wacom Intuos Pro Paper L South,Reconditioned large graphics tablet lets you d...,599.9,4.518.267,0,1405
11278,PHI0061,Philips HUE Bridge 2.0 for lamps and bulbs Hue,Bridge intelligent control of lights and acces...,59.95,498.992,1,11905404
5339,PAC1050,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 32GB...",IMac desktop computer 27 inch 5K Retina i5 3.3...,4069.0,34.809.897,0,"5,74E+15"
13863,APP1854,"Apple MacBook Pro 13 ""with Touch Bar 33GHz Cor...",New MacBook Pro 13 inch Touch Bar 33 GHz Core ...,3279.0,29.655.951,0,2158
2034,MOS0058,"PalmGuard Moshi protector Macbook Air 13 """,Handrests protector Macbook 13 inches.,19.99,199.892,0,13835403
18443,APP2674,"Apple iMac Pro 27 ""10-core Intel Xeon 3GHz W |...",Pro iMac 27 inch screen Retina 5K and Intel Xe...,7179.0,67.480.042,0,118692158
10922,PAC1630,QNAP HS-251 + | Seagate 20TB Iron Wolf,QNAP Pack HS-251 + with 20TB (2x10TB) Seagate ...,1198.97,9.051.792,0,12175397
17547,XDO0047,X-Doria Defense Shield iPhone Case 8/7 Black,Durability that combines polycarbonate and ano...,32.99,259.896,1,11865403
3296,APP1384,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB | ...",IMac desktop computer 27 inch 8GB RAM 2TB 5K R...,3229.0,30.745.846,0,"5,74E+15"
17544,OWC0253,Mac OWC Memory 2GB DDR2 533MHz DIMM,RAM 1GB iMac G5 / PowerMac G5.,22.99,22.99,1,1364


In [217]:
# Show 20 randomly chosen rows of the cleaned table (no random_state is set,
# so the rows differ between runs).
p.sample(20)

Unnamed: 0,sku,name,desc,price,promo_price
18883,PAC2286,"Second hand - Apple LED Cinema Display 24 """,Monitor Refurbished Apple Cinema Display 24 inch,899.0,499.0
18927,TWS0118-A,Open - Twelve South HiRise Duet,Support refitted with Lightning connector and ...,129.99,82.1
14039,APP1876,"Apple MacBook Pro 15 ""Core i7 Touch Bar 29GHz ...",New MacBook Pro 15-inch Core i7 Touch Bar 29Gh...,4039.0,3843.58
3010,PAR0055,Jumping Parrot Drone Red Night Marshall,Vaulter remote control vehicle s night with LE...,159.9,139.9
15787,PLA0028,Plantronics BackBeat Fit Fuchsia Wireless Head...,secure wireless headset sport waterproof desig...,129.99,99.99
13180,PAC1685,Pack QNAP TS-251A NAS Server | 2GB RAM | Seaga...,NAS with 2GB of RAM and 16TB (2x8TB) Seagate I...,1069.67,751.18
16849,GLY0012,Glyph StudioRAID System 7200rpm HDD RAID 4TB T...,4TB RAID with Thunderbolt connection 2 and 2 H...,517.04,414.99
14708,QNA0186,QNAP TS-EC880U Xeon E3-1246 R2 Server Nas | 4G...,Nas 8-bay rackmount 4GB RAM with 4 USB 3.0 4 E...,3143.99,3143.99
18397,FIB0009,Fibaro Water Sensor Flood Sensor HomeKit,multi-function sensor for water leaks compatib...,69.99,69.99
11755,PAC1733,QNAP TS-128 Server l Nas 8TB (1x8TB) Seagate I...,NAS TS-128 1 8TB hard drive for Mac and PC,478.98,339.58


In [218]:
# Total of the cleaned 'price' column.
p.price.sum()

7452104.829999999