# Eniac data cleaning

In [1]:
import pandas as pd
from functools import wraps
import datetime as dt
from Functions_module import *

In [2]:
# Load the three raw Eniac tables (order lines, products, orders) from CSV.
# The paths are read in the listed order and unpacked into matching names.
orderlines, products, orders = map(
    pd.read_csv,
    (
        'data/eniac/orderlines.csv',
        'data/eniac/products.csv',
        'data/eniac/orders.csv',
    ),
)

## Products info

In [3]:
# Summarise the raw products table before any cleaning. check_table is not
# defined in this notebook (presumably it comes from the Functions_module star
# import); per the output below it reports the missing values per column, the
# number of duplicated rows and the DataFrame info.
check_table(products)

Missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64
Duplicated rows:  8746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


### Dropping columns

I have decided that for my future analysis I don't need the 'in_stock' and 'type' columns. I pass the list of columns to drop as an argument to the `drop_columns` step of the pipeline.

In [4]:
# Columns of the products table that are not needed for the later analysis;
# handed to the drop_columns pipeline step.
columns_to_drop = [
    'in_stock',
    'type',
]

### Dealing with NaN values

I have decided to exclude all rows that have a NaN value in the 'price' or 'desc' column, since there are very few of them (46 and 7 respectively). I pass a function called `select_notNan_rows` to the pipeline to do this.

In [5]:
# Columns whose NaN rows are excluded; handed to the select_notNan_rows
# pipeline step.
columns_where_NaNs_are = [
    'price',
    'desc',
]

### Losing duplicates

I have decided to drop rows that have a duplicated 'sku' value.

In [6]:
# Key column(s) used to detect duplicated products; handed to the
# drop_duplicates_custom pipeline step.
columns_arg_for_duplicates_drop = [
    'sku',
]

### Changing types 

I have to change the types of 'price' and 'promo_price' from object to float.

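It seems that 'price' and 'promo_price' had some problems with their decimals (for example, misplaced decimal separators), so I have created the clean_promo_price and clean_price functions to clean them. After that I was able to change the types to float. In the pipeline below both columns are handled by the `clean_prices` step.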

## Pipeline

In [7]:
# Build the cleaned products table `p` by chaining the cleaning helpers
# (not defined in this notebook; presumably from the Functions_module star
# import). Per the output below, every step logs its name, the resulting
# shape and how long it took.
p = (products
     .pipe(start_pipeline)  # entry step; presumably works on a copy of the raw frame — TODO confirm
     .pipe(drop_columns,columns_to_drop)  # removes 'in_stock' and 'type' (7 -> 5 columns)
     .pipe(select_notNan_rows,columns_where_NaNs_are)  # excludes rows with NaN in 'price' or 'desc'
     .pipe(drop_duplicates_custom,columns_arg_for_duplicates_drop)  # de-duplicates on 'sku'
     .pipe(clean_prices,['price','promo_price'])  # repairs the price strings; both columns end up float64
)

just ran step start_pipeline shape=(19326, 7) took 0:00:00.002980s
just ran step drop_columns shape=(19326, 5) took 0:00:00.005986s
just ran step select_notNan_rows shape=(19273, 5) took 0:00:00.013970s
just ran step drop_duplicates_custom shape=(10527, 5) took 0:00:00.008977s
just ran step clean_prices shape=(10527, 5) took 0:00:00.074923s


In [8]:
# Run the same summary on the cleaned table to confirm that no missing values
# or duplicated rows remain and that 'price' and 'promo_price' are now float64.
check_table(p)

Missing values:
sku            0
name           0
desc           0
price          0
promo_price    0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10527 entries, 0 to 19325
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sku          10527 non-null  object 
 1   name         10527 non-null  object 
 2   desc         10527 non-null  object 
 3   price        10527 non-null  float64
 4   promo_price  10527 non-null  float64
dtypes: float64(2), object(3)
memory usage: 493.5+ KB


In [9]:
# Eyeball a random sample of the cleaned products. The fixed random_state
# makes the displayed rows reproducible across Restart Kernel -> Run All;
# without it every run shows a different set of rows.
p.sample(20, random_state=42)

Unnamed: 0,sku,name,desc,price,promo_price
17985,APP2543,Apple iPhone Gold Lightning Dock,Stand with Lightning Dock Connector and iPhone.,59.0,56.0
18456,APP2687,"Apple iMac Pro 27 ""14-core Intel Xeon W 25GHz ...",Pro iMac 27 inch screen Retina 5K and Intel Xe...,8379.0,787.6005
14584,SAT0017,Sonic Dual Conical Satechi Mac v2.0 Speakers W...,Speakers matte finish sleek design and volume ...,39.99,24.9901
2297,TRA0025,Transcend JetDrive expansion SSD 720 960GB Mac...,960GB SSD expansion for Macbook Pro Retina 13-...,815.0,728.5834
16952,CRU0015-2-A,Open - Crucial Mac Memory 16GB (2x8GB) DDR3 16...,RAM 16GB (2x8GB) 135V MacBook Pro iMac (2012/2...,149.98,148.9897
13599,WAC0178-A,(Open) Wacom Bamboo Stylus Black Alpha 2,Intelligent digital pen with rubber tip for iP...,14.9,9.2183
14938,PAC1889,Kit Crucial MX300 525GB SSD expansion for Mac ...,SSD upgrade kit for Mac mini 525GB Mid 2011 to...,213.65,183.5848
18098,PAC2463,DS218play Synology NAS Server | 6TB (2x3TB) Se...,2-bay NAS server can accommodate 4K Ultra HD f...,520.97,433.1788
16196,PAC2081,"Apple iMac 27 ""Core i7 42GHz 5K Retina | 16GB ...",IMac desktop computer 27 inch Retina 5K RAM 16...,3085.59,2825.0045
15875,ALL0010,PowerCube Original USB Regleta Allocacoc wall ...,Wall socket cube-shaped with 4 to 250V electri...,18.95,14.9895
