# Initial Explorations of Eniacs Data

In [1]:
import pandas as pd

### Import all 4 csv files and see Dataframe dimensions

In [2]:
brands = pd.read_csv("raw_data/brands.csv", sep=',')
brands.shape

(187, 2)

In [3]:
orders = pd.read_csv("raw_data/orders.csv", sep=',')
orders.shape

(226909, 4)

In [4]:
orderlines = pd.read_csv("raw_data/orderlines.csv", sep=',')
orderlines.shape

(293983, 7)

In [48]:
products = pd.read_csv("raw_data/products.csv", sep=',')
products.shape

(19326, 7)

### Some simple explorations and checks for each dataframe

- have a look at the info() to see number of rows and columns, datatypes and if there are missing values
- use describe() to have an overview over numeric data (also categorical when no numeric data is in the dataframe)
- use .columns method to see all column names
- use head() and tail() to have a look at the first and last entries in the dataframe
- check for duplicates using duplicated()
- use value_counts() to count categorical data
- use isna() with sum() to see the total number of missing values in each column

## Brands dataframe

In [6]:
brands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   short   187 non-null    object
 1   long    187 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB


In [7]:
brands.describe()

Unnamed: 0,short,long
count,187,187
unique,187,181
top,ELG,Startech
freq,1,2


In [8]:
brands.columns

Index(['short', 'long'], dtype='object')

In [9]:
brands.head()

Unnamed: 0,short,long
0,8MO,8Mobility
1,ACM,Acme
2,ADN,Adonit
3,AII,Aiino
4,AKI,Akitio


In [10]:
brands.tail()

Unnamed: 0,short,long
182,XOO,Xoopar
183,XRI,X-Rite
184,XTO,Xtorm
185,ZAG,ZaggKeys
186,ZEP,Zepp


In [11]:
#no duplicates in the brands dataframe
brands.duplicated().value_counts()

False    187
dtype: int64

## Orders Dataframe

In [12]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


In [13]:
orders.describe()

Unnamed: 0,order_id,total_paid
count,226909.0,226904.0
mean,413296.48248,569.225818
std,65919.250331,1761.778002
min,241319.0,0.0
25%,356263.0,34.19
50%,413040.0,112.99
75%,470553.0,525.98
max,527401.0,214747.53


In [14]:
orders.columns

Index(['order_id', 'created_date', 'total_paid', 'state'], dtype='object')

In [15]:
orders.head()

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled


In [16]:
orders.tail()

Unnamed: 0,order_id,created_date,total_paid,state
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket
226908,527401,2018-03-14 13:58:36,18.98,Place Order


In [17]:
#no duplicates in the orders dataframe
orders.duplicated().value_counts()

False    226909
dtype: int64

In [18]:
#categories of orders and their amounts
orders["state"].value_counts()

Shopping Basket    117809
Completed           46605
Place Order         40883
Pending             14379
Cancelled            7233
Name: state, dtype: int64

In [19]:
orders.isna().sum()

order_id        0
created_date    0
total_paid      5
state           0
dtype: int64

### observations in orders

- 5 null values in column total_paid
- column create_date read as object but should be datetime
- would it be helpful to convert state to categorical dtype?

## Orderlines dataframe

In [20]:
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [21]:
orderlines.describe()

Unnamed: 0,id,id_order,product_id,product_quantity
count,293983.0,293983.0,293983.0,293983.0
mean,1397918.0,419999.116544,0.0,1.121126
std,153009.6,66344.486479,0.0,3.396569
min,1119109.0,241319.0,0.0,1.0
25%,1262542.0,362258.5,0.0,1.0
50%,1406940.0,425956.0,0.0,1.0
75%,1531322.0,478657.0,0.0,1.0
max,1650203.0,527401.0,0.0,999.0


In [22]:
orderlines.columns

Index(['id', 'id_order', 'product_id', 'product_quantity', 'sku', 'unit_price',
       'date'],
      dtype='object')

In [23]:
orderlines.head()

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38


In [24]:
orderlines.tail()

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
293978,1650199,527398,0,1,JBL0122,42.99,2018-03-14 13:57:25
293979,1650200,527399,0,1,PAC0653,141.58,2018-03-14 13:57:34
293980,1650201,527400,0,2,APP0698,9.99,2018-03-14 13:57:41
293981,1650202,527388,0,1,BEZ0204,19.99,2018-03-14 13:58:01
293982,1650203,527401,0,1,APP0927,13.99,2018-03-14 13:58:36


In [25]:
#no duplicates in the order_items dataframe
orderlines.duplicated().value_counts()

False    293983
dtype: int64

In [26]:
# calculate sum of product_ids to see that they are all 0 and that the column can be droped later on
orderlines["product_id"].sum()

0

### Observations in orderlines

- no missing values
- column product_id not used anymore, always 0
- dtype for price should be float instead of object 
- dtype for unit_price should be float, dtype for date should be datetime but both are object

## Products dataframe

In [27]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


In [28]:
products.describe()

Unnamed: 0,in_stock
count,19326.0
mean,0.109593
std,0.31239
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [29]:
products.columns

Index(['sku', 'name', 'desc', 'price', 'promo_price', 'in_stock', 'type'], dtype='object')

In [30]:
products.head()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,589.996,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,569.898,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,229.997,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,1,1364


In [31]:
products.tail()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,269.903,1,12282
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,649.903,0,1392
19325,THU0063,"Enroute Thule 14L Backpack MacBook 13 ""Green",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392


In [32]:
# 8746 duplicated values in the products dataframe - to be checked
products.duplicated().value_counts()

False    10580
True      8746
dtype: int64

In [52]:
#gives back all the dublicated rows
#they are more because before only the first occurence of a duplicate was checked
products[products.duplicated(keep=False)].count()

sku            9503
name           9503
desc           9503
price          9503
promo_price    9503
in_stock       9503
type           9503
dtype: int64

In [34]:
#number of missing values
products.isna().sum()

sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

In [35]:
#columns that contain missing values are True
products.isna().any()

sku            False
name           False
desc            True
price           True
promo_price    False
in_stock       False
type            True
dtype: bool

### Observations in products

- missing values in columns desc, price and type
- column price, promo_price and type are from dtype object but should be float or int
- column promo_price seems to have wrong prices 499.899 instead of 49.98
- type usually is a 4-digit number but not always e.g. 13855401

## Some more custom explorations

How many orders are there?

How many products are there?

What period of time do these orders comprise?

How many orders are completed, or in other states?

How should revenue be computed?

In [45]:
products.promo_price.str.split(pat='.', n=1, expand=True).sample(10)

Unnamed: 0,0,1
1756,499,899.0
7917,38,729.898
14030,31,595.847
10107,27,249.902
11132,22,99.0
7862,37,989.898
520,199,892.0
15000,599,918.0
12027,209,935.0
16352,29,220.048


In [50]:
#correct unit_price from products from MASHA
orderlines['unit_price'] = [ i.replace(".","") for i in orderlines['unit_price'] ] 
orderlines['unit_price'] = pd.to_numeric(orderlines['unit_price'])
orderlines['unit_price'] = orderlines['unit_price']/100

AttributeError: 'float' object has no attribute 'replace'

In [51]:
orderlines.sample(5)

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
109158,1326432,393507,0,1,MAC0102,29.99,2017-08-28 16:40:30
238601,1561773,491331,0,1,PAC1327,801.18,2018-01-15 14:26:30
63570,1244867,354357,0,1,QNA0166,10.9,2017-05-10 00:02:05
182176,1469077,452308,0,1,APP1214,78.0,2017-11-30 01:42:03
57994,1235264,349620,0,1,APP1854,3114.99,2017-04-25 15:33:00


In [None]:
#correct unit_price from products from HANA
df['unit_price'] = df['unit_price'].str.rsplit('.',1).apply(lambda x : x[0]+ ',' + x[1]).str.replace('.','',regex=True).str.replace(',','.',regex=True)