# Custom exploration of Eniac's database

In [7]:
import pandas as pd
from Functions_module import check_table

In [4]:
orderlines_old = pd.read_csv('data/eniac/orderlines.csv')
products_old = pd.read_csv('data/eniac/products.csv')
orders_old = pd.read_csv('data/eniac/orders.csv')

In [5]:
orderlines = pd.read_csv('data/eniac/orderlines_clean.csv')
products = pd.read_csv('data/eniac/products_clean.csv')
orders = pd.read_csv('data/eniac/orders_clean.csv')

# Orderlines info

In [8]:
check_table(orderlines)

Missing values:
Unnamed: 0          0
id                  0
order_id            0
product_quantity    0
sku                 0
unit_price          0
date                0
total_sum           0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292854 entries, 0 to 292853
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        292854 non-null  int64  
 1   id                292854 non-null  int64  
 2   order_id          292854 non-null  int64  
 3   product_quantity  292854 non-null  int64  
 4   sku               292854 non-null  object 
 5   unit_price        292854 non-null  float64
 6   date              292854 non-null  object 
 7   total_sum         292854 non-null  float64
dtypes: float64(2), int64(4), object(2)
memory usage: 17.9+ MB


In [9]:
check_table(orderlines_old)

Missing values:
id                  0
id_order            0
product_id          0
product_quantity    0
sku                 0
unit_price          0
date                0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


# Products info

In [10]:
check_table(products)

Missing values:
Unnamed: 0     0
sku            0
name           0
desc           0
price          0
promo_price    0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10527 entries, 0 to 10526
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   10527 non-null  int64  
 1   sku          10527 non-null  object 
 2   name         10527 non-null  object 
 3   desc         10527 non-null  object 
 4   price        10527 non-null  float64
 5   promo_price  10527 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 493.6+ KB


In [11]:
check_table(products_old)

Missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64
Duplicated rows:  8746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


## Orders info

In [12]:
check_table(orders)

Missing values:
Unnamed: 0      0
Unnamed: 0.1    0
order_id        0
created_date    0
total_paid      0
state           0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226424 entries, 0 to 226423
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    226424 non-null  int64  
 1   Unnamed: 0.1  226424 non-null  int64  
 2   order_id      226424 non-null  int64  
 3   created_date  226424 non-null  object 
 4   total_paid    226424 non-null  float64
 5   state         226424 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 10.4+ MB


In [13]:
check_table(orders_old)

Missing values:
order_id        0
created_date    0
total_paid      5
state           0
dtype: int64
Duplicated rows:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


## How many orders are there?

In [8]:
orderlines.id_order.nunique()

204855

## How many products are there?

In [11]:
products.sku.nunique()

10579

## What period of time do these orders comprise?

In [18]:
orders.created_date.agg(['min', 'max'])

min    2017-01-01 00:07:19
max    2018-03-14 13:58:36
Name: created_date, dtype: object

## How many orders are Completed?

In [19]:
orders.state.unique()

array(['Cancelled', 'Completed', 'Pending', 'Shopping Basket',
       'Place Order'], dtype=object)

In [23]:
orders.loc[orders.state == 'Completed'].state.value_counts()

Completed    46605
Name: state, dtype: int64

## How should revenue be computed?

First, I have to convert 'unit_price' to Float

In [43]:
orderlines['unit_price'] = orderlines['unit_price'].str.rsplit('.',1).apply(lambda x : x[0]+ ',' + x[1])
orderlines['unit_price'] = orderlines['unit_price'].str.replace('.','',regex=True)
orderlines['unit_price'] = orderlines['unit_price'].str.replace(',','.',regex=True)
orderlines['unit_price'] = orderlines['unit_price'].astype(float)

Revenue for entire database

In [44]:
orderlines.assign(total_price = orderlines.unit_price * orderlines.product_quantity)['total_price'].sum()

128776222.02999999

To use date column as datetime:

In [49]:
orderlines['date'] = pd.to_datetime(orderlines['date'])

Revenue for each year:

In [52]:
orderlines.assign(total_price = orderlines.unit_price * orderlines.product_quantity, year = orderlines.date.dt.strftime('%Y')).groupby('year').agg('sum')

Unnamed: 0_level_0,id,id_order,product_id,product_quantity,unit_price,total_price
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017,292571254920,86093077640,0,248816,83522983.71,90144159.46
2018,118392903252,37379522639,0,80776,36967475.1,38632062.57


In [196]:
new= (
products
    .assign(length = lambda x: x['price'].str.len())
)

In [197]:
new =(
new
    .query('length >= 5')
    .assign(check_dot = lambda x: x['price'].str[-4] == '.')
    .query('check_dot == True')
    
)

In [194]:
new.sample(20)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,length,check_dot
1057,MOP0057,Mophie Space Pack Battery Case (1700mAh) and S...,Housing with battery and 16GB external storage...,1.329.911,1.329.911,0,"5,49E+11",9.0,True
15407,REP0296,iPhone battery repair,Repair service including parts and labor for i...,483.879,483.879,0,"1,44E+11",7.0,True
1459,PAC0562,Kingston V300 SSD expansion kit for Mac mini 1...,SSD upgrade kit for Mac mini 120GB 2011 Late 2...,1.169.795,1.065.841,0,1433,9.0,True
11933,SAN0097,SanDisk Ultra Fit 128GB USB 3.0 Flash Drive,Ultra compact flash drive USB 3.0 128GB Mac an...,349.932,349.932,0,57445397,7.0,True
898,REP0156,iPhone 5 GSM antenna repair,Repair service including parts and labor for i...,69.989.909,699.899,0,"1,44E+11",10.0,True
14525,QNA0183,QNAP TVS-871U-RP NAS server i3 4GB RAM,Rail format NAS 8 bays 4 Ethernet ports (10GbE...,21.759.902,21.759.902,0,12175397,10.0,True
15992,PAC1983,Synology DS1817 + | 16GB RAM | 64TB (8x8TB) Se...,NAS with 16GB of RAM and 64TB for Mac and PC,40.377.892,2.905.746,0,12175397,10.0,True
12868,PAC2025,Synology DS216 + II | 2GB RAM,NAS with 4K transcoding and direct copy button...,3.739.696,3.861.788,0,12175397,9.0,True
15646,PIE0017,Piece Original SIM tray iPhone 3G / 3GS Black,original piece of SIM card tray for iPhone 3G ...,69.938,69.938,0,21485407,6.0,True
15439,REP0382,Home button repair iPad Mini 3,Repair service including parts and labor for i...,6.999.003,69.99,0,"1,44E+11",9.0,True
