## Projecting and Filtering

In [6]:
import pandas as pd

In [7]:
# Path of the orders data file that is read into a DataFrame further down.
orders_path = "/data/retail_db/orders/part-00000"

In [8]:
# Column names applied positionally when the orders file is read
# (it is loaded with header=None, so no names come from the file itself).
orders_schema = ["order_id", "order_date", "order_customer_id", "order_status"]

In [9]:
# Load the comma-separated orders file. header=None tells pandas the file has
# no header row, so the column names are taken from orders_schema instead.
orders = pd.read_csv(
    orders_path,
    sep=",",
    header=None,
    names=orders_schema,
)

In [15]:
# Path of the order_items data file that is read into a DataFrame further down.
order_items_path = "/data/retail_db/order_items/part-00000"

In [21]:
# Column names applied positionally when the order_items file is read
# (it is loaded with header=None, so no names come from the file itself).
order_items_schema = list((
    "order_item_id",
    "order_item_order_id",
    "order_item_product_id",
    "order_item_quantity",
    "order_item_subtotal",
    "order_item_product_price",
))

In [22]:
# NOTE(review): this repeats the orders load done in an earlier cell, using the
# same path and schema. It is kept so that this cell still (re)defines `orders`.
orders = pd.read_csv(
    orders_path, sep=",", header=None, names=orders_schema
)

In [23]:
# Load the comma-separated order_items file. header=None tells pandas the file
# has no header row, so the column names are taken from order_items_schema.
order_items = pd.read_csv(
    order_items_path,
    sep=",",
    header=None,
    names=order_items_schema,
)

* Projecting data

In [24]:
# Project a single column using attribute (dot) access; the result is a Series.
orders.order_date

0        2013-07-25 00:00:00.0
1        2013-07-25 00:00:00.0
2        2013-07-25 00:00:00.0
3        2013-07-25 00:00:00.0
4        2013-07-25 00:00:00.0
                 ...          
68878    2014-07-09 00:00:00.0
68879    2014-07-13 00:00:00.0
68880    2014-07-19 00:00:00.0
68881    2014-07-22 00:00:00.0
68882    2014-07-23 00:00:00.0
Name: order_date, Length: 68883, dtype: object

In [25]:
# Project the order_date column using bracket (key) access; the result is a Series.
orders['order_date']

0        2013-07-25 00:00:00.0
1        2013-07-25 00:00:00.0
2        2013-07-25 00:00:00.0
3        2013-07-25 00:00:00.0
4        2013-07-25 00:00:00.0
                 ...          
68878    2014-07-09 00:00:00.0
68879    2014-07-13 00:00:00.0
68880    2014-07-19 00:00:00.0
68881    2014-07-22 00:00:00.0
68882    2014-07-23 00:00:00.0
Name: order_date, Length: 68883, dtype: object

In [26]:
# Project order_item_order_id and order_item_subtotal.
# Indexing with a list of column names selects several columns at once and
# yields a DataFrame rather than a Series.
order_items[['order_item_order_id', 'order_item_subtotal']]

Unnamed: 0,order_item_order_id,order_item_subtotal
0,1,299.98
1,2,199.99
2,2,250.00
3,2,129.99
4,4,49.98
...,...,...
172193,68881,129.99
172194,68882,59.99
172195,68882,50.00
172196,68883,1999.99


* Filter for order_item_order_id 2

In [27]:
# Filter rows with a boolean mask; the column is referenced via attribute access.
order_items[order_items.order_item_order_id == 2]

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0
3,4,2,403,1,129.99,129.99


In [28]:
# Filter rows with a boolean mask; the column is referenced via bracket access.
order_items[order_items['order_item_order_id'] == 2]

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0
3,4,2,403,1,129.99,129.99


In [29]:
# The comparison on its own produces a boolean Series (one True/False per row);
# indexing the DataFrame with such a Series keeps only the True rows.
order_items['order_item_order_id'] == 2

0         False
1          True
2          True
3          True
4         False
          ...  
172193    False
172194    False
172195    False
172196    False
172197    False
Name: order_item_order_id, Length: 172198, dtype: bool

In [30]:
# Filter rows by passing the condition to `query` as a string expression.
order_items.query('order_item_order_id == 2')

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0
3,4,2,403,1,129.99,129.99


In [31]:
# Rows belonging to order 2 whose subtotal lies in the closed range [150, 250].
# `between` includes both endpoints by default, matching `>= 150` and `<= 250`.
order_items[
    (order_items['order_item_order_id'] == 2)
    & order_items['order_item_subtotal'].between(150, 250)
]

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0


```{note}
The string passed to the `query` API is broken into multiple lines for readability.
```

In [32]:
# Same compound filter expressed as a query string. Adjacent string literals
# are joined by Python into a single expression before `query` sees it.
order_items.query(
    'order_item_order_id == 2 and '
    'order_item_subtotal >= 150 and '
    'order_item_subtotal <= 250'
)

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0


In [33]:
# order_date is held as text (dtype object), so an equality filter has to match
# the complete timestamp string, including the time portion.
orders[orders.order_date == '2013-08-01 00:00:00.0']

Unnamed: 0,order_id,order_date,order_customer_id,order_status
1296,1297,2013-08-01 00:00:00.0,11607,COMPLETE
1297,1298,2013-08-01 00:00:00.0,5105,CLOSED
1298,1299,2013-08-01 00:00:00.0,7802,COMPLETE
1299,1300,2013-08-01 00:00:00.0,553,PENDING_PAYMENT
1300,1301,2013-08-01 00:00:00.0,1604,PENDING_PAYMENT
...,...,...,...,...
57959,57960,2013-08-01 00:00:00.0,10177,PENDING
57960,57961,2013-08-01 00:00:00.0,835,COMPLETE
57961,57962,2013-08-01 00:00:00.0,10521,PENDING_PAYMENT
67446,67447,2013-08-01 00:00:00.0,8956,COMPLETE


```{note}
We can use the string functions available through the `str` accessor inside `query` by passing `engine='python'`.
```

In [34]:
# Prefix match on the text order_date column, so the time portion need not be
# spelled out. engine='python' is passed so the `.str` accessor can be used
# inside the query expression.
orders.query('order_date.str.startswith("2013-08-01")', engine='python')

Unnamed: 0,order_id,order_date,order_customer_id,order_status
1296,1297,2013-08-01 00:00:00.0,11607,COMPLETE
1297,1298,2013-08-01 00:00:00.0,5105,CLOSED
1298,1299,2013-08-01 00:00:00.0,7802,COMPLETE
1299,1300,2013-08-01 00:00:00.0,553,PENDING_PAYMENT
1300,1301,2013-08-01 00:00:00.0,1604,PENDING_PAYMENT
...,...,...,...,...
57959,57960,2013-08-01 00:00:00.0,10177,PENDING
57960,57961,2013-08-01 00:00:00.0,835,COMPLETE
57961,57962,2013-08-01 00:00:00.0,10521,PENDING_PAYMENT
67446,67447,2013-08-01 00:00:00.0,8956,COMPLETE
