In [1]:
import json

In [2]:
import pandas as pd

In [3]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    """Return the column names of one dataset, ordered by a schema field.

    Args:
        schemas: Container indexed by dataset name; ``schemas[ds_name]``
            must yield an iterable of per-column entries.
        ds_name: Name of the dataset whose columns are wanted.
        sorting_key: Field of each column entry used to order the
            columns. Defaults to ``'column_position'``.

    Returns:
        A list with the ``'column_name'`` value of every entry, in
        ascending order of ``entry[sorting_key]``.

    Raises:
        KeyError: If ``ds_name`` is missing from a dict-like ``schemas``,
            or a dict entry lacks ``sorting_key`` or ``'column_name'``.
    """
    # Work on a copy so the caller's schema entries keep their order.
    ordered_entries = list(schemas[ds_name])
    ordered_entries.sort(key=lambda entry: entry[sorting_key])

    names = []
    for entry in ordered_entries:
        names.append(entry['column_name'])
    return names

In [4]:
# Load the schema definitions for every dataset. The context manager
# closes the file handle as soon as parsing finishes, instead of leaving
# it open until the garbage collector reclaims it.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [5]:
# Column names for the 'orders' dataset, in the default
# 'column_position' order defined by get_column_names.
orders_columns = get_column_names(schemas, ds_name='orders')

In [6]:
# Column labels are supplied via `names`, taken from the schema lookup.
orders = pd.read_csv('data/retail_db/orders/part-00000', names=orders_columns)

In [7]:
# Display the loaded frame to check the column labels and a sample of rows.
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [8]:
# Print the signature and parameter docs of DataFrame.sort_values
# before using it in the cells that follow.
help(orders.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by: 'IndexLabel', axis: 'Axis' = 0, ascending: 'bool | list[bool] | tuple[bool, ...]' = True, inplace: 'bool' = False, kind: 'str' = 'quicksort', na_position: 'str' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc' = None) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         order

In [9]:
# Sort by customer id; `ascending` defaults to True, so smallest ids come first.
orders.sort_values(by='order_customer_id')

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
...,...,...,...,...
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
13543,13544,2013-10-16 00:00:00.0,12434,PENDING
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
41642,41643,2014-04-08 00:00:00.0,12435,PENDING


In [10]:
# Same single-column sort, reversed so the largest customer ids come first.
orders.sort_values(by='order_customer_id', ascending=False)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
1867,1868,2013-08-03 00:00:00.0,12434,CLOSED
...,...,...,...,...
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD


In [11]:
# Composite sort: customer id first, then order date within each customer,
# both ascending.
orders.sort_values(by=['order_customer_id', 'order_date'])

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
...,...,...,...,...
61776,61777,2013-12-26 00:00:00.0,12434,COMPLETE
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
51799,51800,2014-06-14 00:00:00.0,12434,ON_HOLD
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED


In [12]:
# A single bool for `ascending` applies to every sort column:
# both customer id and order date are sorted descending.
orders.sort_values(by=['order_customer_id', 'order_date'], ascending=False)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED
51799,51800,2014-06-14 00:00:00.0,12434,ON_HOLD
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
61776,61777,2013-12-26 00:00:00.0,12434,COMPLETE
...,...,...,...,...
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD


In [13]:
# A list for `ascending` sets the direction per column:
# customer id ascending, order date descending within each customer.
orders.sort_values(
    by=['order_customer_id', 'order_date'],
    ascending=[True, False],
)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
...,...,...,...,...
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
1867,1868,2013-08-03 00:00:00.0,12434,CLOSED
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
