## Reading files into collections

Let us understand how to read data from files into collections.
* Python have simple and yet rich APIs to perform file I/O
* We can create a file object with open in different modes (by default read only mode)
* To read the contents from the file into memory, we have APIs on top of file object such as read()
* read() will create large string using contents of the files
* If the data have multiple records with new line character as delimiter, we can apply splitlines() on the output of read
* splitlines() will convert the string into list with new line character as delimiter

In [None]:
%%sh

ls -ltr /data/retail_db/orders/part-00000

In [None]:
%%sh

tail /data/retail_db/orders/part-00000

In [67]:
file_path = '/home/iamismail/다운로드/DataEngineering/data-engineering-spark/data/retail_db/orders/part-00000'
order_file = open(file_path)

In [68]:
##let's put the memorry in our file 
order_raw = order_file.read()
type(order_raw)

str

In [69]:
#splitlines() function will return list 
orders = order_raw.splitlines()
type(orders)

list

In [70]:
orders[:10]

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [71]:
len(orders)

68883

In [72]:
orders[:10]

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [None]:
len(orders) # same as number of records in the file

In [None]:
##Row level transformations 
## our data is in text file format ##each line is sperated by coma 
                                    #order_id , order_date, order_customer_id,order_status

In [25]:
##Task One : Get all order ids and associated statuses -> ('1,Closed')
##.join function : we use to concate two list in one string 
orders[:10]
one_order ='1,2013-07-25 00:00:00.0,11599,CLOSED'
one_order.split(',')[0]
one_order.split(',')[3]

','.join([one_order.split(',')[0],one_order.split(',')[3]])

'1,CLOSED'

In [36]:
order_statuses = []
for order in orders:
    order_statuses.append(','.join([order.split(',')[0],order.split(',')[3]]))
    

In [37]:
order_statuses[:5]

['1,CLOSED', '2,PENDING_PAYMENT', '3,COMPLETE', '4,CLOSED', '5,COMPLETE']

In [40]:
order_values = order.split(',')
order_values

['68883', '2014-07-23 00:00:00.0', '5533', 'COMPLETE']

In [50]:
##Task Two :Get all order ids , the dates on which order is places and order status 
##each record in the ouput should be dict with these columns order_id,order_date,order_status

def get_order_details(order):
    order_values = order.split(',')

    return({
        'order_id' :int(order_values[0]),
        'order_date':order_values[1],
        'order_status':order_values[3]
    })

In [51]:
get_order_details('68883, 2014-07-23 00:00:00.0, 5533, COMPLETE')

{'order_id': 68883,
 'order_date': ' 2014-07-23 00:00:00.0',
 'order_status': ' COMPLETE'}

In [53]:
empty_list = []
for item in orders:

    data =     get_order_details(item)
    empty_list.append(data)
empty_list[:4]


[{'order_id': 1,
  'order_date': '2013-07-25 00:00:00.0',
  'order_status': 'CLOSED'},
 {'order_id': 2,
  'order_date': '2013-07-25 00:00:00.0',
  'order_status': 'PENDING_PAYMENT'},
 {'order_id': 3,
  'order_date': '2013-07-25 00:00:00.0',
  'order_status': 'COMPLETE'},
 {'order_id': 4,
  'order_date': '2013-07-25 00:00:00.0',
  'order_status': 'CLOSED'}]

In [87]:
##Task Three : get all unique dates from order data 
## Hint : in set we only use unique data 
unique_dates =set()

for order_date in orders:
   
    unique_dates.add(order_date.split(',')[1])
list(unique_dates)[:10]

['2014-02-27 00:00:00.0',
 '2014-03-25 00:00:00.0',
 '2014-03-31 00:00:00.0',
 '2013-08-08 00:00:00.0',
 '2013-08-19 00:00:00.0',
 '2013-10-08 00:00:00.0',
 '2014-05-27 00:00:00.0',
 '2014-02-13 00:00:00.0',
 '2014-03-17 00:00:00.0',
 '2014-05-08 00:00:00.0']

In [140]:
##Get all unique weekend dates from order data 
import datetime as dt 
 
def is_get_unique_weekend(order_date):
   return dt.datetime.strptime(order_date,'%Y-%m-%d %H:%M:%S.%f').weekday() in (5,6)
    

In [142]:
weekend_dates = set()
for order in orders :
    order_date = order.split(',')[1]

    if is_get_unique_weekend(order_date):
        weekend_dates.add(order_date)
list(weekend_dates)[:10]

['2014-01-05 00:00:00.0',
 '2013-11-09 00:00:00.0',
 '2013-07-28 00:00:00.0',
 '2013-12-28 00:00:00.0',
 '2013-12-08 00:00:00.0',
 '2014-01-12 00:00:00.0',
 '2013-08-03 00:00:00.0',
 '2014-04-20 00:00:00.0',
 '2014-01-11 00:00:00.0',
 '2014-06-15 00:00:00.0']

In [149]:
##Task four : create function which takes orders list and customer_id and return all the orders place by 
 ##customerid 
one_order = '1,2013-07-25 00:00:00.0,11599,CLOSED'
order = int(one_order.split(',')[2])
order

11599

In [151]:
def get_customes_orders(orders,customer_id):
    orders_filtered = []
    for order in orders:
        if (int(order.split(',')[2])==customer_id):
            orders_filtered.append(order)
    
    return orders_filtered

In [152]:
get_customes_orders(orders,11599)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '11397,2013-10-03 00:00:00.0,11599,COMPLETE',
 '23908,2013-12-20 00:00:00.0,11599,COMPLETE',
 '53545,2014-06-27 00:00:00.0,11599,PENDING',
 '59911,2013-10-17 00:00:00.0,11599,PROCESSING']

In [186]:
##Task five: create function that can return all the orders placed by customer_id for a given month 
 
def get_custom_orders(orders, customer_id,order_month):
    orders_filtered =[]
    for order in orders:
          if int(order.split(',')[2]) == customer_id and order.split(',')[1].startswith(order_month) :

              orders_filtered.append(order)
    return orders_filtered


In [187]:
get_custom_orders(orders,12431,'2014-01')

['27585,2014-01-12 00:00:00.0,12431,PROCESSING',
 '28244,2014-01-15 00:00:00.0,12431,PENDING_PAYMENT',
 '29109,2014-01-21 00:00:00.0,12431,ON_HOLD',
 '29232,2014-01-21 00:00:00.0,12431,ON_HOLD']

In [183]:
##Task Six : write function which can get all the orders place by customer 12431 in january 2014 and status is PENDING PAYMENT OR PROCESSING 

def get_custom_orders (orders,customer_id, order_date , order_status):
    orders_filtered = []
    orders_status =['PENDING_PAYMENT','PROCESSING']
    for order in orders :


        if int(order.split(',')[2]) == customer_id and order.split(',')[1].startswith(order_date) and order.split(',')[3] == 'PENDING_PAYMENT':
            orders_filtered.append(order)
        return orders_filtered


In [194]:
for order in orders :
    if (int(order.split(',')[2]) ==12431 \
         and order.split(',')[1].startswith('2014-01') 
         and order.split(',')[3] in ('PENDING_PAYMENT','PROCESSING')):
     print(order)
 


27585,2014-01-12 00:00:00.0,12431,PROCESSING
28244,2014-01-15 00:00:00.0,12431,PENDING_PAYMENT


In [184]:
orders_status =['PENDING_PAYMENT','PROCESSING']
get_custom_orders(orders,12431,'2014-01','PENDING_PAYMENT')

[]