In [109]:
from datetime import datetime

## McStatus

In [125]:
msRDD = sc.textFile('McStatus.csv')

In [126]:
header = msRDD.first()

header

'McID,StatusID,TimeStamp'

In [127]:
msRDD = msRDD.filter(lambda x: x!=header)
msRDD.first()

'Machine_4,3,2018-03-11T16:13:51.180Z'

In [113]:
def format_date(d):
    l = d.split('T')
    date = l[0]
    time = l[1]
    date_list = date.split('-')
    time_list = time.split('Z')[0].split(':')
    seconds_milliseconds = time_list[2].split('.')
    del time_list[-1]
    
    integer_map = map(int, date_list + time_list + seconds_milliseconds)
    date_list = list(integer_map)
    
    return datetime(*date_list)

### dictionary

In [128]:
msRDD = msRDD.map(lambda x: x.split(','))\
    .map(lambda x: {
        'McID' : x[0],
        'TimeStamp' : x[2],
        'McStatus' : x[1]
    })
msRDD.first()

{'McID': 'Machine_4', 'TimeStamp': '2018-03-11T16:13:51.180Z', 'McStatus': '3'}

In [129]:
pmsRDD = msRDD.map(lambda x: (x['McID'], (x['TimeStamp'], x['McStatus'])))\
    .mapValues(lambda x: (format_date(x[0]), x[1]))
pmsRDD.first()

('Machine_4', (datetime.datetime(2018, 3, 11, 16, 13, 51, 180), '3'))

### oldest and newest

In [116]:
oldest = pmsRDD.reduce(lambda x,y: x if x[1][0] < y[1][0] else y)
newest = pmsRDD.reduce(lambda x,y: x if x[1][0] > y[1][0] else y)
print("Oldest: ", oldest[0], oldest[1][0].strftime("%d/%m/%Y, %H:%M:%S"))
print("Newest: ", newest[0], newest[1][0].strftime("%d/%m/%Y, %H:%M:%S"))

Oldest:  Machine_2 01/01/2018, 00:16:26
Newest:  Machine_1 10/04/2018, 06:54:13


### keep only data from 15 of Jannuary 2018

In [117]:
pmsRDD.filter(lambda x: x[1][0] > datetime(2018, 1, 15)).count()

86931

### how many events per machine

In [118]:
l = pmsRDD.map(lambda x: (x[0], 1))\
    .reduceByKey(lambda x, y: x+y)\
    .collect()
for elem in l:
    print(elem)

('Machine_2', 30836)
('Machine_5', 4803)
('Machine_4', 2137)
('Machine_1', 26761)
('Machine_3', 5506)
('Machine_0', 27595)


### duration of each event as the difference between the actual event and the one right after

In [119]:
shortRDD = sc.parallelize(pmsRDD.take(2000))
shortRDD.join(shortRDD)\
    .map(lambda x: ((x[0], x[1][0][0]), x[1][1][0]))\
    .filter(lambda x: x[1]> x[0][1])\
    .map(lambda x: ((x[0][0], x[0][1]), x[1]- x[0][1]))\
    .reduceByKey(lambda x, y: x if x < y else y)\
    .take(3)

[(('Machine_5', datetime.datetime(2018, 3, 14, 18, 1, 34, 287)),
  datetime.timedelta(seconds=1765, microseconds=999896)),
 (('Machine_5', datetime.datetime(2018, 3, 14, 22, 52, 52, 703)),
  datetime.timedelta(seconds=901, microseconds=999594)),
 (('Machine_5', datetime.datetime(2018, 3, 15, 0, 12, 49, 357)),
  datetime.timedelta(seconds=304, microseconds=363))]

## STATUS NAME

In [120]:
snRDD = sc.textFile('statusName.csv')

In [121]:
header = snRDD.first()
header

'StatusID,StatusName,StatusType,Timestamp'

### dictionary

In [122]:
snRDD = snRDD.filter(lambda x: x!=header)
snRDD = snRDD.map(lambda x: x.split(','))\
    .map(lambda x: {
    'StatusID': x[0],
    'StatusName': x[1],
    'StatusType': x[2],
    'Timestamp': x[3]
})
snRDD.take(3)

[{'StatusID': '14',
  'StatusName': '14-MancanzaMaterDaFornit',
  'StatusType': '1',
  'Timestamp': '2019-01-14T22:58:31.579Z'},
 {'StatusID': '21',
  'StatusName': '21-Guasto Etic/LaserMark',
  'StatusType': '1',
  'Timestamp': '2019-01-14T22:58:32.032Z'},
 {'StatusID': '23',
  'StatusName': '23-GuastoAOIpostserigr3D',
  'StatusType': '1',
  'Timestamp': '2019-01-14T22:58:32.032Z'}]

## join

In [130]:
newMsRDD = pmsRDD.map(lambda x: (x[1][1], (x[0], x[1][0])))
pmsRDD.first()

('Machine_4', (datetime.datetime(2018, 3, 11, 16, 13, 51, 180), '3'))

In [None]:
newSnRDD = snRDD.map(lambda x: (x['StatusID'], (x['StatusName'], x['StatusType'], x['Timestamp'])))\
                .mapValues(lambda x: (x[0], x[1], format_date(x[2])))

#joinedRDD = newMsRDD.join(newSnRDD)

newMsRDD.first()