## <center> Parsing and restructuring unstructured process data </center>

### Step 1. Loading the needed packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Step 2. Open the unstructured data file and read it into a Python list

In [2]:
# Read the whole log into a list holding one string per line; the trailing
# newline of each line is still attached at this stage.
with open('unstructured_example_log.txt') as f:
    txt = list(f)

In [3]:
txt

['[5/15/2013 2:17:26 PM] Session Start\n',
 '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.\n',
 '[5/15/2013 2:17:30 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.\n',
 '[5/15/2013 2:17:31 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward.\n',
 '[5/15/2013 2:17:50 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward.\n',
 '[5/15/2013 2:17:55 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward.\n',
 '[5/15/2013 2:18:34 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward.\n',
 '[5/15/2013 2:20:09 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward.\n',
 '[5/15/2013 2:20:13 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:20:13 PM] Leaving sequence: s

In [4]:
# Peek at the first character of the first line: it is the '[' that opens the
# timestamp, which the timestamp-extraction step strips off later.
txt[0][0]

'['

### Step 3. Clean each line to strip off the \n

In [5]:
# Drop leading/trailing whitespace (including the '\n' left by the file read)
# from every line.
txt = list(map(str.strip, txt))

In [6]:
txt

['[5/15/2013 2:17:26 PM] Session Start',
 '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.',
 '[5/15/2013 2:17:30 PM] Player submitted name: Carl',
 '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.',
 '[5/15/2013 2:17:31 PM] Player submitted name: Carl',
 '[5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward.',
 '[5/15/2013 2:17:50 PM] Player submitted name: Carl',
 '[5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward.',
 '[5/15/2013 2:17:55 PM] Player submitted name: Carl',
 '[5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward.',
 '[5/15/2013 2:18:34 PM] Player submitted name: Carl',
 '[5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward.',
 '[5/15/2013 2:20:09 PM] Player submitted name: Carl',
 '[5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward.',
 '[5/15/2013 2:20:13 PM] Player submitted name: Carl',
 '[5/15/2013 2:20:13 PM] Leaving sequence: slide4, moving forward.']

### Step 4. Separate the timestamp and convert it to a standard Python datetime object

In [7]:
# Pull the raw timestamp text out of two sample lines (index 0 and index 10):
# keep everything before the first ']' and trim the '[' bracket.
s0, s10 = (txt[idx].split(']')[0].strip('[') for idx in (0, 10))

In [8]:
s0

'5/15/2013 2:17:26 PM'

In [9]:
s10

'5/15/2013 2:18:34 PM'

In [10]:
# strptime pattern for stamps such as '5/15/2013 2:17:26 PM'.
# %I is the 12-hour clock (paired with %p for AM/PM); %H would be the 24-hour clock.
dtfmt = '%m/%d/%Y %I:%M:%S %p'
t0, t10 = (datetime.strptime(stamp, dtfmt) for stamp in (s0, s10))

In [11]:
t0

datetime.datetime(2013, 5, 15, 14, 17, 26)

In [12]:
t10

datetime.datetime(2013, 5, 15, 14, 18, 34)

In [13]:
# Elapsed time between the two events, in whole seconds. total_seconds()
# covers the entire interval, whereas the .seconds attribute holds only the
# seconds field (0-86399) and silently drops any whole days.
int((t10 - t0).total_seconds())

68

### Step 5. Restructure the information into a data frame

In [14]:
# Define a function that combines all of the steps above and returns the result as a pandas DataFrame

def generate_data_frame(txt):
    """Parse raw log lines into a three-column pandas DataFrame.

    Each line is expected to look like
    ``[5/15/2013 2:17:26 PM] Event name: optional attribute``.

    Parameters
    ----------
    txt : iterable of str
        Log lines. Surrounding whitespace is tolerated and blank lines are
        skipped.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with the columns ``session_time`` (the parsed
        timestamp), ``event_name`` (the text before the first ':') and
        ``event_attribute`` (the text after the first ':', or NaN when the
        line has no ':'). An empty input yields an empty frame that still
        carries the three columns.

    Raises
    ------
    ValueError
        If a non-blank line does not start with a timestamp matching
        ``'%m/%d/%Y %I:%M:%S %p'``.
    """
    dtfmt = '%m/%d/%Y %I:%M:%S %p'
    columns = ['session_time', 'event_name', 'event_attribute']
    session_time = []
    event_name = []
    event_attribute = []
    for line in txt:
        if not line.strip():
            # Blank lines carry no event; skipping them avoids a strptime crash.
            continue
        # Everything before the first ']' is the bracketed timestamp.
        stamp, _, message = line.partition(']')
        session_time.append(datetime.strptime(stamp.strip().strip('['), dtfmt))
        # Split on the FIRST ':' only, so attributes that themselves contain
        # ':' (e.g. a time such as '2:30') are kept intact instead of lost.
        name, sep, attribute = message.strip().partition(':')
        event_name.append(name)
        event_attribute.append(attribute.lstrip() if sep else np.nan)
    # Building from a dict keeps per-column dtypes (datetime64 for the
    # timestamps) and also works when no lines were parsed.
    return pd.DataFrame(
        {
            'session_time': session_time,
            'event_name': event_name,
            'event_attribute': event_attribute,
        },
        columns=columns,
    )

In [15]:
# Parse the cleaned log lines and display the resulting DataFrame
# (columns: session_time, event_name, event_attribute).
generate_data_frame(txt)

Unnamed: 0,session_time,event_name,event_attribute
0,2013-05-15 14:17:26,Session Start,
1,2013-05-15 14:17:26,Leaving sequence,"loadXML, moving forward."
2,2013-05-15 14:17:30,Player submitted name,Carl
3,2013-05-15 14:17:30,Leaving sequence,"InputNameScreen, moving forward."
4,2013-05-15 14:17:31,Player submitted name,Carl
5,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward."
6,2013-05-15 14:17:50,Player submitted name,Carl
7,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward."
8,2013-05-15 14:17:55,Player submitted name,Carl
9,2013-05-15 14:17:55,Leaving sequence,"slide2b, moving forward."
