### Step 1. Loading the needed packages

In [17]:
import pandas as pd
import numpy as np
from datetime import datetime
from itertools import islice
import sys

### Step 2. Define functions

In [18]:
# define a generator function to read a file in chunks
def read_chunks(file_obj,chunk_size):
    """Lazily yield successive lists of up to ``chunk_size`` items from ``file_obj``.

    ``file_obj`` is consumed as an iterator (for an open text file, one item
    per line). The final chunk may be shorter than ``chunk_size``. Once the
    source is exhausted, 'end of file' is printed and the generator finishes.
    """
    # pull the next batch of items; an empty batch means nothing is left
    def next_batch():
        return list(islice(file_obj, chunk_size))

    # two-argument iter() keeps calling next_batch until it returns []
    for batch in iter(next_batch, []):
        yield batch
    print('end of file')

### Step 3. Reload the previously defined function

In [19]:
# this function is the same as the previous one but we reload it here
def generate_data_frame(txt):
    """Parse raw log lines into a DataFrame of timestamped events.

    Each non-blank line is expected to look like
    ``[5/15/2013 2:17:26 PM] Event name: optional attribute``.

    Parameters
    ----------
    txt : iterable of str
        Raw log lines. Blank / whitespace-only lines are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with columns ``session_time`` (the parsed
        timestamp), ``event_name`` (text before the first colon of the
        message) and ``event_attribute`` (text after the first colon, or
        NaN when the message has no colon).

    Raises
    ------
    ValueError
        If the bracketed prefix of a non-blank line does not match the
        ``%m/%d/%Y %I:%M:%S %p`` timestamp format.
    """
    dtfmt = '%m/%d/%Y %I:%M:%S %p'
    session_time = []
    event_name = []
    event_attribute = []
    for line in txt:
        # a trailing empty line in the log should not abort the whole chunk
        if not line.strip():
            continue
        # split on the FIRST ']' only, so a ']' inside the message is kept
        stamp, _, message = line.partition(']')
        session_time.append(datetime.strptime(stamp.strip().strip('['), dtfmt))
        # split on the FIRST ':' only, so attributes that themselves contain
        # colons (e.g. a time such as "2:17") are kept intact instead of lost
        name, sep, attribute = message.strip().partition(':')
        event_name.append(name)
        event_attribute.append(attribute.lstrip() if sep else np.nan)
    # build column-wise so each column gets its own dtype (no transpose to object)
    return pd.DataFrame(
        {
            'session_time': session_time,
            'event_name': event_name,
            'event_attribute': event_attribute,
        },
        columns=['session_time', 'event_name', 'event_attribute'],
    )

### Step 4. Open files and read data into a generator object

In [20]:
# open the log file for reading in text mode; the handle stays open across
# the following cells and must be closed explicitly once the generators are done
f = open('../data/unstructured_example_log.txt','r')

In [21]:
# create a generator object that will read the file in chunks of 4 lines;
# nothing is read from the file until the generator is advanced
chunk_generator = read_chunks(f,4)

In [22]:
# check the memory usage of the generator object in bytes
# (this is the size of the generator itself, not of the lines it will yield)
sys.getsizeof(chunk_generator)

128

In [23]:
# get the first chunk using next(); this consumes those lines, so anything
# that iterates chunk_generator afterwards starts from the following chunk
next(chunk_generator)

['[5/15/2013 2:17:26 PM] Session Start\n',
 '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.\n',
 '[5/15/2013 2:17:30 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.\n']

In [24]:
# create a generator expression called df that lazily converts each
# remaining chunk into a DataFrame (note: df is a generator, not a DataFrame)
df = (generate_data_frame(txt) for txt in chunk_generator)

In [26]:
# check the memory usage of the generator object in bytes
# (again only the generator itself; no DataFrame has been built yet)
sys.getsizeof(df)

128

In [27]:
# get the first DataFrame from df using next(); it is built from the next
# unread chunk of chunk_generator
next(df)

Unnamed: 0,session_time,event_name,event_attribute
0,2013-05-15 14:17:31,Player submitted name,Carl
1,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward."
2,2013-05-15 14:17:50,Player submitted name,Carl
3,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward."


In [28]:
# close the file handle that was opened for the chunk generator
f.close()