# NCME 2022 Data Science Upskilling Workshop - Session 3

# Data Wrangling

For questions, contact: jhao@ets.org


# Parsing unstructured data

### step 1. load packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### step 2. open the file

In [2]:
# Read the whole log into a list with one string per line (each entry still ends with '\n').
with open('data_for_session_3/unstructured_example_log.txt') as f:
    txt = f.readlines()

In [5]:
txt

['[5/15/2013 2:17:26 PM] Session Start\n',
 '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.\n',
 '[5/15/2013 2:17:30 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.\n',
 '[5/15/2013 2:17:31 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward.\n',
 '[5/15/2013 2:17:50 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward.\n',
 '[5/15/2013 2:17:55 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward.\n',
 '[5/15/2013 2:18:34 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward.\n',
 '[5/15/2013 2:20:09 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward.\n',
 '[5/15/2013 2:20:13 PM] Player submitted name: Carl\n',
 '[5/15/2013 2:20:13 PM] Leaving sequence: s

In [10]:
txt[0].strip()

'[5/15/2013 2:17:26 PM] Session Start'

### step 3. clean the lines by stripping off the \n

In [11]:
txt = [t.strip() for t in txt]

In [15]:
txt[0]

'[5/15/2013 2:17:26 PM] Session Start'

In [20]:
txt[0].split(']')[0].replace('[','')

'5/15/2013 2:17:26 PM'

### step 4. chunking and formatting

In [17]:
txt[0].split(']')[0].replace('[','')

'5/15/2013 2:17:26 PM'

In [21]:
txt[0].split(']')[1]

' Session Start'

In [22]:
txt[0].split(']')[1].upper()

' SESSION START'

### step 5. datetime conversion

In [23]:
s = txt[0].split(']')[0].strip('[')

In [24]:
s

'5/15/2013 2:17:26 PM'

In [25]:
dtfmt ='%m/%d/%Y %I:%M:%S %p'   # %H -> 24 hours, %I-> 12 hours

In [26]:
dt = datetime.strptime(s, dtfmt)

In [27]:
dt

datetime.datetime(2013, 5, 15, 14, 17, 26)

### step 6. convert into a dataframe

In [29]:
# One accumulator list per output column; they are filled in parallel, one entry per log line.
col1, col2, col3 = [], [], []

In [30]:
# Parse each log line of the form "[<timestamp>] <event name>: <event result>"
# into the three parallel column lists.
for line in txt:
    if not line:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in strptime.
        continue
    # Split on the FIRST ']' only, so a ']' appearing inside the message is preserved.
    stamp_text, _, message = line.partition(']')
    col1.append(datetime.strptime(stamp_text.strip('['), dtfmt))
    # Split on the FIRST ':' only, so an event result that itself contains ':'
    # is kept whole instead of being silently replaced by NaN.
    event_name, colon, event_result = message.strip().partition(':')
    col2.append(event_name)
    # No ':' at all means the event carries no result (e.g. "Session Start").
    col3.append(event_result if colon else np.nan)

In [31]:
# Build the frame column-by-column from a dict so every column is created
# directly from its own list (and keeps its own dtype), rather than stacking
# the lists as rows of a mixed-type frame and transposing.
df = pd.DataFrame({
    'datetime': col1,
    'event_name': col2,
    'event_result': col3,
})

In [32]:
df

Unnamed: 0,datetime,event_name,event_result
0,2013-05-15 14:17:26,Session Start,
1,2013-05-15 14:17:26,Leaving sequence,"loadXML, moving forward."
2,2013-05-15 14:17:30,Player submitted name,Carl
3,2013-05-15 14:17:30,Leaving sequence,"InputNameScreen, moving forward."
4,2013-05-15 14:17:31,Player submitted name,Carl
5,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward."
6,2013-05-15 14:17:50,Player submitted name,Carl
7,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward."
8,2013-05-15 14:17:55,Player submitted name,Carl
9,2013-05-15 14:17:55,Leaving sequence,"slide2b, moving forward."


In [35]:
df.datetime - df.datetime.iloc[0]

0    0 days 00:00:00
1    0 days 00:00:00
2    0 days 00:00:04
3    0 days 00:00:04
4    0 days 00:00:05
5    0 days 00:00:05
6    0 days 00:00:24
7    0 days 00:00:24
8    0 days 00:00:29
9    0 days 00:00:29
10   0 days 00:01:08
11   0 days 00:01:08
12   0 days 00:02:43
13   0 days 00:02:43
14   0 days 00:02:47
15   0 days 00:02:47
Name: datetime, dtype: timedelta64[ns]

### step 7. convert to seconds

In [36]:
# Elapsed time since the first event, in seconds. The vectorized .dt accessor replaces
# a per-element apply(lambda); pd.to_timedelta guarantees a timedelta dtype for .dt.
pd.to_timedelta(df['datetime'] - df['datetime'].iloc[0]).dt.total_seconds()

0       0.0
1       0.0
2       4.0
3       4.0
4       5.0
5       5.0
6      24.0
7      24.0
8      29.0
9      29.0
10     68.0
11     68.0
12    163.0
13    163.0
14    167.0
15    167.0
Name: datetime, dtype: float64

In [37]:
# Store elapsed time since the first event (positional row 0), in seconds.
# The vectorized .dt accessor replaces a per-element apply(lambda);
# pd.to_timedelta guarantees a timedelta dtype so that .dt is available.
df['time_seconds'] = pd.to_timedelta(df['datetime'] - df['datetime'].iloc[0]).dt.total_seconds()

In [38]:
df

Unnamed: 0,datetime,event_name,event_result,time_seconds
0,2013-05-15 14:17:26,Session Start,,0.0
1,2013-05-15 14:17:26,Leaving sequence,"loadXML, moving forward.",0.0
2,2013-05-15 14:17:30,Player submitted name,Carl,4.0
3,2013-05-15 14:17:30,Leaving sequence,"InputNameScreen, moving forward.",4.0
4,2013-05-15 14:17:31,Player submitted name,Carl,5.0
5,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward.",5.0
6,2013-05-15 14:17:50,Player submitted name,Carl,24.0
7,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward.",24.0
8,2013-05-15 14:17:55,Player submitted name,Carl,29.0
9,2013-05-15 14:17:55,Leaving sequence,"slide2b, moving forward.",29.0


## Parsing JSON file

### Step 1. Loading the needed packages

In [39]:
import pandas as pd
import json
from datetime import datetime

### Step 2. Open the structured (JSON) data file and read it into a Python dictionary

In [40]:
# Parse the JSON log into a Python object.
# Note: this rebinds `txt`, which previously held the list of text-log lines.
with open('data_for_session_3/structured_example_log.json') as f:
    txt = json.load(f)

In [48]:
pd.DataFrame(txt.get('data'))

Unnamed: 0,session_time,event_name,event_attribute
0,2013-05-15 14:17:26,Session Start,
1,2013-05-15 14:17:26,Leaving sequence,"loadXML, moving forward."
2,2013-05-15 14:17:30,Player submitted name,Carl
3,2013-05-15 14:17:30,Leaving sequence,"InputNameScreen, moving forward."
4,2013-05-15 14:17:31,Player submitted name,Carl
5,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward."
6,2013-05-15 14:17:50,Player submitted name,Carl
7,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward."
8,2013-05-15 14:17:55,Player submitted name,Carl
9,2013-05-15 14:17:55,Leaving sequence,"slide2b, moving forward."


### Step 3. Convert a list of dictionaries to a data frame

In [None]:
pd.DataFrame(txt.get('data'))

## Parsing XML file

### Step 1. Load needed packages

In [49]:
import xml.etree.ElementTree as et  # package for xml parsing

### Step 2. Specify XML file name

In [50]:
xml_file_name = 'data_for_session_3/structured_example_log.xml'  # note that this xml is structured to comply with ETS' VPA data model

### Step 3. Parsing the xml tree

In [51]:
tree = et.parse(xml_file_name)
root = tree.getroot()

In [52]:
root

<Element 'gameLog' at 0x183c39810>

In [53]:
# check how many children the root element has
len(root)

1

In [54]:
root[0]

<Element 'session' at 0x183c39e50>

In [55]:
# for each child of the root's first child, print its tag, its text, and its number of children
for chd in root[0]:
    print(chd.tag,',',chd.text, ',',len(chd))

sessionID , 7369 , 0
teamID , hao_jiangang , 0
playerID , None , 4
attemptID , 17 , 0
sessionExtData , None , 5
eventSequence , None , 45


In [56]:
# check the playerID child 
root[0][2]

<Element 'playerID' at 0x183cab220>

In [57]:
# for each child of root[0][2], print its tag, its text, and its number of children
for chd in root[0][2]:
    print(chd.tag,',',chd.text, ',',len(chd))

pair , None , 2
pair , None , 2
pair , None , 2
pair , None , 2


In [59]:
# check the child of the child of playerID
root[0][2][0]

<Element 'pair' at 0x183cab270>

In [61]:
# check the eventSequence child
root[0][5]

<Element 'eventSequence' at 0x183cabbd0>

In [62]:
for chd in root[0][5]:
    print(chd.tag,',',chd.text, ',',len(chd))

event , None , 7
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8


In [63]:
# check the first event
root[0][5][0]

<Element 'event' at 0x183cabc20>

In [64]:
for chd in root[0][5][0]:
    print(chd.tag,',',chd.text, ',',len(chd))

eventName , chat , 0
eventStartTime , 2019-11-06T14:18:31Z , 0
eventEndTime , 2019-11-06T14:18:31Z , 0
eventBy , jiangang , 0
eventTo , others , 0
eventResult , hi , 0
eventLocation , slide1-step0 , 0


## The code above shows how to reach each leaf in the XML tree. You can collect the leaf values and put them together into a dataframe.

## Visualization - Plotly express

In [None]:
import plotly.express as px
#import pandas as pd

In [None]:
# load data 
df = px.data.iris()

In [None]:
df

In [None]:
# scatter plot

px.scatter(df, x="sepal_width", y="sepal_length", color="species", size='species_id')

In [None]:
# scatter plot 3D
px.scatter_3d(df, x="sepal_width", y="sepal_length", z="petal_length",color="species", size='species_id')

In [None]:
# distribution
px.histogram(df,x='sepal_length',color='species')

In [None]:
# distribution in facets
px.histogram(df,x='sepal_length',color='species',facet_col='species')

In [None]:
# barplot

px.bar(df, y='sepal_length',color='species',facet_row='species')

In [None]:
# scatter plot with trend line and marginalization

# Inside parentheses a line break needs no backslash; one keyword argument per line.
px.scatter(
    df,
    x="sepal_width",
    y="sepal_length",
    color="species",
    marginal_y="violin",
    marginal_x="box",
    trendline="ols",
)

In [None]:
# scatter matrix

px.scatter_matrix(df, dimensions=["sepal_width", "sepal_length", "petal_width", "petal_length"], color="species")

In [None]:
# animation - using a different dataset

df = px.data.gapminder()

In [None]:
df.head()

In [None]:
px.scatter(df.query("year==2007"), x="gdpPercap", y="lifeExp", size="pop", color="continent",
           hover_name="country", log_x=True, size_max=60)

In [None]:
px.scatter(df, x="gdpPercap", y="lifeExp", size="pop", color="continent",
           hover_name="country", log_x=True, size_max=60,animation_frame='year',range_y=[20,100])


## Ipywidgets for interaction

In [None]:
from ipywidgets import interact, fixed, widgets
from IPython.display import display

In [None]:
# let's use the GDP data again, but this time choose which continent to plot

def continent_gdp(df,conti):
    """Return an animated scatter of GDP per capita vs. life expectancy for one continent.

    Parameters
    ----------
    df : DataFrame providing the columns used below: ``continent``, ``gdpPercap``,
        ``lifeExp``, ``pop``, ``country`` and ``year``.
    conti : value compared against the ``continent`` column to select rows.

    Returns
    -------
    The figure object produced by ``px.scatter``.
    """
    # Keep only the rows of the requested continent (@conti refers to the local argument).
    selected_rows = df.query('continent==@conti')
    return px.scatter(
        selected_rows,
        x="gdpPercap",
        y="lifeExp",
        size="pop",
        color="country",
        hover_name="country",
        log_x=True,
        size_max=60,
        animation_frame='year',
        range_y=[20,100],
    )


In [None]:
continent_list = df.continent.unique().tolist()

In [None]:
continent_list

In [None]:
result = interact(continent_gdp, df=fixed(df), conti=continent_list)

### Some other widgets

In [None]:
w = widgets.IntSlider()
display(w)

In [None]:
w.value

In [None]:
# now make it more complex
w = widgets.FloatSlider(
    value=7.5,
    min=0,
    max=10.0,
    step=0.1,
    description='Test:',
    disabled=False,
    continuous_update=False,
    orientation='vertical',
    readout=True,
    readout_format='.1f',
)

In [None]:
w

In [None]:
w.value

In [None]:
# check box widgets

w = widgets.Checkbox(
    value=False,
    description='Check me',
    disabled=False,
    indent=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# multiple selection
w = widgets.SelectMultiple(
    options=['Apples', 'Oranges', 'Pears','water mellon'],
    value=['Oranges'],
    #rows=10,
    description='Fruits',
    disabled=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# radio buttons

w = widgets.RadioButtons(
    options=['pepperoni', 'pineapple', 'anchovies'],
#    value='pineapple', # Defaults to 'pineapple'
#    layout={'width': 'max-content'}, # If the items' names are long
    description='Pizza topping:',
    disabled=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# now using the radio button for the previous GDP plot

selector = widgets.RadioButtons(
    options=['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'],
    description='Continent:',
    disabled=False
)

In [None]:
result = interact(continent_gdp, df=fixed(df), conti=selector)