# NCME 2023 Data Science Upskilling Workshop - Session 3

# Data Wrangling

For questions, contact: jhao@ets.org


# Parsing unstructured data

### step 1. load packages

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

### step 2. open the file

In [None]:
# Read the raw log file into a list of lines (line endings are kept as-is).
with open('data_for_session_3/unstructured_example_log.txt') as f:
    txt = list(f)

In [None]:
txt

In [None]:
txt[0].strip()

### step 3. clean the lines by stripping off the \n

In [None]:
# Remove leading/trailing whitespace (including the trailing newline) from every line.
txt = list(map(str.strip, txt))

In [None]:
txt[0]

In [None]:
txt[0].split(']')[0].replace('[','')

### step 4. chunking and formatting

In [None]:
txt[0].split(']')[0].replace('[','')

In [None]:
txt[0].split(']')[1]

In [None]:
txt[0].split(']')[1].upper()

### step 5. datetime conversion

In [None]:
s = txt[0].split(']')[0].strip('[')

In [None]:
s

In [None]:
dtfmt ='%m/%d/%Y %I:%M:%S %p'   # %H -> 24 hours, %I-> 12 hours

In [None]:
dt = datetime.strptime(s, dtfmt)

In [None]:
dt

### step 6. convert into a dataframe

In [None]:
col1 = []
col2 = []
col3 = []

In [None]:
# Parse each log line of the form "[<timestamp>] <event name>[:<event result>]"
# into three parallel lists: col1 (datetime), col2 (event name) and
# col3 (event result, NaN when the line has no ':' separator).
for line in txt:
    if not line:
        # A blank line (e.g. a trailing empty line in the file) carries no
        # event and would make strptime raise, so skip it.
        continue
    # Split on the FIRST ']' only so a ']' inside the message is not lost.
    stamp, bracket, rest = line.partition(']')
    col1.append(datetime.strptime(stamp.strip('['), dtfmt))
    # Split on the FIRST ':' only, so a result that itself contains ':' is
    # kept whole instead of being silently replaced by NaN.
    event_name, colon, event_result = rest.strip().partition(':')
    col2.append(event_name)
    col3.append(event_result if colon else np.nan)

In [None]:
# Build the frame column-by-column so each column keeps its own dtype
# (datetime64 for 'datetime'). Stacking the lists as rows and transposing
# would leave every column as generic object dtype.
df = pd.DataFrame({
    'datetime': col1,
    'event_name': col2,
    'event_result': col3,
})

In [None]:
df

In [None]:
df.datetime - df.datetime.iloc[0]

### step 7. convert to seconds

In [None]:
(df.datetime-df.datetime.loc[0]).apply(lambda x: x.total_seconds())

In [None]:
# Seconds elapsed since the first logged event, stored as a float column.
# .iloc[0] picks the first row by position (matching the cell above) rather
# than by index label; to_timedelta makes sure the difference has a timedelta
# dtype so the vectorised .dt accessor can be used instead of a per-row lambda.
elapsed = df.datetime - df.datetime.iloc[0]
df['time_seconds'] = pd.to_timedelta(elapsed).dt.total_seconds()

In [None]:
df

## Parsing JSON file

### Step 1. Loading the needed packages

In [None]:
import pandas as pd
import json
from datetime import datetime

### Step 2. Open the structured JSON data file and read it into a Python dictionary

In [None]:
# Read the JSON log and decode it into Python objects.
with open('data_for_session_3/structured_example_log.json') as f:
    txt = json.loads(f.read())

In [None]:
pd.DataFrame(txt.get('data'))

### Step 3. Convert a list of dictionaries to a data frame

In [None]:
pd.DataFrame(txt.get('data'))

## Parsing XML file

### Step 1. Load needed packages

In [None]:
import xml.etree.ElementTree as et  # package for xml parsing

### Step 2. Specify XML file name

In [None]:
xml_file_name = 'data_for_session_3/structured_example_log.xml'  # note that this xml is structured to comply with ETS' VPA data model

### Step 3. Parsing the xml tree

In [None]:
# Parse the XML file into an ElementTree object ...
tree = et.parse(xml_file_name)
# ... and grab its root element; children are reached by indexing (root[0], root[0][2], ...).
root = tree.getroot()

In [None]:
root

In [None]:
# check how many children the root has
len(root)

In [None]:
root[0]

In [None]:
# list each child of root[0] together with its text and its number of children
for chd in root[0]:
    print(chd.tag,',',chd.text, ',',len(chd))

In [None]:
# check the playerID child 
root[0][2]

In [None]:
# check the child of playerID
for chd in root[0][2]:
    print(chd.tag,',',chd.text, ',',len(chd))

In [None]:
# check the child of the child of playerID
root[0][2][0]

In [None]:
# check the eventSequence child
root[0][5]

In [None]:
for chd in root[0][5]:
    print(chd.tag,',',chd.text, ',',len(chd))

In [None]:
# check the first event
root[0][5][0]

In [None]:
for chd in root[0][5][0]:
    print(chd.tag,',',chd.text, ',',len(chd))

## The code above shows how to reach each leaf in the XML tree. You can then put the leaves together into a dataframe.

## Visualization - Plotly express

In [None]:
import plotly.express as px
#import pandas as pd

In [None]:
# load data 
df = px.data.iris()

In [None]:
df

In [None]:
# scatter plot

px.scatter(df, x="sepal_width", y="sepal_length", color="species", size='species_id')

In [None]:
# scatter plot 3D
px.scatter_3d(df, x="sepal_width", y="sepal_length", z="petal_length",color="species", size='species_id')

In [None]:
# distribution
px.histogram(df,x='sepal_length',color='species')

In [None]:
# distribution in facets
px.histogram(df,x='sepal_length',color='species',facet_col='species')

In [None]:
# barplot

px.bar(df, y='sepal_length',color='species',facet_row='species')

In [None]:
# scatter plot with trend line and marginalization

# Scatter plot coloured by species, with a violin plot on the y margin,
# a box plot on the x margin, and an OLS trend line.
px.scatter(
    df,
    x="sepal_width",
    y="sepal_length",
    color="species",
    marginal_y="violin",
    marginal_x="box",
    trendline="ols",
)

In [None]:
# scatter matrix

px.scatter_matrix(df, dimensions=["sepal_width", "sepal_length", "petal_width", "petal_length"], color="species")

In [None]:
# animation - using a different dataset

df = px.data.gapminder()

In [None]:
df.head()

In [None]:
px.scatter(df.query("year==2007"), x="gdpPercap", y="lifeExp", size="pop", color="continent",
           hover_name="country", log_x=True, size_max=60)

In [None]:
px.scatter(df, x="gdpPercap", y="lifeExp", size="pop", color="continent",
           hover_name="country", log_x=True, size_max=60,animation_frame='year',range_y=[20,100])


## Ipywidgets for interaction

In [None]:
from ipywidgets import interact, fixed, widgets
from IPython.display import display

In [None]:
# let's use the GDP data, but this time you choose the continent

def continent_gdp(df, conti):
    """Return an animated GDP-per-capita vs. life-expectancy bubble chart.

    df    -- data frame providing the columns used below
             (continent, gdpPercap, lifeExp, pop, country, year)
    conti -- continent value used to filter the rows (``continent==@conti``)
    """
    # Keep only the rows belonging to the requested continent.
    subset = df.query('continent==@conti')
    plot_options = dict(
        x="gdpPercap",
        y="lifeExp",
        size="pop",
        color="country",
        hover_name="country",
        log_x=True,
        size_max=60,
        animation_frame='year',  # one animation frame per year
        range_y=[20, 100],       # fixed y range across frames
    )
    return px.scatter(subset, **plot_options)


In [None]:
continent_list = df.continent.unique().tolist()

In [None]:
continent_list

In [None]:
result = interact(continent_gdp, df=fixed(df), conti=continent_list)

### Some other widgets

In [None]:
w = widgets.IntSlider()
display(w)

In [None]:
w.value

In [None]:
# now make it more complex
w = widgets.FloatSlider(
    value=7.5,
    min=0,
    max=10.0,
    step=0.1,
    description='Test:',
    disabled=False,
    continuous_update=False,
    orientation='vertical',
    readout=True,
    readout_format='.1f',
)

In [None]:
w

In [None]:
w.value

In [None]:
# check box widgets

w = widgets.Checkbox(
    value=False,
    description='Check me',
    disabled=False,
    indent=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# multiple selection
w = widgets.SelectMultiple(
    options=['Apples', 'Oranges', 'Pears','water mellon'],
    value=['Oranges'],
    #rows=10,
    description='Fruits',
    disabled=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# radio buttons

# Radio-button group. With `value` left commented out below, the initial
# selection is presumably the first option ('pepperoni') -- confirm against
# the ipywidgets documentation.
w = widgets.RadioButtons(
    options=['pepperoni', 'pineapple', 'anchovies'],
#    value='pineapple',  # uncomment to pre-select 'pineapple'
#    layout={'width': 'max-content'},  # uncomment if the items' names are long
    description='Pizza topping:',
    disabled=False
)

In [None]:
w

In [None]:
w.value

In [None]:
# now using the radio button for the previous GDP plot

# Radio buttons for picking the continent. The options come from the
# continent_list computed from the data earlier (the same list the first
# interact() call uses), so the choices cannot drift from the dataset.
selector = widgets.RadioButtons(
    options=continent_list,
    description='Continent:',
    disabled=False
)

In [None]:
result = interact(continent_gdp, df=fixed(df), conti=selector)