<center>
    <font size=6>Hamster Matu 24h running-wheel dataset analysis</font>
</center>

Dataset metadata:
* 1 feature: Unix Epoch timestamp [ts] with millisecond [ms] precision
* Each observation = 1 wheel revolution
* Wheel radius = 10 cm

# I/O & EDA

In [1]:
import pandas as pd

#read the raw unix timestamp data into DF
df = pd.read_csv('matu_wheel_log.csv',
                     header=None,
                     names=['ts_unix'])
df.head(3)

Unnamed: 0,ts_unix
0,1636217711256
1,1636217712103
2,1636217713260


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12149 entries, 0 to 12148
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   ts_unix  12149 non-null  int64
dtypes: int64(1)
memory usage: 95.0 KB


* convert unix ts to datetime64 dtype

In [3]:
#convert ts_unix to human readale datetime timestamp ts
df["ts"] = df.ts_unix.astype('datetime64[ms]')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12149 entries, 0 to 12148
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   ts_unix  12149 non-null  int64         
 1   ts       12149 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 190.0 KB


In [4]:
df.head(3)

Unnamed: 0,ts_unix,ts
0,1636217711256,2021-11-06 16:55:11.256
1,1636217712103,2021-11-06 16:55:12.103
2,1636217713260,2021-11-06 16:55:13.260


* create <code>time_s</code> feature - time for one wheel revolution

In [5]:
#find times wheel made a full revolution in seconds
df['time_s'] = df.ts_unix.diff() / 1000

#fill missing values with 0
df['time_s'] = df.time_s.fillna(0)

df.head(2)

Unnamed: 0,ts_unix,ts,time_s
0,1636217711256,2021-11-06 16:55:11.256,0.0
1,1636217712103,2021-11-06 16:55:12.103,0.847


* calculate speed of each wheel revolution

In [6]:
import numpy as np

#knowing the distance of the each revolution is the
#circumference of the wheel C = 2pi r
wheel_radius_m = 0.1
C_m = 2 * np.pi * wheel_radius_m
print("Wheel circumference is",np.round(C_m,2), 'm')

#calculate the speed of each revolution
df['speed'] = C_m /  df['time_s']

#replace infinities with NaN-s
df['speed'] = df.speed.replace(np.inf, np.nan)

#convert NaN-s to 0
df['speed'] = df.speed.fillna(0)

df.head(3)

Wheel circumference is 0.63 m


Unnamed: 0,ts_unix,ts,time_s,speed
0,1636217711256,2021-11-06 16:55:11.256,0.0,0.0
1,1636217712103,2021-11-06 16:55:12.103,0.847,0.741816
2,1636217713260,2021-11-06 16:55:13.260,1.157,0.543058


## Outliers

In [7]:
df['speed'].describe().to_frame()

Unnamed: 0,speed
count,12149.0
mean,0.721567
std,0.361023
min,0.0
25%,0.577499
50%,0.776661
75%,0.892498
max,28.559933


* create a interactive plotting tool to inspect speeds

In [8]:
from bokeh.plotting import figure, show, output_notebook, output_file, save
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool, \
    NumeralTickFormatter, Range1d, LinearAxis, Legend, DatetimeTickFormatter
from bokeh.layouts import column
from bokeh.io import export_png, reset_output
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh import resources

In [9]:
#initialize plotting within jupyter notebook
output_notebook()

In [10]:
#create function to return time_series plot
def time_series(df, x, y, height=400, width=900):
    """Plots time series line plot."""
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title=x + ' vs ' + y,
               x_axis_label=x,
               y_axis_label=y)
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('time', '@ts{%H:%M:%S}')
    ],
                      formatters={
                          '@ts': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y, alpha=0.75, color='navy', source=source)
    
    p.add_tools(hover)
    
    show(p)

In [11]:
#inspect data for outliers and sensor errors/noise
time_series(df=df, x='ts', y='speed')

* Everything before 2021-11-06 19:16:41 is clearly noise, equal to 0
* Everything after 2021-11-07 05:16:26 is clearly noise, equal to 0

In [12]:
#set ts as index
df = df.set_index('ts')

#manually supress noise before 19:15 and after 05:00
df.loc[:"2021-11-06 19:15", 'speed'] = 0
df.loc["2021-11-07 05":,'speed'] = 0

#reset index
df = df.reset_index()

#observe results
time_series(df=df, x='ts', y='speed')

In [13]:
#add some info to time_series function
def time_series(df, x, y, height=400, width=900):
    """Plots time series line plot."""
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title=x + ' vs ' + y,
               x_axis_label=x,
               y_axis_label=y)
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('time', '@ts{%H:%M:%S}'),
        ('index', '$index')
    ],
                      formatters={
                          '@ts': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y, alpha=0.75, color='navy', source=source)
    
    p.add_tools(hover)
    
    show(p)

* Manually remove some obscure data points

In [14]:
time_series(df=df, x='ts', y='speed')

In [15]:
#index values where data is not reliable and can be supressed to 0
supress_index_list = [21, 1596, 3815, 3828, 4483, 4920, 6610, 9079, 11764, 12133]

df.loc[supress_index_list, 'speed'] = 0

time_series(df=df, x='ts', y='speed')#add some info to time_series function
def time_series(df, x, y, height=400, width=900):
    """Plots time series line plot."""
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title=x + ' vs ' + y,
               x_axis_label=x,
               y_axis_label=y)
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('time', '@ts{%H:%M:%S}'),
        ('index', '$index')
    ],
                      formatters={
                          '@ts': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y, alpha=0.75, color='navy', source=source)
    
    p.add_tools(hover)
    
    show(p)

* Calculate acceleration feature <code>acc</code>

In [16]:
#calculate acceleration
df['acc'] = df['speed'].diff() / df['time_s'].diff().abs()

#fill missing values with 0
df['acc'] = df.acc.fillna(0)

df.head(3)

Unnamed: 0,ts,ts_unix,time_s,speed,acc
0,2021-11-06 16:55:11.256,1636217711256,0.0,0.0,0.0
1,2021-11-06 16:55:12.103,1636217712103,0.847,0.0,0.0
2,2021-11-06 16:55:13.260,1636217713260,1.157,0.0,0.0


# Smooth the data

* try different filters to remove some noise and make data more generalized

In [17]:
#define function that draws raw data and smoothened data
def double_time_series(df, x, y1, y2, height=400, width=900):
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title=x + ' vs ' + y1,
               x_axis_label=x,
               y_axis_label=y1)
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('time', '@ts_datetime{%H:%M:%S}')
    ],
                      formatters={
                          '@ts_datetime': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y1, alpha=0.5, color='navy', source=source, legend_label=y1)
    p.line(x=x, y=y2, alpha=0.75, color='red', source=source, line_width=4, legend_label=y2)
    
    p.legend.location = 'top_right'
    p.legend.click_policy = 'hide'
    
    #p.add_tools(hover)
    
    show(p)

## lfilter

In [18]:
from scipy.signal import lfilter

n = 150 #larger = smoother curve
b = [1.0 / n] * n
a = 1

#smoothen speed and acc signal
df['speed_lfilter'] = lfilter(b, a, df['speed'])

double_time_series(df, 'ts', 'speed', 'speed_lfilter')

* Suppress smoothing where speed==0
* Do it manually via index search

In [19]:
time_series(df, 'ts', 'speed')

In [20]:
#manually found index values
index_to_suppress = [3815,3816,3817,5460,5461,5969,5970,9079,9080,11764,11765]
last_index = 12133

#supress smoothened values
df.loc[index_to_suppress, 'speed_lfilter'] = 0
df.loc[last_index:, 'speed_lfilter'] = 0

#plot the results
double_time_series(df, 'ts', 'speed', 'speed_lfilter')

* Plot smoothed speed with acc hover tooltip

In [21]:
#add some info to time_series function
def time_series2(df, x, y, height=400, width=900):
    """Plots time series line plot."""
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title=x + ' vs ' + y,
               x_axis_label=x,
               y_axis_label=y)
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('v', '@'+y),
        ('time', '@ts{%H:%M:%S}'),
        ('a', '@acc'),
        ('index', '$index')
    ],
                      formatters={
                          '@ts': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y, alpha=0.75, color='navy', source=source, legend_label=y, line_width=2)
    
    p.add_tools(hover)
    
    show(p)

In [22]:
time_series2(df, 'ts', 'speed_lfilter')

# Descriptive stats

* Time related

In [23]:
#Total time measurements was run
dt_measurement = df.ts.iloc[-1] - df.ts[0]

#timedelta H,M,S components
dt_m_comp = dt_measurement.components
dt_measure = str(dt_m_comp.hours)+":"+str(dt_m_comp.minutes)+":0"+str(dt_m_comp.seconds)

#total measurement duration
dt_measure
print('Total measurement duration:',dt_measure)

Total measurement duration: 22:49:03


In [24]:
#total time hamster was actually running
test = df[df['speed_lfilter'] > 0]
time_series2(test, 'ts', 'speed_lfilter')

In [25]:
time_series2(df, 'ts', 'speed_lfilter')

In [26]:
#start and stop index values during running
run_edges = [[22,3815], [3817,5460], 
             [5461,5969], [5970, 9079],
             [9080, 11764], [11765, 12133]]

#find each run duration
run_tds = [df.ts.iloc[run[1]] - df.ts.iloc[run[0]] for run in run_edges]

#hamster runninf durations
print("Matu's running durations:")
run_tds

Matu's running durations:


[Timedelta('0 days 01:07:46.789000'),
 Timedelta('0 days 00:35:56.740000'),
 Timedelta('0 days 00:11:20.068000'),
 Timedelta('0 days 01:10:19.541000'),
 Timedelta('0 days 00:58:15.937000'),
 Timedelta('0 days 00:10:25.246000')]

In [27]:
import time

#total time ran
td_total = np.sum(run_tds)

total_run_time_sec = td_total.seconds
total_run_time_str = time.strftime("%H:%M:%S", time.gmtime(td_total.seconds))

#longest continuous run
longest_run_sec = np.max(run_tds).seconds
longest_run_str = time.strftime("%H:%M:%S", time.gmtime(longest_run_sec))

print("Total run time:", total_run_time_str)
print("Longest continuous session:", longest_run_str)

Total run time: 04:14:04
Longest continuous session: 01:10:19


* Distance related

In [28]:
#find total distance via integration
run_dist_smooth = np.trapz(df.speed_lfilter)
print("Total distance covered (Smooth Data): %d"% run_dist_smooth.round(), 'm')

run_dist_raw = np.trapz(df.speed).round()
print("Total distance covered (Raw Data): %d"% run_dist_raw.round(), 'm')

Total distance covered (Smooth Data): 8643 m
Total distance covered (Raw Data): 8692 m


* Speed related

In [29]:
df[df.speed > 0]["speed"].describe()

count    12103.000000
mean         0.718168
std          0.239035
min          0.000083
25%          0.578562
50%          0.777622
75%          0.891232
max          3.831211
Name: speed, dtype: float64

* acceleration

In [30]:
df[df.acc > 0]['acc'].describe()

count    5.916000e+03
mean     8.439863e-01
std      4.468352e-01
min      1.109291e-08
25%      4.882298e-01
50%      8.433400e-01
75%      1.179471e+00
max      5.242278e+00
Name: acc, dtype: float64

In [31]:
from bokeh.models.widgets import DataTable, TableColumn

data = {
    "Total Duration": "22:49:03",
    "Total Run Time": "04:14:04",
    "Longest Run": "01:10:19",
    "Total Distance [m]": "8643",
    "Avg Speed [m/s]": "0.72",
    "Max Speed [m/s]": "3.83",
    "Max Acceleration [m/s^2]": "5.24"
}

i_names = [k for k,w in data.items()]

df_tab = pd.DataFrame(data, index=["Values"])

cols = [TableColumn(field=col_name, title=col_name) for col_name in df_tab.columns]
dtable = DataTable(columns=cols, source=ColumnDataSource(df_tab), width=1000, height=100)

df_tab.transpose()

Unnamed: 0,Values
Total Duration,22:49:03
Total Run Time,04:14:04
Longest Run,01:10:19
Total Distance [m],8643
Avg Speed [m/s],0.72
Max Speed [m/s],3.83
Max Acceleration [m/s^2],5.24


In [32]:
cols = [TableColumn(field=col_name, title=col_name) for col_name in df_tab.columns]
dtable = DataTable(columns=cols, source=ColumnDataSource(df_tab), width=1000, height=100)

show(dtable)

# Create Plots

* Create plot to hover over

In [33]:
#define function that draws raw data and smoothened data
def select_time_series(df, x, y1, y2, height=400, width=900):
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title="Speed vs Time (select)",
               x_axis_label="Time",
               y_axis_label="Speed")
    
    source = ColumnDataSource(df)
    
    p.line(x=x, y=y1, alpha=0.5, color='navy', source=source, legend_label="Speed (raw) [m/s]")
    p.line(x=x, y=y2, alpha=0.75, color='red', source=source, line_width=2, legend_label="Speed (filtered) [m/s]")
    
    p.legend.location = 'top_right'
    p.legend.click_policy = 'hide'
    
    p.title.align = 'center'
    p.title.text_font_size="22pt"
    p.xaxis.axis_label_text_font_size="20pt"
    p.yaxis.axis_label_text_font_size="20pt"
    
    p.xaxis.formatter=DatetimeTickFormatter(hours='%H:%M')
    
    return p

In [34]:
select_time_series(df, 'ts', 'speed', 'speed_lfilter')

In [35]:
#add some info to time_series function
def hover_time_series(df, x, y, height=400, width=900):
    """Plots time series line plot."""
    
    p = figure(height=height,
               width=width,
               x_axis_type='datetime',
               title="Speed vs Time (hover)",
               x_axis_label="Time",
               y_axis_label="Speed")
    
    source = ColumnDataSource(df)
    
    hover = HoverTool(tooltips=[
        ('v', '@'+y),
        ('time', '@ts{%H:%M:%S}'),
        ('a', '@acc')
    ],
                      formatters={
                          '@ts': 'datetime'
                      },
                      mode='vline')
    
    p.line(x=x, y=y, line_width=1.5, 
           alpha=0.75, source=source, 
           legend_label="Speed (filtered) [m/s]",
           color='red')
    
    p.title.align = 'center'
    
    p.title.text_font_size="22pt"
    p.xaxis.axis_label_text_font_size="20pt"
    p.yaxis.axis_label_text_font_size="20pt"
    p.xaxis.formatter=DatetimeTickFormatter(hours='%H:%M')
    
    p.add_tools(hover)
    
    return p

In [37]:
hover_time_series(df, 'ts', 'speed_lfilter')

In [37]:
p_select = select_time_series(df, 'ts', 'speed', 'speed_lfilter')
p_hover = hover_time_series(df, 'ts', 'speed_lfilter')
p_dtable= dtable

model=column(p_select, p_hover)


In [42]:
p1 = figure(height=400,
           width=900,
           x_axis_type='datetime',
           title="Speed vs Time (select)",
           x_axis_label="Time",
           y_axis_label="Speed")

source1 = ColumnDataSource(df)
source2 = ColumnDataSource(df)

p1.line(x='ts', y='speed', alpha=0.5, color='navy', 
       source=source1, legend_label="Speed (raw) [m/s]")
p1.line(x='ts', y='speed_lfilter', alpha=0.75, color='red', 
       source=source2, line_width=2, legend_label="Speed (filtered) [m/s]")

p1.legend.location = 'top_right'
p1.legend.click_policy = 'hide'

p1.title.align = 'center'
p1.title.text_font_size="22pt"
p1.xaxis.axis_label_text_font_size="20pt"
p1.yaxis.axis_label_text_font_size="20pt"

p1.xaxis.formatter=DatetimeTickFormatter(hours='%H:%M')

##########################################################

p2 = figure(height=400,
           width=900,
           x_axis_type='datetime',
           title="Speed vs Time (hover)",
           x_axis_label="Time",
           y_axis_label="Speed")

source3 = ColumnDataSource(df)

hover = HoverTool(tooltips=[
    ('v', '@speed_lfilter'),
    ('time', '@ts{%H:%M:%S}'),
    ('a', '@acc')
],
                  formatters={
                      '@ts': 'datetime'
                  },
                  mode='vline')

p2.line(x='ts', y='speed_lfilter', line_width=1.5, 
       alpha=0.75, source=source3, 
       legend_label="Speed (filtered) [m/s]",
       color='red')

p2.title.align = 'center'
p2.title.text_font_size="22pt"
p2.xaxis.axis_label_text_font_size="20pt"
p2.yaxis.axis_label_text_font_size="20pt"
p2.xaxis.formatter=DatetimeTickFormatter(hours='%H:%M')

p2.add_tools(hover)

dict_list = {
    "Total Duration": "22:49:03",
    "Total Run Time": "04:14:04",
    "Longest Run": "01:10:19",
    "Total Distance [m]": "8643",
    "Avg Speed [m/s]": "0.72",
    "Max Speed [m/s]": "3.83",
    "Max Acceleration [m/s^2]": "5.24"
}

col_names = [k for k,w in data.items()]

df_table = pd.DataFrame(dict_list, index=["Values"])

column_names = [TableColumn(field=col_name, title=col_name) for col_name in df_table.columns]
bokeh_table = DataTable(columns=column_names, source=ColumnDataSource(df_table), 
                        width=900, height=100)

model = column(p1,p2,bokeh_table)

output_file(filename='test1.html', title='TEST')
save(model)

'/home/tonu/Documents/data_science/jupyter_anaconda/projects/matu/test1.html'