In [170]:
import pandas as pd
import numpy as np
from plotly import graph_objs as go
from plotly.offline import plot
from uncertainties import ufloat
import datetime

# How does our lab collect data?

Here is a small Python project that I thought of: are there trends in the rate of data collection in our lab at the CfA? Qualitatively, it has always felt that when visitors come, several come at once, and one would expect this to be reflected in the number of scans produced in a short period of time.

Another question I'd like to ask is: how long do we typically accumulate data for? This is reflected in the number of "shots", i.e. the number of accumulations, typically taken at a repetition rate of 5 Hz.

Finally, what are the most common frequencies the spectrometers are tuned to?

I have to state that I'm not sure what I'll find - this is mainly an exercise in Python (Pandas/Plotly).

In [28]:
# Load the scan tables for the two spectrometers from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
ft1_df, ft2_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("../data/FTM1_scans.pkl", "../data/FTM2_scans.pkl")
)

In [29]:
# Parse the "date" column of both tables into pandas datetimes.
# astype("datetime64") uses a unit-less dtype, which pandas >= 2.0 rejects;
# pd.to_datetime produces a proper datetime64 column and is safe to re-run.
for df in [ft1_df, ft2_df]:
    df["date"] = pd.to_datetime(df["date"])

Here are some simple statistics behind the data collection. I'll be using FT1 and excluding the last row (which is 2019).

In [140]:
# Bucket the FT1 scans by the calendar year of their timestamp
scan_years = ft1_df["date"].dt.year
yearly = ft1_df.groupby(by=[scan_years])

Average number of scans per year

In [153]:
# Mean and spread (population std) of the number of FT1 scans per year.
# The per-year counts are computed once instead of running describe() twice;
# iloc[:-1] drops the last group (the 2019 row).
scans_per_year = yearly["shots"].count().iloc[:-1]
scans = ufloat(np.average(scans_per_year), np.std(scans_per_year))

In [154]:
# Show the yearly scan count as mean +/- standard deviation
scans

365026.8+/-145446.1955149051

In [155]:
# Mean and spread (population std) of the per-year average shots per scan.
# The yearly means are computed once instead of running describe() twice;
# iloc[:-1] drops the last group (the 2019 row).
mean_shots_by_year = yearly["shots"].mean().iloc[:-1]
shots = ufloat(np.average(mean_shots_by_year), np.std(mean_shots_by_year))

In [156]:
# Show the average shots per scan as mean +/- standard deviation
shots

50.173264275254574+/-7.931621268027359

Convert this to time spent per year in days

In [161]:
# (shots per scan / 5 shots per second) * scans per year -> seconds per year,
# then seconds -> minutes -> hours -> days.
seconds_per_year = (shots / 5.) * scans
seconds_per_year / 60. / 60. / 24.

42.394875240626156+/-18.17330959504969

What's the actual number of shots in a year?

In [166]:
# Mean and spread (population std) of the total number of shots fired per year.
# Only the "shots" column is summed: yearly.sum() would also try to sum every
# other column, including the datetime "date" column, which raises on
# pandas >= 2.0. The totals are also computed once rather than twice.
# iloc[:-1] drops the last group (the 2019 row).
yearly_shot_totals = yearly["shots"].sum().iloc[:-1]
actual_shots = ufloat(np.average(yearly_shot_totals), np.std(yearly_shot_totals))

In [167]:
# Show the total shots per year as mean +/- standard deviation
actual_shots

17232749.4+/-5560414.872954917

In [169]:
# shots / 5 shots per second -> seconds, / 60 -> minutes of acquisition per year,
# then minutes -> hours -> days.
minutes_per_year = actual_shots / 5. / 60.
minutes_per_year / 60. / 24.

39.89062361111111+/-12.871330724432676

So the experiments are actually taking data for only about 40 days a year in total (42 days by the cruder mean-based estimate above). Of course, this doesn't reflect reality: you spend most of the time trying to make the experiment work the way you want it to. I'm also curious how this compares with other labs...

In [45]:
def _count_per_day(scan_df):
    """Count the rows of ``scan_df`` for each (year, month, day) of its "date" column."""
    stamps = scan_df["date"]
    day_keys = [stamps.dt.year, stamps.dt.month, stamps.dt.day]
    return scan_df.groupby(day_keys).count()


# Daily row counts for FT1 and FT2, in that order
grouped_dfs = [_count_per_day(table) for table in (ft1_df, ft2_df)]

In [46]:
# Add a running total of the daily "id" counts to each grouped table
for df in grouped_dfs:
    running_total = df["id"].cumsum()
    df["cumulative"] = running_total

In [48]:
# Collapse the (year, month, day) index tuples into a single datetime index
# so the tables can be plotted against time.
# pd.datetime was a deprecated alias that pandas 2.0 removed; the standard
# library class it pointed to is used directly instead.
flattened_dfs = [
    df.set_index(df.index.map(lambda ymd: datetime.datetime(*ymd))) for df in grouped_dfs
]

In [179]:
# Cumulative number of scans over time for both spectrometers,
# with a marker bar on 17 June of each year labelled "ISMS".
y_axis = {"title": "Number of scans"}
x_axis = {"title": "Time"}
legend_position = {"x": 0.1, "y": 0.95}

layout = {
    "height": 600.,
    "yaxis": y_axis,
    "xaxis": x_axis,
    "title": "How we collect data",
    "showlegend": True,
    "legend": legend_position,
}

fig = go.FigureWidget(layout=layout)

# One WebGL scatter trace per instrument
traces = []
for scan_df, label in zip(flattened_dfs, ["FT1", "FT2"]):
    traces.append(
        fig.add_scattergl(x=scan_df.index, y=scan_df["cumulative"], name=label)
    )

isms_years = range(2014, 2019)
isms_times = [datetime.datetime(year=year, month=6, day=17) for year in isms_years]

# Bars of fixed height and width; hovering shows only the trace name
fig.add_bar(
    x=isms_times,
    y=[2e6] * len(isms_times),
    width=2e6,
    hoverinfo="name",
    name="ISMS",
)

fig

FigureWidget({
    'data': [{'name': 'FT1',
              'type': 'scattergl',
              'uid': 'aafbe74d-…

In [180]:
# Render the current figure as an HTML <div> string (plotly.js not bundled,
# no "edit chart" link) and print it so it can be pasted elsewhere.
cumulative_div = plot(
    fig,
    show_link=False,
    link_text="",
    output_type="div",
    include_plotlyjs=False,
)
print(cumulative_div)

<div><div id="ed88ed28-c295-4182-9c04-629162ca1a83" style="height: 600.0px; width: 100%;" class="plotly-graph-div"></div><script type="text/javascript">window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL="https://plot.ly";Plotly.newPlot("ed88ed28-c295-4182-9c04-629162ca1a83", [{"name": "FT1", "x": ["2014-07-08 04:00:00", "2014-07-09 04:00:00", "2014-07-10 04:00:00", "2014-07-11 04:00:00", "2014-07-13 04:00:00", "2014-07-14 04:00:00", "2014-07-15 04:00:00", "2014-07-16 04:00:00", "2014-07-17 04:00:00", "2014-07-18 04:00:00", "2014-07-21 04:00:00", "2014-07-22 04:00:00", "2014-07-23 04:00:00", "2014-07-25 04:00:00", "2014-07-28 04:00:00", "2014-07-29 04:00:00", "2014-07-30 04:00:00", "2014-07-31 04:00:00", "2014-08-01 04:00:00", "2014-08-05 04:00:00", "2014-08-06 04:00:00", "2014-08-07 04:00:00", "2014-08-11 04:00:00", "2014-08-12 04:00:00", "2014-08-13 04:00:00", "2014-08-14 04:00:00", "2014-08-15 04:00:00", "2014-08-16 04:00:00", "2014-08-18 04:00:00", "2014-08-19 04:00:0

In [88]:
# Histogram the "shots" column of each table into coarse, unevenly spaced bins
shot_bin_edges = [10, 50, 200, 500, 1000, 2000, 5000, 10000]
shot_histo = [
    np.histogram(table["shots"], bins=shot_bin_edges) for table in (ft1_df, ft2_df)
]

In [91]:
# Log-log view of how many scans fall into each shot-count bin.
fig = go.FigureWidget()
fig.layout["xaxis"]["type"] = "log"
fig.layout["yaxis"]["type"] = "log"
# Label the axes so the figure can be read on its own
fig.layout["xaxis"]["title"] = "Shots per scan (lower bin edge)"
fig.layout["yaxis"]["title"] = "Number of scans"

for histo, name in zip(shot_histo, ["FT1", "FT2"]):
    counts, edges = histo
    # np.histogram returns one more edge than counts. Pair each count with the
    # lower edge of its bin so x and y have the same length, rather than
    # passing mismatched arrays to the trace.
    fig.add_scatter(x=edges[:-1], y=counts, name=name)

fig

FigureWidget({
    'data': [{'name': 'FT1',
              'type': 'scatter',
              'uid': '2f219a75-09…

In [81]:
# Histogram the "cavity" column of each table on 100 evenly spaced edges
# between 7000 and 40000
freq_bin_edges = np.linspace(7000., 40000., 100)
freq_histo = [
    np.histogram(table["cavity"], bins=freq_bin_edges) for table in (ft1_df, ft2_df)
]

In [97]:
# Bar chart of how often each cavity-frequency bin was used.
fig = go.FigureWidget()

# "," is the d3-format flag for thousands separators; the previous ".," is not
# a valid specifier (the comma must precede the precision, and the precision
# needs digits).
fig.layout["xaxis"]["tickformat"] = ","
fig.layout["xaxis"]["title"] = "Frequency (MHz)"
fig.layout["yaxis"]["title"] = "Counts"
fig.layout["title"] = "What are the most common frequencies?"

for histo, name in zip(freq_histo, ["FT1", "FT2"]):
    counts, edges = histo
    # np.histogram returns one more edge than counts. Plot each bar at the
    # centre of its bin so x and y have equal length and the bars are not
    # shifted half a bin to the left.
    centers = 0.5 * (edges[:-1] + edges[1:])
    fig.add_bar(x=centers, y=counts, name=name)

fig

FigureWidget({
    'data': [{'name': 'FT1',
              'type': 'bar',
              'uid': '3768e0fc-7ec2-4…

In [100]:
# Render the current figure as an HTML <div> string (plotly.js not bundled,
# no "edit chart" link) and print it so it can be pasted elsewhere.
# NOTE(review): the saved output below this cell appears to contain date-valued
# x data rather than frequencies - it looks stale; re-run after the cell above.
export_options = {
    "show_link": False,
    "link_text": "",
    "output_type": "div",
    "include_plotlyjs": False,
}
frequency_div = plot(fig, **export_options)
print(frequency_div)

<div><div id="b01d543c-a117-41d1-943d-7fdd17531616" style="height: 600.0px; width: 100%;" class="plotly-graph-div"></div><script type="text/javascript">window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL="https://plot.ly";Plotly.newPlot("b01d543c-a117-41d1-943d-7fdd17531616", [{"name": "FT1", "x": ["2014-07-08 04:00:00", "2014-07-09 04:00:00", "2014-07-10 04:00:00", "2014-07-11 04:00:00", "2014-07-13 04:00:00", "2014-07-14 04:00:00", "2014-07-15 04:00:00", "2014-07-16 04:00:00", "2014-07-17 04:00:00", "2014-07-18 04:00:00", "2014-07-21 04:00:00", "2014-07-22 04:00:00", "2014-07-23 04:00:00", "2014-07-25 04:00:00", "2014-07-28 04:00:00", "2014-07-29 04:00:00", "2014-07-30 04:00:00", "2014-07-31 04:00:00", "2014-08-01 04:00:00", "2014-08-05 04:00:00", "2014-08-06 04:00:00", "2014-08-07 04:00:00", "2014-08-11 04:00:00", "2014-08-12 04:00:00", "2014-08-13 04:00:00", "2014-08-14 04:00:00", "2014-08-15 04:00:00", "2014-08-16 04:00:00", "2014-08-18 04:00:00", "2014-08-19 04:00:0