# Setup

In [1]:
!pip install plotly scikit-learn imbalanced-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

In [4]:
!jupyter labextension install jupyterlab-plotly

[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a future major version of JupyterLab.

Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages [0m


In [5]:
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from ipywidgets import interactive, HBox, VBox, Dropdown, Button, Output
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from IPython.display import display, clear_output

In [6]:
# Initialize Plotly for Jupyter
py.init_notebook_mode(connected=True)

# Initialize Spark Session
spark = SparkSession.builder.appName("InteractiveHeatmapAnalysis").getOrCreate()

24/07/16 10:04:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Import data

In [22]:
# Get the current timestamp and calculate the timestamp for N minutes ago
current_time = datetime.now()
delta = timedelta(minutes=15)
t_min = current_time - delta
t_min_formatted = t_min.strftime('%Y-%m-%d %H:%M:%S')

# Query the last five minutes of data
query = f"""
SELECT timestamp, sensor_name, distance, value, measurement
FROM demo.silver.dfo
WHERE timestamp >= '{t_min_formatted}' AND measurement = 'RMS0'
ORDER BY timestamp, sensor_name
"""

df = spark.sql(query)

In [23]:
# Convert to Pandas DataFrame
pdf = df.toPandas()

# Ensure correct data types
pdf['timestamp'] = pd.to_datetime(pdf['timestamp'])
pdf['distance'] = pd.to_numeric(pdf['distance'])
pdf['value'] = pd.to_numeric(pdf['value'])

# Group by timestamp and distance, and aggregate the values
grouped_df = pdf.groupby(['timestamp', 'distance'])['value'].mean().reset_index()

# Create a pivot table from the grouped data
pivot_df = grouped_df.pivot(index='distance', columns='timestamp', values='value')
pivot_df = pivot_df.fillna(0)

# Label data

In [24]:
print("Instructions:")
print("1. Use the box select tool to select areas on the heatmap.")
print("2. The coordinates of your selections will appear in the table below the heatmap.")
print("3. You can make multiple selections, and all will be shown in the table.")

# Create a list of full datetime objects for x-axis
x_datetimes = pivot_df.columns.tolist()
x_labels = [t.strftime('%Y-%m-%d %H:%M:%S') for t in x_datetimes]

# Function to map index to full datetime
def index_to_datetime(index):
    index = max(0, min(int(index), len(x_datetimes) - 1))
    return x_datetimes[index]

heatmap = go.Heatmap(
    z=pivot_df.values,
    x=[t.strftime('%H:%M:%S') for t in pivot_df.columns],
    y=pivot_df.index,
    colorscale='Viridis'
)

# Create the main figure
f = go.FigureWidget(data=[
    go.Scatter(y=[None]),
    heatmap
], layout=go.Layout(dragmode='select'))
f.update_layout(
    title=f"Sensor Measurements Heatmap (Last {delta.total_seconds()/60} Minutes)",
    xaxis_title='Timestamp',
    yaxis_title='Distance',
    height=600,
    width=1000,
    dragmode='select',
    yaxis_autorange="reversed"
)

# Create a table to display selected coordinates
t = go.FigureWidget([go.Table(
    header=dict(values=['Selection', 'Start Time', 'End Time', 'Start Distance', 'End Distance'],
                fill=dict(color='#C2D4FF'),
                align=['left'] * 5),
    cells=dict(values=[[], [], [], [], []],
               fill=dict(color='#F5F8FF'),
               align=['left'] * 5))])

out = Output()

# List to store all selections
selections = []

# Function to handle selection
@out.capture(clear_output=True)
def selection_fn(trace, points, selector):
    x_start = index_to_datetime(selector.xrange[0])
    x_end = index_to_datetime(selector.xrange[1])
    y_start = selector.yrange[0]
    y_end = selector.yrange[1]
    
    selection_num = len(selections) + 1
    selections.append([f"Selection {selection_num}", 
                       x_start.strftime('%Y-%m-%d %H:%M:%S'), 
                       x_end.strftime('%Y-%m-%d %H:%M:%S'), 
                       y_start, 
                       y_end])
    
    # Update the table with all selections
    t.data[0].cells.values = [
        [s[0] for s in selections],  # Selection number
        [s[1] for s in selections],  # Start Time
        [s[2] for s in selections],  # End Time
        [s[3] for s in selections],  # Start Distance
        [s[4] for s in selections]   # End Distance
    ]
    
    print(f"Selection {selection_num} made: Time [{x_start.strftime('%Y-%m-%d %H:%M:%S')} to {x_end.strftime('%Y-%m-%d %H:%M:%S')}], Distance [{y_start:.2f} to {y_end:.2f}]")



# Connect the selection function to the heatmap
f.data[1].on_selection(selection_fn)

# Display the heatmap and the table
display(VBox([f, t, out]))

Instructions:
1. Use the box select tool to select areas on the heatmap.
2. The coordinates of your selections will appear in the table below the heatmap.
3. You can make multiple selections, and all will be shown in the table.


VBox(children=(FigureWidget({
    'data': [{'type': 'scatter', 'uid': '60477414-d300-495a-b6f1-12c410343e62', …

# Train ML model

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import numpy as np

In [26]:
# Step 1: Create a new column called 'label' based on the selections
def is_inside_selection(row, selections):
    for selection in selections:
        start_time = datetime.strptime(selection[1], '%Y-%m-%d %H:%M:%S')
        end_time = datetime.strptime(selection[2], '%Y-%m-%d %H:%M:%S')
        start_distance = selection[3]
        end_distance = selection[4]
        
        if (start_time <= row['timestamp'] <= end_time) and (start_distance <= row['distance'] <= end_distance):
            return 1
    return 0

pdf['label'] = pdf.apply(lambda row: is_inside_selection(row, selections), axis=1)

# Step 2: Prepare features (measurement and value) for the Random Forest model
# Encode the 'measurement' column
le = LabelEncoder()
pdf['measurement_encoded'] = le.fit_transform(pdf['measurement'])

X = pdf[['measurement_encoded', 'value']].values
y = pdf['label'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [27]:
# Train the Random Forest Classifier on the balanced data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

In [28]:
# Step 3: Generate predictions
pdf['prediction'] = clf.predict(pdf[['measurement_encoded', 'value']])

# Create a pivot table for the predictions
pred_pivot = pdf.pivot(index='distance', columns='timestamp', values='prediction')


X has feature names, but RandomForestClassifier was fitted without feature names



In [29]:
# Create a new heatmap for the predictions
pred_heatmap = go.Heatmap(
    z=pred_pivot.values,
    x=[t.strftime('%H:%M:%S') for t in pred_pivot.columns],
    y=pred_pivot.index,
    colorscale='Viridis'
)

# Create a new figure for the prediction heatmap
pred_fig = go.FigureWidget(data=[pred_heatmap])
pred_fig.update_layout(
    title='Model Predictions Heatmap',
    xaxis_title='Timestamp',
    yaxis_title='Distance',
    height=600,
    width=1000,
    yaxis_autorange="reversed"
)

# Display the prediction heatmap
display(pred_fig)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, clf.predict(X_test)))

# Print feature importance
feature_importance = clf.feature_importances_
print("\nFeature Importance:")
print(f"Measurement: {feature_importance[0]}")
print(f"Value: {feature_importance[1]}")

print("\nInstructions:")
print("1. The heatmap above shows the model's predictions based on measurement and sensor values.")
print("2. Areas predicted as inside a selection are shown in yellow/green.")
print("3. Areas predicted as outside selections are shown in dark blue.")
print("4. The classification report above provides metrics on the model's performance.")
print("5. The feature importance shows how much the model relies on each feature for predictions.")

FigureWidget({
    'data': [{'colorscale': [[0.0, '#440154'], [0.1111111111111111, '#482878'],
                             [0.2222222222222222, '#3e4989'], [0.3333333333333333,
                             '#31688e'], [0.4444444444444444, '#26828e'],
                             [0.5555555555555556, '#1f9e89'], [0.6666666666666666,
                             '#35b779'], [0.7777777777777778, '#6ece58'],
                             [0.8888888888888888, '#b5de2b'], [1.0, '#fde725']],
              'type': 'heatmap',
              'uid': 'a4133acd-0a10-4a25-bbcb-a4f49b1c6ecc',
              'x': [09:53:01, 09:53:02, 09:53:03, ..., 10:07:57, 10:07:58,
                    10:07:59],
              'y': array([  0,   1,   2, ..., 393, 394, 395]),
              'z': array([[0, 0, 0, ..., 0, 0, 0],
                          [0, 0, 0, ..., 0, 0, 0],
                          [0, 0, 0, ..., 0, 0, 0],
                          ...,
                          [0, 0, 0, ..., 0, 0, 0],
            

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95     70115
           1       0.12      0.88      0.21      1086

    accuracy                           0.90     71201
   macro avg       0.56      0.89      0.58     71201
weighted avg       0.98      0.90      0.94     71201


Feature Importance:
Measurement: 0.0
Value: 1.0

Instructions:
1. The heatmap above shows the model's predictions based on measurement and sensor values.
2. Areas predicted as inside a selection are shown in yellow/green.
3. Areas predicted as outside selections are shown in dark blue.
4. The classification report above provides metrics on the model's performance.
5. The feature importance shows how much the model relies on each feature for predictions.
