# Analiza technologii `sklearn`

## Przygotowanie środowiska

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

## Dane

### Wczytanie danych o zdarzeniach

In [None]:
# Load the raw event log. Because explicit column names are passed, pandas
# treats every line of the file (including the first one) as data.
EVENT_COLUMNS = ['time', 'trigger', 'event']
events = pd.read_csv('data.csv', names=EVENT_COLUMNS)

### Przygotowanie danych do predykcji (tzw. *feature engineering*)

#### Projekcja listy zdarzeń na pary następujących po sobie zdarzeń

In [None]:
# Pair every event with the event that directly follows it in the log.
# An event is identified by "<trigger>_<event>".
event_names = events['trigger'] + '_' + events['event']
data = pd.DataFrame({
  'timestamp': events['time'],
  'first': event_names,
  'second': event_names.shift(-1),
})
# The last event has no successor (shift(-1) leaves NaN there), so drop it.
# .copy() makes `data` an independent frame; assigning new columns to a bare
# slice would otherwise raise pandas' SettingWithCopyWarning later on.
data = data.iloc[:-1].copy()

#### Zamiana zdarzeń na ich hash



In [None]:
import hashlib


def _stable_event_hash(name) -> int:
  """Return a signed 64-bit integer digest of an event name.

  The built-in ``hash()`` is salted per interpreter process for strings, so
  its values change between runs. A fixed digest keeps the feature values
  (and therefore the trained model) reproducible.
  """
  digest = hashlib.blake2b(str(name).encode('utf-8'), digest_size=8).digest()
  return int.from_bytes(digest, byteorder='big', signed=True)


# Numeric identifiers of both events of each pair; the same name always maps
# to the same number in both columns.
data['first_#'] = data['first'].map(_stable_event_hash)
data['second_#'] = data['second'].map(_stable_event_hash)

#### Dodanie informacji o porze dnia

In [None]:
from datetime import datetime

def timestamp_to_seconds_of_day(timestamp: int) -> int:
  """Return the number of whole seconds elapsed since midnight.

  The timestamp is converted with ``datetime.fromtimestamp``, i.e. it is
  interpreted in the local timezone of the machine running the code.
  """
  moment = datetime.fromtimestamp(timestamp)
  return moment.hour * 3600 + moment.minute * 60 + moment.second

def timestamp_to_part_of_day(timestamp: int) -> str:
  """Return the label of the six-hour part of the day containing a timestamp.

  The day is split into four equal parts counted from midnight, so the
  result is one of ``'part_of_day_1'`` .. ``'part_of_day_4'``.
  """
  seconds = timestamp_to_seconds_of_day(timestamp)
  # Integer floor division avoids going through floats for the bucket index.
  part = seconds // (6 * 60 * 60) + 1
  return f'part_of_day_{part}'

# Label every pair with the part of the day in which it happened, then
# one-hot encode that label.
PART_OF_DAY_COLUMNS = [f'part_of_day_{part}' for part in range(1, 5)]

data['part_of_day'] = data['timestamp'].map(timestamp_to_part_of_day)
# get_dummies only creates columns for labels that actually occur, and newer
# pandas versions return booleans. Forcing an integer dtype and reindexing to
# all four columns keeps the frame layout stable (always 0/1 flags, always
# four columns) regardless of the data and of the pandas version.
part_of_day_dummies = pd.get_dummies(data['part_of_day'], dtype=int).reindex(
  columns=PART_OF_DAY_COLUMNS, fill_value=0
)
data = data.join(part_of_day_dummies)

### Końcowa postać danych

In [None]:
# Show the first three rows of the prepared frame as the cell output.
data.head(3)

Unnamed: 0,timestamp,first,second,first_#,second_#,part_of_day,part_of_day_1,part_of_day_2,part_of_day_3,part_of_day_4
0,1589144709,outdoor_gate_1_switch_gate_open,kitchen_light_1_switch_light_on,3288388533445299961,-1955586423453594671,part_of_day_4,0,0,0,1
1,1589144709,kitchen_light_1_switch_light_on,bedroom_2_light_1_switch_light_on,-1955586423453594671,2920417917512579431,part_of_day_4,0,0,0,1
2,1589144709,bedroom_2_light_1_switch_light_on,outdoor_gate_1_switch_gate_close,2920417917512579431,2908511941534193630,part_of_day_4,0,0,0,1


## Predykcje

In [None]:
# Model inputs: the hash of the first event plus the four one-hot
# part-of-day flags.
FEATURES = ['first_#'] + [f'part_of_day_{part}' for part in range(1, 5)]

# Every distinct feature combination seen in the data, renumbered from 0.
feature_rows = data[FEATURES]
unique_events = feature_rows.drop_duplicates().reset_index(drop=True)

In [None]:
# Train a random forest that predicts the hash of the following event from
# the features above. fit() returns the estimator itself, so the fitted model
# can be bound directly; the fixed random_state seeds the forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
  data[FEATURES], data['second_#']
)

In [None]:
 # Class probabilities for every distinct feature combination: one row per
 # row of unique_events, one column per entry of rf.classes_.
 probability = rf.predict_proba(unique_events)

## Interpretacja predykcji

### Połączenie prawdopodobieństwa ze zdarzeniami

In [None]:
# Build one long table with a row for every (feature combination, candidate
# next event) pair:
#   probability  - predicted probability of the candidate (kept numeric),
#   second_#     - hash of the candidate next event, as a string,
#   first_#, part_of_day_1..4 - the feature combination, as strings.
# The table is assembled in one step instead of appending a frame per
# combination: DataFrame.append is quadratic in a loop and no longer exists
# in pandas 2.0.
n_events, n_classes = probability.shape

results = pd.DataFrame({
  # Row-major flattening: all classes of combination 0, then combination 1, ...
  'probability': probability.ravel(),
  'second_#': np.tile([str(c) for c in rf.classes_], n_events),
})
results['first_#'] = np.repeat(
  unique_events['first_#'].astype(str).to_numpy(), n_classes
)
for column in ('part_of_day_1', 'part_of_day_2', 'part_of_day_3', 'part_of_day_4'):
  # Go through int first so that boolean flags are also rendered as '0'/'1'.
  flags = unique_events[column].astype(int).astype(str).to_numpy()
  results[column] = np.repeat(flags, n_classes)

In [None]:
# Order the rows from the most to the least probable transition.
# The probabilities may be stored as strings (np.column_stack with the class
# labels turns them into text), and sorting text is lexicographic - e.g.
# '1e-05' would land above '0.9'. Sort by the numeric value instead; the
# stable sort keeps the existing order among equal probabilities.
numeric_probability = results['probability'].astype(float).to_numpy()
results = results.iloc[np.argsort(-numeric_probability, kind='mergesort')]

### Wybór kandydatów na początki **sekwencji**

In [None]:
# Transitions predicted with a probability above 0.65 become the starting
# pairs of the routines; the frame is also shown as the cell output.
confident_rows = results['probability'].astype(float) > 0.65
top = results.loc[confident_rows]
top

Unnamed: 0,probability,second_#,first_#,part_of_day_1,part_of_day_2,part_of_day_3,part_of_day_4
12622,0.78,7663073136196165317,-7180317460546426553,0,0,0,1
12981,0.75,-8965402975733170289,7219561513345988798,0,0,1,0
12765,0.6639712531327452,-2460439324685945059,7219561513345988798,0,1,0,0


### Konstrukcja sekwencji

In [None]:
# Collected (routine, part_of_day) pairs; filled by the loop further down.
routines = []

def next_event(routine: list, part_of_day: int) -> list:
  """Extend a routine with the most probable follow-up events.

  Starting from the last event of ``routine``, repeatedly looks up in the
  module-level ``results`` frame the most probable next event for the given
  part of the day and appends it. Stops when there is no candidate, when the
  best candidate has a probability below 0.25, or when it is already part of
  the routine (which prevents cycles).

  Args:
    routine: event hashes (strings, as stored in ``results``); extended in
      place.
    part_of_day: number of the part of the day (1-4) selecting the
      ``part_of_day_<n>`` flag column.

  Returns:
    The same ``routine`` list, possibly extended.
  """
  part_column = f'part_of_day_{part_of_day}'
  while True:
    matches = (results['first_#'] == routine[-1]) & (results[part_column] == '1')
    candidates = results[matches]
    if candidates.empty:
      # Nothing is known to follow this event in this part of the day.
      return routine
    # Pick the maximum explicitly (first one on ties) instead of relying on
    # `results` having been sorted beforehand.
    probabilities = candidates['probability'].astype(float).to_numpy()
    candidate = candidates.iloc[probabilities.argmax()]
    if float(candidate['probability']) < 0.25 or candidate['second_#'] in routine:
      return routine
    routine.append(candidate['second_#'])


# Grow a routine from every confident starting pair. The rows of `top` are
# unpacked by position: probability, second_#, first_#, then the four
# part-of-day flags (stored as the strings '0' / '1').
for _, second, first, d1, d2, d3, d4 in top.itertuples(index=False, name=None):
  flags = (d1, d2, d3, d4)
  if '1' not in flags:
    # Without this check part_of_day would be undefined on the first row or
    # silently reuse the value from the previous row.
    raise ValueError(f'No active part-of-day flag in row: {flags}')
  # Position of the first active flag, counted from 1.
  part_of_day = flags.index('1') + 1

  routine = next_event([first, second], part_of_day)
  routines.append((routine, part_of_day))


In [None]:
# Show the collected (routine, part_of_day) pairs as the cell output.
routines

[(['-7180317460546426553', '7663073136196165317'], 4),
 (['7219561513345988798', '-8965402975733170289'], 3),
 (['7219561513345988798', '-2460439324685945059'], 2)]

### Tłumaczenie sekwencji

In [None]:
def event_hash_to_name(routine, part_of_day):
  """Translate the event hashes of a routine back into event names.

  Args:
    routine: iterable of event hashes (anything ``int()`` accepts).
    part_of_day: passed through unchanged.

  Returns:
    A ``(names, part_of_day)`` tuple, where ``names`` is the list of event
    names in the order of ``routine``.

  Raises:
    KeyError: if a hash does not occur in the module-level ``data`` frame.
  """
  # Build the lookup once per call instead of scanning the frame per event.
  # The 'second' columns are included so that an event which only ever occurs
  # as the last event of a pair can still be resolved.
  names_by_hash = dict(zip(data['second_#'], data['second']))
  names_by_hash.update(zip(data['first_#'], data['first']))
  return [names_by_hash[int(event)] for event in routine], part_of_day

# Translate every collected routine into readable event names.
# NOTE: this rebinds `results`, which next_event() reads as the probability
# frame, to a plain list of (names, part_of_day) tuples.
results = [event_hash_to_name(*pair) for pair in routines]

In [None]:
# Print a short report for every routine, numbered from 1, followed by a
# blank line.
for i, (routine, part_of_day) in enumerate(results, start=1):
  event_list = "\n\t\t".join(routine)
  print(
    f'Routine {i}: ',
    f'    Part of the day: {part_of_day}',
    f'    Events:{event_list}',
    '',
    sep='\n',
  )

Routine 1: 
    Part of the day: 4
    Events:living_room_blind_3_switch_blind_down
		bathroom_light_2_switch_light_off

Routine 2: 
    Part of the day: 3
    Events:living_room_blind_2_switch_blind_down
		kitchen_blind_1_switch_blind_down

Routine 3: 
    Part of the day: 2
    Events:living_room_blind_2_switch_blind_down
		general_ac_temp_up

