# Naive approaches to forecasting as a baseline

### Importing packages that'll be used

In [24]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import statistics
from sklearn.metrics import mean_squared_error

### Reading the data as series objects from the generated csv files

In [23]:
# Directory (relative to this notebook) that holds the generated telemetry CSVs.
basedir = '../../../tlm-data/train/generated/'

# Collect the names of all regular files whose name ends in '.csv'.
# os.scandir yields entries in arbitrary order, so the names are sorted to
# keep the order of `data_files` (and everything indexed by it) reproducible.
with os.scandir(basedir) as entries:
    data_files = sorted(
        entry.name
        for entry in entries
        if entry.is_file() and entry.name.endswith('.csv')
    )

print(*data_files, sep='\n')
print(f'\nThere are {len(data_files)} csv files with data')

Battery1_telemetryOutputData_2020_02_05_15_48_33.csv
Battery2_telemetryOutputData_2020_02_05_15_48_33.csv
CmdRcv_telemetryOutputData_2020_02_05_15_48_33.csv
RSSI1_telemetryOutputData_2020_02_05_15_48_33.csv
RSSI2_telemetryOutputData_2020_02_05_15_48_33.csv
TlmTx_telemetryOutputData_2020_02_05_15_48_33.csv

There are 6 csv files with data


In [4]:
def read_to_series(files=None, directory=None):
    """Load CSV files into a list of pandas Series named 'Values'.

    Every file is read header-less, with the first column used as a
    date-parsed index and the second column as the values.  These are the
    defaults of ``pd.Series.from_csv``, which was deprecated and later removed
    from pandas, so ``pd.read_csv`` is used instead.

    Parameters
    ----------
    files : iterable of str, optional
        File names to read.  Defaults to the module-level ``data_files``.
    directory : str, optional
        Directory containing the files.  Defaults to the module-level
        ``basedir``.

    Returns
    -------
    list of pandas.Series
        One Series per file, in the order of ``files``.

    Side effects
    ------------
    Prints each file name followed by the head of its Series.
    """
    if files is None:
        files = data_files
    if directory is None:
        directory = basedir

    series = []
    for file_name in files:
        frame = pd.read_csv(
            os.path.join(directory, file_name),
            header=None,
            index_col=0,
            parse_dates=True,
        )
        # First remaining column holds the values; drop the positional
        # index/column labels that header=None leaves behind.
        s = frame.iloc[:, 0]
        s.index.name = None
        s.name = 'Values'
        series.append(s)
        print(file_name)
        print(s.head())
        print()
    return series

In [5]:
# Load every discovered CSV file into a Series (one list entry per file).
series = read_to_series()

  infer_datetime_format=infer_datetime_format)


Battery1_telemetryOutputData_2020_02_05_15_48_33.csv
2020-02-05 07:48:33.520994    12.5
2020-02-05 07:48:33.622164    11.2
2020-02-05 07:48:33.722767    11.3
2020-02-05 07:48:33.823252    11.7
2020-02-05 07:48:33.923965    11.1
Name: Values, dtype: float64

Battery2_telemetryOutputData_2020_02_05_15_48_33.csv
2020-02-05 07:48:33.540301    14.9
2020-02-05 07:48:33.641321    15.8
2020-02-05 07:48:33.742324    16.1
2020-02-05 07:48:33.843091    10.6
2020-02-05 07:48:33.943734    10.3
Name: Values, dtype: float64

CmdRcv_telemetryOutputData_2020_02_05_15_48_33.csv
2020-02-05 07:48:33.563606    10181.8
2020-02-05 07:48:33.665165    10402.4
2020-02-05 07:48:33.766077     9623.3
2020-02-05 07:48:33.867295     9602.8
2020-02-05 07:48:33.968509    10170.6
Name: Values, dtype: float64

RSSI1_telemetryOutputData_2020_02_05_15_48_33.csv
2020-02-05 07:48:33.506520   -101.7
2020-02-05 07:48:33.607553   -109.9
2020-02-05 07:48:33.708445   -106.7
2020-02-05 07:48:33.809109   -100.7
2020-02-05 07:48:33

In [6]:
def to_data_frames():
    """Convert each Series in the module-level ``series`` list to a DataFrame.

    Returns
    -------
    list of pandas.DataFrame
        One single-column frame per Series, in the same order as ``series``.
    """
    return [single.to_frame() for single in series]

In [7]:
# dfs is shorthand for "data frames": one single-column DataFrame per series,
# so that forecast columns can be stored next to the raw values.
dfs = to_data_frames()

### Very naive forecast: using the previous known value to predict the next

In [29]:
def prev_predict(series):
    """Persistence forecast: predict each value with the one observed before it.

    The values are converted to a NumPy array up front, so the function works
    on positions regardless of the Series index (integer keys on a
    datetime-indexed Series rely on a deprecated positional fallback).

    Parameters
    ----------
    series : pandas.Series or sequence of numbers
        Observations in chronological order.

    Returns
    -------
    list
        Same length as ``series``.  The first entry is the placeholder ``'-'``
        (no earlier value to predict from); entry ``i`` is ``series[i - 1]``.

    Raises
    ------
    ValueError
        If fewer than two observations are given, since no error can be scored.

    Side effects
    ------------
    Prints the RMSE between the forecasts and the actual values.
    """
    values = np.asarray(series, dtype=float)
    if len(values) < 2:
        raise ValueError('prev_predict needs at least two observations')

    forecasts = values[:-1]  # value at position i - 1 ...
    actuals = values[1:]     # ... is the forecast for position i
    rmse = np.sqrt(np.mean((actuals - forecasts) ** 2))
    print('RMSE is', rmse)

    return ['-'] + forecasts.tolist()

In [30]:
# Score the "previous value" baseline on every telemetry series.
for i, current_series in enumerate(series):
    preds = prev_predict(current_series)
    # Optional: keep the forecasts next to the raw values -> dfs[i]['Prev'] = preds
    print()

RMSE is 0.815585629315753

RMSE is 4.089972096965123

RMSE is 408.43440757916613

RMSE is 4.098023145829881

RMSE is 12.186031415980375

RMSE is 56.88848604724451



### Moving Average Forecast

In [33]:
def moving_avg(ser, window):
    """Forecast each value as the mean of the ``window`` values before it.

    Uses a rolling mean instead of calling ``statistics.mean`` on a fresh slice
    for every position, and works on positions regardless of the Series index.

    Parameters
    ----------
    ser : pandas.Series or sequence of numbers
        Observations in chronological order.
    window : int
        Number of preceding observations averaged for each forecast.

    Returns
    -------
    list
        Same length as ``ser``.  The first ``window`` entries are the
        placeholder ``'-'`` (not enough history); entry ``i`` is the mean of
        ``ser[i - window:i]``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1, or if ``ser`` does not contain more
        than ``window`` observations (no forecast could be scored).

    Side effects
    ------------
    Prints the RMSE between the forecasts and the actual values.
    """
    values = np.asarray(ser, dtype=float)
    if window < 1:
        raise ValueError('window must be a positive integer')
    if len(values) <= window:
        raise ValueError('moving_avg needs more observations than the window size')

    # rolling(window).mean() at position j averages values[j - window + 1 : j + 1],
    # so the forecast for position i is the rolling mean ending at i - 1.
    rolling_means = pd.Series(values).rolling(window).mean().to_numpy()
    forecasts = rolling_means[window - 1:-1]
    actuals = values[window:]

    rmse = np.sqrt(np.mean((actuals - forecasts) ** 2))
    print('RMSE is', rmse)

    # Pad the positions that have no full window of history.
    return ['-'] * window + forecasts.tolist()

In [34]:
# Window sizes (number of preceding samples) to evaluate for every series.
wins = (10, 100, 1000)
for i, ser in enumerate(series):
    print()
    for window in wins:
        print(f'Window size {window}')
        preds = moving_avg(ser, window)
        # Store the forecasts as an extra column next to the raw values.
        dfs[i][f'{window}th Moving Avg'] = preds
        print()
    print('- - - - - ')


Window size 10
RMSE is 0.6076582307122297

Window size 100
RMSE is 0.5825496121708135

Window size 1000
RMSE is 0.5790529646233851

- - - - - 

Window size 10
RMSE is 3.033688419577278

Window size 100
RMSE is 2.9073417423431116

Window size 1000
RMSE is 2.9028782757578115

- - - - - 

Window size 10
RMSE is 306.19090908425164

Window size 100
RMSE is 292.57754290089855

Window size 1000
RMSE is 290.5012468140144

- - - - - 

Window size 10
RMSE is 3.0382026605775114

Window size 100
RMSE is 2.9082973120663556

Window size 1000
RMSE is 2.898378235411537

- - - - - 

Window size 10
RMSE is 9.000172926707206

Window size 100
RMSE is 8.66440622337787

Window size 1000
RMSE is 8.608204753291936

- - - - - 

Window size 10
RMSE is 42.24691242188052

Window size 100
RMSE is 40.63617621033443

Window size 1000
RMSE is 40.44345714321188

- - - - - 


In [12]:
# Show the first 2000 rows (positional slice) of the first frame.
dfs[0].iloc[:2000]

Unnamed: 0,Values,10th Moving Avg,100th Moving Avg,1000th Moving Avg
2020-02-05 07:48:33.520994,12.5,-,-,-
2020-02-05 07:48:33.622164,11.2,-,-,-
2020-02-05 07:48:33.722767,11.3,-,-,-
2020-02-05 07:48:33.823252,11.7,-,-,-
2020-02-05 07:48:33.923965,11.1,-,-,-
2020-02-05 07:48:34.024591,11.1,-,-,-
2020-02-05 07:48:34.125257,11.5,-,-,-
2020-02-05 07:48:34.226037,12.0,-,-,-
2020-02-05 07:48:34.326727,11.5,-,-,-
2020-02-05 07:48:34.427272,11.4,-,-,-


In [16]:
def save_engineered_csvs():
    """Write every frame in ``dfs`` to ``<basedir>/engineered/<original file name>``.

    Frames are matched to file names by position in the module-level
    ``data_files`` and ``dfs`` lists.  The output directory is created first
    if it does not exist, so the write does not fail on a fresh checkout.

    Side effects
    ------------
    Creates the ``engineered`` directory if needed, writes one CSV per entry in
    ``data_files`` and prints a confirmation line for each.
    """
    out_dir = os.path.join(basedir, 'engineered')
    os.makedirs(out_dir, exist_ok=True)

    for i, file_name in enumerate(data_files):
        dfs[i].to_csv(os.path.join(out_dir, file_name))
        print(f'Wrote {len(dfs[i])} lines to generated/engineered/{file_name}')

In [17]:
# Persist every engineered frame under the `engineered/` subfolder of basedir.
save_engineered_csvs()

Wrote 6893 lines to generated/engineered/Battery1_telemetryOutputData_2020_02_05_15_48_33.csv
Wrote 6894 lines to generated/engineered/Battery2_telemetryOutputData_2020_02_05_15_48_33.csv
Wrote 6893 lines to generated/engineered/CmdRcv_telemetryOutputData_2020_02_05_15_48_33.csv
Wrote 6894 lines to generated/engineered/RSSI1_telemetryOutputData_2020_02_05_15_48_33.csv
Wrote 6894 lines to generated/engineered/RSSI2_telemetryOutputData_2020_02_05_15_48_33.csv
Wrote 6893 lines to generated/engineered/TlmTx_telemetryOutputData_2020_02_05_15_48_33.csv
