# Univariate Methods

In [95]:
import pandas as pd
import os
import csv
import io
import numpy as np

In [96]:
segment_id = 0

filename = f'temperature_degree_c/{segment_id}_temperature_degree_c.csv'

# Read the segment with its timestamps as the index, then order the rows
# chronologically in the same expression.
# TODO: work out whether the file is already sorted on disk (the original
# note on this was ambiguous) so the explicit sort can be justified or dropped.
df = pd.read_csv(
    filename,
    index_col='timestamp_utc',
    parse_dates=['timestamp_utc'],
    date_format="%m/%d/%Y %I:%M:%S %p",
).sort_index()

In [97]:
def test_methods(segment_id, percentage=10):

    segment_id = 0

    filename = f'temperature_degree_c/{segment_id}_temperature_degree_c.csv'

    df = pd.read_csv(filename, parse_dates=['timestamp_utc'], index_col='timestamp_utc', date_format="%m/%d/%Y %I:%M:%S %p")
    # todo: figure out why the data is always sorted
    df = df.sort_index()


    num_rows = len(df)
    num_missing = int(num_rows * percentage / 100)
    
    # Ensure at least one row is set as missing
    num_missing = max(1, num_missing)
    
    # Randomly select a starting index for the contiguous block
    start_index = np.random.randint(0, num_rows - num_missing + 1)

    df['missing'] = df.iloc[:,0]
    # Set the contiguous block of rows as missing (NaN)
    df.iloc[start_index:start_index + num_missing, df.columns.get_loc('missing')] = np.nan

    results = {}
    
    df['filled'] = df['missing'].ffill()
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Last Observation Carried Forward'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].fillna(df['missing'].mean())
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    results['Mean Value'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='linear')
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    results['Linear Interpolation'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='nearest')
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Nearest Neighbour'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='polynomial', order=2)
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Polynomial Interpolation (K=2)'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='polynomial', order=3)
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Polynomial Interpolation (K=3)'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='spline', order=2)
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Spline Interpolation (K=2)'] = {'MAE': MAE, 'RMSE': RMSE}
    
    df['filled'] = df['missing'].interpolate(method='spline', order=3)
    
    df['error'] = df['temperature_degree_c'] - df['filled']
    MAE = np.mean(abs(df['error']))
    RMSE = np.sqrt(np.mean((df['error'])**2))
    
    results['Spline Interpolation (K=3)'] = {'MAE': MAE, 'RMSE': RMSE}

    return results

## Results

In [99]:
segment_id = 5

# Repeat the 5% gap experiment 20 times; each run contributes one
# (method, MAE, RMSE) row per imputation method.
rows = []
for _ in range(20):
    run_scores = test_methods(segment_id, 5)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in run_scores.items()
    )

results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Average every method over the runs and rank by mean absolute error.
results.groupby('method').mean().sort_values(by='MAE')

Unnamed: 0_level_0,MAE,RMSE
method,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Interpolation,0.025199,0.137636
Spline Interpolation (K=3),0.026429,0.143491
Nearest Neighbour,0.031235,0.169819
Last Observation Carried Forward,0.039064,0.20471
Spline Interpolation (K=2),0.055169,0.270341
Mean Value,0.230722,1.045981
Polynomial Interpolation (K=2),0.563382,2.963296
Polynomial Interpolation (K=3),0.606044,3.1024


In [93]:
# Repeat the 10% gap experiment 20 times; each run contributes one
# (method, MAE, RMSE) row per imputation method.
rows = []
for _ in range(20):
    run_scores = test_methods(segment_id, 10)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in run_scores.items()
    )

results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Average every method over the runs and rank by mean absolute error.
results.groupby('method').mean().sort_values(by='MAE')

Unnamed: 0_level_0,MAE,RMSE
method,Unnamed: 1_level_1,Unnamed: 2_level_1
Spline Interpolation (K=3),0.058924,0.223322
Linear Interpolation,0.060086,0.234232
Nearest Neighbour,0.080904,0.31532
Spline Interpolation (K=2),0.095685,0.345974
Last Observation Carried Forward,0.106581,0.398596
Mean Value,0.323719,1.066052
Polynomial Interpolation (K=2),2.448047,9.021365
Polynomial Interpolation (K=3),2.572125,9.331882


In [94]:
# Repeat the 20% gap experiment 20 times; each run contributes one
# (method, MAE, RMSE) row per imputation method.
rows = []
for _ in range(20):
    run_scores = test_methods(segment_id, 20)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in run_scores.items()
    )

results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Average every method over the runs and rank by mean absolute error.
results.groupby('method').mean().sort_values(by='MAE')

Unnamed: 0_level_0,MAE,RMSE
method,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Interpolation,0.131887,0.37086
Spline Interpolation (K=3),0.134684,0.371527
Nearest Neighbour,0.223175,0.593556
Spline Interpolation (K=2),0.278772,0.700308
Last Observation Carried Forward,0.333003,0.859859
Mean Value,0.969908,2.238259
Polynomial Interpolation (K=2),6.974102,17.98635
Polynomial Interpolation (K=3),7.311506,18.573871
