In [54]:
import pandas as pd
import os
import numpy as np
from scipy.stats import gaussian_kde
from encoder.encoder import Encoder

## Loading Data

In [55]:
# Project-local encoder; later cells call get_huffman_code / get_shannon_code on it.
encoder = Encoder()
# Folder that is scanned for the CSV input files.
directory = './data/sensor_data'
# Collects one DataFrame per CSV file found in `directory`.
dataframes = []

In [57]:
# Start from an empty list every time this cell runs: appending to a list that
# was created in another cell would load every file again on each re-run.
dataframes = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated data reproducible across machines and runs.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# Stack the per-file frames under a fresh row index, parse the 'Timestamp'
# column into datetimes, and promote it to the index.
data = (
    pd.concat(dataframes, ignore_index=True)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
)

# Fit the density estimate on the first data column and take the evaluation
# grid bounds from that same column. Calling float() on data.min() (a Series)
# is deprecated in pandas for the one-element case and raises TypeError as soon
# as the frame has more than one column.
first_column = data[data.columns[0]]
kde = gaussian_kde(first_column, bw_method=.1)

# Unit-spaced grid from the smallest observed value up to (at least) the largest.
x = np.arange(float(first_column.min()), float(first_column.max()) + 1)
pdf = kde(x)

# Tabulate the density and expose it as a {grid value: density} mapping,
# which is what the encoder calls further down receive.
pdf_data = pd.DataFrame({'x': x, 'pdf': pdf})
pdf_dict = pdf_data.set_index('x')['pdf'].to_dict()

In [59]:
def compress(data, code):
    """Translate every element of ``data`` through the lookup ``code``.

    Args:
        data: Object exposing ``.map`` (presumably a pandas Series of
            symbols; confirm against callers).
        code: Lookup handed straight to ``data.map`` (presumably a dict
            from symbol to codeword; confirm against the encoder).

    Returns:
        The result of ``data.map(code)``.
    """
    encoded = data.map(code)
    return encoded

def decompress(data, code):
    """Map codewords in ``data`` back to the symbols they encode.

    Args:
        data: Object exposing ``.astype`` and ``.map`` (presumably a pandas
            Series of codewords; confirm against callers).
        code: Mapping from symbol to codeword, i.e. the same mapping that
            was used for compression. It is inverted here.

    Returns:
        ``data`` with each codeword replaced by its symbol, cast to ``int``.
    """
    # Use the `code` argument: this is a plain function, so there is no `self`
    # and the previous `self.code` lookup raised NameError on every call.
    inverted_code = {v: k for k, v in code.items()}
    # Codewords are compared as strings, hence the cast before the lookup.
    return data.astype(str).map(inverted_code).astype(int)


## Test on new data

In [60]:
# Build both codes from the estimated density mapping (pdf_dict).
huffman_code = encoder.get_huffman_code(pdf_dict)
shannon_code = encoder.get_shannon_code(pdf_dict)

# Encode the same input with each code so the resulting lengths can be compared.
# NOTE(review): `series` is not defined in any cell shown here, while the next
# cell reads `test_series` — confirm which name is intended and make sure the
# cell that defines it is part of the notebook, otherwise Restart & Run All fails.
huffman_compressed_series = compress(series, huffman_code)
shannon_compressed_series = compress(series, shannon_code)

In [61]:
# Fixed-length baseline before compression: bits per value needed to give each
# distinct value in test_series its own equal-length code, ceil(log2(#distinct)).
np.ceil(np.log2(len(test_series.unique())))

8.0

In [62]:
# Mean codeword length over the Huffman-encoded values.
huffman_compressed_series.map(len).mean()

5.432951388888889

In [63]:
# Mean codeword length over the Shannon-encoded values.
shannon_compressed_series.map(len).mean()

5.975590277777778