# Lossy Decimal Compression Algorithm

In [1]:
import numpy as np 
import pandas as pd
import os
import time

def round_decimals(filename, desired_decimals):
    """Round every float column of a CSV file and report the storage saved.

    The file is read with pandas, each float16/float32/float64 column is
    rounded to ``desired_decimals`` places, and the result is written to
    ``decimal_reduced_dataset.csv`` in the current working directory. The
    size before and after, the compression ratio, the compression factor,
    the saving percentage and the elapsed time are printed.

    Args:
        filename: Path of the CSV file to compress.
        desired_decimals: Number of decimal places to keep (must be >= 0).
            With 0 the float columns are rounded to the nearest integer and
            stored as integers; missing values stay missing.

    Returns:
        None. Results are reported through ``print`` and the output file.
        A negative ``desired_decimals`` prints a message and returns without
        reading or writing anything.
    """
    # Validate before doing any I/O so a bad argument costs nothing.
    if desired_decimals < 0:
        print("Negative decimal amount is not possible")
        return None

    output_name = "decimal_reduced_dataset.csv"
    start = time.perf_counter()

    file = pd.read_csv(filename)
    file_size_before = os.stat(filename).st_size

    float_columns = file.select_dtypes(
        include=["float64", "float32", "float16"]
    ).columns

    for colname in float_columns:
        rounded = file[colname].round(desired_decimals)
        if desired_decimals == 0:
            # Round first, then convert: casting to int before rounding would
            # truncate (2.789 -> 2). The nullable "Int64" dtype keeps NaN as
            # a missing value instead of rewriting it to 0.
            rounded = rounded.astype("Int64")
        file[colname] = rounded
        print(f"Column '{colname}' converted to {desired_decimals} decimals")

    # index=False: the row index is not part of the dataset and would both add
    # a column and inflate the size the metrics below are computed from.
    file.to_csv(output_name, index=False)
    file_size_after = os.stat(output_name).st_size

    compression_ratio = file_size_after / file_size_before
    compression_factor = file_size_before / file_size_after
    saving_percentage = ((file_size_before - file_size_after) / file_size_before) * 100

    print()
    print("Your file is now compressed and stored as decimal_reduced_dataset.csv in the same file path as this notebook.")
    print()
    print(f" File storage usage decreased from"
          f" {file_size_before:.2f} Bytes to {file_size_after:.2f} Bytes")
    print()
    print(f" Compression ratio:" f" {compression_ratio:.2f}")
    print(f" Compression factor:"
          f" {compression_factor:.2f}")
    print(f" Saving percentage:"
          f" {saving_percentage:.2f}%")

    elapsed = time.perf_counter() - start
    print(f" Seconds the algorithm took to run:" f"{elapsed:.2f}")

    return None

# Linear Model Algorithm

In [None]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression

def linear_model(filename, max_decimals):
    """Fit a linear model of CSV size against the number of decimals kept.

    For every decimal count from 1 to ``max_decimals`` the float columns
    (float16/float32/float64) of the dataset are rounded to that many places,
    the result is written to ``linear_model.csv`` in the current working
    directory, and its size in bytes is recorded. A ``LinearRegression`` is
    then fitted to (decimals, size); the fitted equation and coefficient of
    determination are printed and a scatter plot with the fitted line is shown.

    Args:
        filename: Path of the CSV file to analyse.
        max_decimals: Largest decimal count to try; must be >= 2 so that the
            regression has at least two data points.

    Returns:
        None. Results are reported through ``print`` and the plot. An invalid
        ``max_decimals`` prints a message and returns without reading the file.
    """
    if max_decimals < 0:
        print("Negative decimal amount is not possible")
        return None
    if max_decimals in (0, 1):
        print("Decimal amount should at least be >= 2 for a linear model")
        return None

    start = time.perf_counter()

    # Read once; every rounding below starts again from this unrounded frame.
    data = pd.read_csv(filename)
    float_columns = data.select_dtypes(
        include=["float64", "float32", "float16"]
    ).columns

    decimals = list(range(1, max_decimals + 1))
    csvstorage = []
    for places in decimals:
        rounded = data.round(dict.fromkeys(float_columns, places))
        # index=False keeps the measured size to the dataset itself.
        rounded.to_csv("linear_model.csv", index=False)
        csvstorage.append(os.stat("linear_model.csv").st_size)

    df = pd.DataFrame({"decimal_amount": decimals, 'csv_storage_size': csvstorage})

    X = df.decimal_amount.values.reshape((-1, 1))
    y = df.csv_storage_size.values

    # Named `model` so the local does not shadow this function's own name.
    model = LinearRegression()
    model.fit(X, y)

    score = model.score(X, y)

    # coef_ is a one-element array here; index it rather than calling float()
    # on the array, which NumPy has deprecated.
    slope = model.coef_[0]
    print(f"Linear model for CSV storage in bytes of dataset '{filename}' is: y = {model.intercept_:.2f} + {slope:.2f}x")
    print(f"Coefficient of determination: '{score}' ")

    plt.scatter(X, y, color='black')
    plt.plot(X, model.predict(X), color='blue', linewidth=1)
    plt.title("Linear regression decimals and CSV file storage")
    plt.ylabel('CSV storage space in bytes')
    plt.xlabel('Decimals')
    plt.show()

    elapsed = time.perf_counter() - start
    print(f" Seconds the algorithm took to run:" f"{elapsed:.2f}")

    return None

# Algorithm Timer

In [None]:
# Benchmark: run the chosen algorithm `mc` times and average the wall-clock
# time of a single run. Each call is timed here in the loop because the
# algorithms do not expose their internal timing to module scope.
lst = []
mc = 100
for _ in range(mc):
    run_start = time.perf_counter()
    round_decimals("yourcsv.csv", 2)  # replace with algorithm, CSV file and decimal number of choice.
    lst.append(time.perf_counter() - run_start)
average = sum(lst) / len(lst)

In [None]:
# Show the `average` value computed by the timer cell; a bare expression is
# echoed as the cell's output when run in a notebook.
average