# create sin(x) function

- sin(x) -> float with 4 digit precision. Data = {(sin(a.bcde)=f.ghij\n)}_{i=1}^n:
- sample uniformly from [-pi/2, pi/2] -> truncate to 4 digits precision -> compute sin(x)
- x -> str -> truncate -> float -> sin(x) -> truncate

- Try single operand arithmetic operations: Say sin(x) for some fixed precision say 4 digits. (Or sqrt(x))


In [1]:
import math
import random
import os

In [2]:
def truncate_to_4_digit(x):
    return math.floor(x * 10000) / 10000

def truncate_to_n_digit(x, n=4):
    return math.floor(x * (10 ** n)) / (10 ** n)

In [3]:
x = random.uniform(-math.pi/2, math.pi/2)
x_trunc = truncate_to_4_digit(x)
print(x,x_trunc)
y = math.sin(x_trunc)
y_trunc = truncate_to_4_digit(y)
print(y,y_trunc)

0.6488797711964804 0.6488
0.604230669672893 0.6042


In [4]:
def create_sin_data(filename, total_num_examples=10000):
    with open(filename, 'w') as f:
        for i in range(total_num_examples):
            x = random.uniform(-math.pi/2, math.pi/2)
            x_trunc = truncate_to_4_digit(x)
            y = math.sin(x_trunc)
            y_trunc = truncate_to_4_digit(y)
            f.write(f'sin({x_trunc})={y_trunc}\n')

In [5]:
total_num_examples = 10000
input_file_path = f'train_sin_{total_num_examples}.txt'
if not os.path.exists(input_file_path):
    create_sin_data(input_file_path, total_num_examples=total_num_examples)
else:
    print(f'File {input_file_path} already exists')

File train_sin_10000.txt already exists


In [6]:
total_num_examples = 20000
input_file_path = f'train_sin_{total_num_examples}.txt'
if not os.path.exists(input_file_path):
    create_sin_data(input_file_path, total_num_examples=total_num_examples)
else:
    print(f'File {input_file_path} already exists')

total_num_examples = 40000
input_file_path = f'train_sin_{total_num_examples}.txt'
if not os.path.exists(input_file_path):
    create_sin_data(input_file_path, total_num_examples=total_num_examples)
else:
    print(f'File {input_file_path} already exists')

File train_sin_40000.txt already exists


# Create Non-overlapping test dataset


In [6]:
total_num_examples = 10000
input_file_path = f'train_sin_{total_num_examples}.txt'
output_file_path = f'train_sin_{total_num_examples}_nonoverlap.txt'

if not os.path.exists(output_file_path):
    lines_to_remove = set()
    with open(input_file_path, 'r') as f:
        for line in f.readlines():
            lines_to_remove.add(line)

    print(len(lines_to_remove))

    with open(output_file_path, 'w') as f:
        for x in range(int(truncate_to_4_digit(-math.pi/2)*10000), int(truncate_to_4_digit(math.pi/2)*10000)+1):
            x = x / 10000
            x_trunc = truncate_to_4_digit(x)
            y = math.sin(x_trunc)
            y_trunc = truncate_to_4_digit(y)
            line_to_add = f'sin({x_trunc})={y_trunc}\n'
            if line_to_add in lines_to_remove:
                lines_to_remove.remove(line_to_add)
            else:
                f.write(line_to_add)

    print(len(lines_to_remove))


In [7]:
# # shuffle and create a smaller version
# input_file_path = f'train_sin_{total_num_examples}_nonoverlap.txt'
# num_test_samples = 10000

# with open(input_file_path, 'r') as f:
#     lines = f.readlines()
#     random.shuffle(lines)
#     with open(f'test_sin_{num_test_samples}.txt', 'w') as f2:
#         for line in lines[:num_test_samples]:
#             f2.write(line)

In [8]:
# # shuffle and create a smaller version
# input_file_path = f'train_sin_{total_num_examples}_nonoverlap.txt'
# num_test_samples = 1000

# with open(input_file_path, 'r') as f:
#     lines = f.readlines()
#     random.shuffle(lines)
#     with open(f'test_sin_{num_test_samples}.txt', 'w') as f2:
#         for line in lines[:num_test_samples]:
#             f2.write(line)

In [9]:
# # shuffle and create a smaller version
# input_file_path = f'train_sin_{total_num_examples}_nonoverlap.txt'
# num_test_samples = 100

# with open(input_file_path, 'r') as f:
#     lines = f.readlines()
#     random.shuffle(lines)
#     with open(f'test_sin_{num_test_samples}.txt', 'w') as f2:
#         for line in lines[:num_test_samples]:
#             f2.write(line)

In [3]:
import random
import os
def get_subset_train_data(num_samples):
    with open('train_sin_10000.txt', 'r') as f:
        lines = f.readlines()
    random.shuffle(lines)
    if os.path.exists(f'train_sin_{num_samples}.txt'):
        print('file exists!')
        return
    
    with open(f'train_sin_{num_samples}.txt', 'w') as f:
        for line in lines[:num_samples]:
            f.write(line)

get_subset_train_data(1000)
get_subset_train_data(3000)
get_subset_train_data(5000)

# Let's make an algorithmic reasonic-like function (Taylor's Approx.) to calculate sine of a number

In [9]:
import math 
import random

def truncate_to_n_digit(x, n=4):
    return math.floor(x * (10 ** n)) / (10 ** n)


In [10]:
def get_input_string(x: float, operator='sin'):
    x_trunc = truncate_to_n_digit(x)
    input_str = f'Input:\n{operator}({x_trunc})\n'
    input_str += f'Target:\n'

    return input_str


def get_output_string(x,y=0, n=4):
    output_str = f'<scratch>\n'

    x_true = truncate_to_n_digit(x, 4)
    this_x = x_true

    output_str += f'x_0={this_x}\n'

    for i in range(1, n+1):
        k = 2*i+1

        x_i =this_x

        this_x = this_x + (-1) ** i * (x ** k) / (math.factorial(k))
        this_x = truncate_to_n_digit(this_x, n)

        plus_minus = '+ 1' if i % 2 == 0 else '- 1'

        output_str += f'x_{i}: x_{i-1} {plus_minus}/{k}! * (x^{k}) , x_{i}={this_x}'

        if not i == n:
            output_str += '\n'

    output_str += ' , END\n</scratch>\n'
    
    output_str += f'{this_x}\n'

    return output_str[:-1]+'\n'

In [11]:
a = math.pi / 2
print(get_input_string(a))
print(get_output_string(a))
print(math.sin(a))

Input:
sin(1.5707)
Target:

<scratch>
x_0=1.5707
x_1: x_0 - 1/3! * (x^3) , x_1=0.9247
x_2: x_1 + 1/5! * (x^5) , x_2=1.0043
x_3: x_2 - 1/7! * (x^7) , x_3=0.9996
x_4: x_3 + 1/9! * (x^9) , x_4=0.9997 , END
</scratch>
0.9997

1.0


In [13]:
max_error = 0
for i in range(1000000):
    x = random.uniform(-math.pi/2, math.pi/2)
    x_trunc = truncate_to_n_digit(x)
    y = math.sin(x_trunc)
    y_trunc = truncate_to_n_digit(y)
    taylor_y = get_output_string(x_trunc, n=4)
    taylor_y = float(taylor_y.split('\n')[-2])
    error = abs(y_trunc - taylor_y)
    
    if error > max_error:
        max_error = error
        print(f'x={x_trunc}, y={y_trunc}, taylor_y={taylor_y}, error={error}')

print(f'max error = {max_error}')

x=1.3192, y=0.9685, taylor_y=0.9683, error=0.00019999999999997797
x=1.4569, y=0.9935, taylor_y=0.9933, error=0.000200000000000089
x=-1.0947, y=-0.8888, taylor_y=-0.8891, error=0.00029999999999996696
x=0.5126, y=0.4904, taylor_y=0.4901, error=0.00030000000000002247
x=1.2779, y=0.9574, taylor_y=0.9571, error=0.000300000000000078
x=1.4606, y=0.9939, taylor_y=0.9935, error=0.00039999999999995595
x=-1.1906, y=-0.9286, taylor_y=-0.929, error=0.00040000000000006697
max error = 0.00040000000000006697


### To-Do:

1. make target for test data = what it's supposed to output (instead of the exact sin/sqrt value)

2. error criterion:
- don't penalize if it is between -> exact & modified
- compare the entire generated string with what it's supposed to output (entire or x1~x4)
- - (add new function called exact_modfied so that target in)

3. new error metric:
- Sum | (y-y_hat)/y|^2
- Sum | (y-y_hat)_j | <- digit-wise
- Sum |# digits incorrect / total number of digits|



In [33]:
import numpy as np

def get_num_digits(a: str):
    if a == '':
        return 0
    else:
        if '.' in a: # if a contains a decimal point
            return len(a) - 1
        else:
            return len(str(int(a)))
        

def get_error_metric(y, y_hat, metric_type='accuracy', eps=0.0):
    if metric_type == 'accuracy':
        if eps == 0:
            return np.abs(y == y_hat)
        else:
            return np.abs(y - y_hat) < eps

    if metric_type == 'mse':
        return (y - y_hat)**2

    elif metric_type == 'scaled_mse':
        return ((y - y_hat) / y)**2

    elif metric_type == 'digit_wise_difference':
        return np.sum(np.abs(y - y_hat) > eps) / get_num_digits(str(y))

    elif metric_type == 'incorrect_digit_count':
        #  count the number of digits that are incorrect
        count = 0
        y, y_hat = str(y), str(y_hat)
        y, y_hat = y.replace('.', ''), y_hat.replace('.', '')
        len1 = len(y)
        len2 = len(y_hat)
        for i in range(max(len1, len2)):
            if i > len1-1:
                y_i = 0
                y_hat_i = int(y_hat[i])
            elif i > len2-1:
                y_i = int(y[i])
                y_hat_i = 0
            else:
                y_i = int(y[i])
                y_hat_i = int(y_hat[i])
            if y_i != y_hat_i:
                count += 1
        return count


In [36]:
y = 0.9685
y_hat = 0.9684
eps = 0.00001
print(get_error_metric(y, y_hat, metric_type='accuracy', eps=eps))
print(get_error_metric(y, y_hat, metric_type='mse', eps=eps))
print(get_error_metric(y, y_hat, metric_type='digit_wise_difference', eps=eps))
print(get_error_metric(y, y_hat, metric_type='incorrect_digit_count', eps=eps))


False
9.999999999997797e-09
0.2
1
